xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fintrin.h (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifndef __AVX512FINTRIN_H
14 #define __AVX512FINTRIN_H
15 
/* Internal 64-byte vector types, named by element type and element count. */
typedef char      __v64qi __attribute__((__vector_size__(64)));
typedef short     __v32hi __attribute__((__vector_size__(64)));
typedef int       __v16si __attribute__((__vector_size__(64)));
typedef long long __v8di  __attribute__((__vector_size__(64)));
typedef float     __v16sf __attribute__((__vector_size__(64)));
typedef double    __v8df  __attribute__((__vector_size__(64)));

/* Unsigned element variants of the integer vectors. */
typedef unsigned char      __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short     __v32hu __attribute__((__vector_size__(64)));
typedef unsigned int       __v16su __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du  __attribute__((__vector_size__(64)));

/* Plain char has no fixed signedness, so an explicitly signed byte vector is
 * needed internally. It is not meant to appear in the public interface. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

/* Public 512-bit vector types, aligned to 64 bytes. */
typedef float     __m512  __attribute__((__vector_size__(64), __aligned__(64)));
typedef double    __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

/* The same 512-bit vector types with alignment reduced to 1 byte. */
typedef float     __m512_u  __attribute__((__vector_size__(64), __aligned__(1)));
typedef double    __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Mask types: an 8-bit and a 16-bit unsigned integer. */
typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;
43 
/* Rounding mode macros: five consecutive encodings, 0 through 4. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
50 
/* Constants for integer comparison predicates.
 * _MM_CMPINT_GE and _MM_CMPINT_GT are macro aliases for the "not less than"
 * and "not less than or equal" enumerators respectively. */
typedef enum {
    _MM_CMPINT_EQ     = 0,  /* Equal */
    _MM_CMPINT_LT     = 1,  /* Less than */
    _MM_CMPINT_LE     = 2,  /* Less than or Equal */
    _MM_CMPINT_UNUSED = 3,
    _MM_CMPINT_NE     = 4,  /* Not Equal */
    _MM_CMPINT_NLT    = 5,  /* Not Less than */
    _MM_CMPINT_NLE    = 6   /* Not Less than or Equal */
} _MM_CMPINT_ENUM;

#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
63 
/* Permutation selectors. Each of the four letters encodes a 2-bit field
 * (A = 0, B = 1, C = 2, D = 3); the leftmost letter occupies the two most
 * significant bits of the 8-bit value. Each row below fixes the first three
 * letters and lets the last one run A..D, so values count up by one. */
typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB, _MM_PERM_AAAC, _MM_PERM_AAAD,
  _MM_PERM_AABA = 0x04, _MM_PERM_AABB, _MM_PERM_AABC, _MM_PERM_AABD,
  _MM_PERM_AACA = 0x08, _MM_PERM_AACB, _MM_PERM_AACC, _MM_PERM_AACD,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB, _MM_PERM_AADC, _MM_PERM_AADD,
  _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB, _MM_PERM_ABAC, _MM_PERM_ABAD,
  _MM_PERM_ABBA = 0x14, _MM_PERM_ABBB, _MM_PERM_ABBC, _MM_PERM_ABBD,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB, _MM_PERM_ABCC, _MM_PERM_ABCD,
  _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB, _MM_PERM_ABDC, _MM_PERM_ABDD,
  _MM_PERM_ACAA = 0x20, _MM_PERM_ACAB, _MM_PERM_ACAC, _MM_PERM_ACAD,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB, _MM_PERM_ACBC, _MM_PERM_ACBD,
  _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB, _MM_PERM_ACCC, _MM_PERM_ACCD,
  _MM_PERM_ACDA = 0x2C, _MM_PERM_ACDB, _MM_PERM_ACDC, _MM_PERM_ACDD,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB, _MM_PERM_ADAC, _MM_PERM_ADAD,
  _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB, _MM_PERM_ADBC, _MM_PERM_ADBD,
  _MM_PERM_ADCA = 0x38, _MM_PERM_ADCB, _MM_PERM_ADCC, _MM_PERM_ADCD,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB, _MM_PERM_ADDC, _MM_PERM_ADDD,
  _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB, _MM_PERM_BAAC, _MM_PERM_BAAD,
  _MM_PERM_BABA = 0x44, _MM_PERM_BABB, _MM_PERM_BABC, _MM_PERM_BABD,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB, _MM_PERM_BACC, _MM_PERM_BACD,
  _MM_PERM_BADA = 0x4C, _MM_PERM_BADB, _MM_PERM_BADC, _MM_PERM_BADD,
  _MM_PERM_BBAA = 0x50, _MM_PERM_BBAB, _MM_PERM_BBAC, _MM_PERM_BBAD,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB, _MM_PERM_BBBC, _MM_PERM_BBBD,
  _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB, _MM_PERM_BBCC, _MM_PERM_BBCD,
  _MM_PERM_BBDA = 0x5C, _MM_PERM_BBDB, _MM_PERM_BBDC, _MM_PERM_BBDD,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB, _MM_PERM_BCAC, _MM_PERM_BCAD,
  _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB, _MM_PERM_BCBC, _MM_PERM_BCBD,
  _MM_PERM_BCCA = 0x68, _MM_PERM_BCCB, _MM_PERM_BCCC, _MM_PERM_BCCD,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB, _MM_PERM_BCDC, _MM_PERM_BCDD,
  _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB, _MM_PERM_BDAC, _MM_PERM_BDAD,
  _MM_PERM_BDBA = 0x74, _MM_PERM_BDBB, _MM_PERM_BDBC, _MM_PERM_BDBD,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB, _MM_PERM_BDCC, _MM_PERM_BDCD,
  _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB, _MM_PERM_BDDC, _MM_PERM_BDDD,
  _MM_PERM_CAAA = 0x80, _MM_PERM_CAAB, _MM_PERM_CAAC, _MM_PERM_CAAD,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB, _MM_PERM_CABC, _MM_PERM_CABD,
  _MM_PERM_CACA = 0x88, _MM_PERM_CACB, _MM_PERM_CACC, _MM_PERM_CACD,
  _MM_PERM_CADA = 0x8C, _MM_PERM_CADB, _MM_PERM_CADC, _MM_PERM_CADD,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB, _MM_PERM_CBAC, _MM_PERM_CBAD,
  _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB, _MM_PERM_CBBC, _MM_PERM_CBBD,
  _MM_PERM_CBCA = 0x98, _MM_PERM_CBCB, _MM_PERM_CBCC, _MM_PERM_CBCD,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB, _MM_PERM_CBDC, _MM_PERM_CBDD,
  _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB, _MM_PERM_CCAC, _MM_PERM_CCAD,
  _MM_PERM_CCBA = 0xA4, _MM_PERM_CCBB, _MM_PERM_CCBC, _MM_PERM_CCBD,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB, _MM_PERM_CCCC, _MM_PERM_CCCD,
  _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB, _MM_PERM_CCDC, _MM_PERM_CCDD,
  _MM_PERM_CDAA = 0xB0, _MM_PERM_CDAB, _MM_PERM_CDAC, _MM_PERM_CDAD,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB, _MM_PERM_CDBC, _MM_PERM_CDBD,
  _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB, _MM_PERM_CDCC, _MM_PERM_CDCD,
  _MM_PERM_CDDA = 0xBC, _MM_PERM_CDDB, _MM_PERM_CDDC, _MM_PERM_CDDD,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB, _MM_PERM_DAAC, _MM_PERM_DAAD,
  _MM_PERM_DABA = 0xC4, _MM_PERM_DABB, _MM_PERM_DABC, _MM_PERM_DABD,
  _MM_PERM_DACA = 0xC8, _MM_PERM_DACB, _MM_PERM_DACC, _MM_PERM_DACD,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB, _MM_PERM_DADC, _MM_PERM_DADD,
  _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB, _MM_PERM_DBAC, _MM_PERM_DBAD,
  _MM_PERM_DBBA = 0xD4, _MM_PERM_DBBB, _MM_PERM_DBBC, _MM_PERM_DBBD,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB, _MM_PERM_DBCC, _MM_PERM_DBCD,
  _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB, _MM_PERM_DBDC, _MM_PERM_DBDD,
  _MM_PERM_DCAA = 0xE0, _MM_PERM_DCAB, _MM_PERM_DCAC, _MM_PERM_DCAD,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB, _MM_PERM_DCBC, _MM_PERM_DCBD,
  _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB, _MM_PERM_DCCC, _MM_PERM_DCCD,
  _MM_PERM_DCDA = 0xEC, _MM_PERM_DCDB, _MM_PERM_DCDC, _MM_PERM_DCDD,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB, _MM_PERM_DDAC, _MM_PERM_DDAD,
  _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB, _MM_PERM_DDBC, _MM_PERM_DDBD,
  _MM_PERM_DDCA = 0xF8, _MM_PERM_DDCB, _MM_PERM_DDCC, _MM_PERM_DDCD,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB, _MM_PERM_DDDC, _MM_PERM_DDDD
} _MM_PERM_ENUM;
153 
/* Mantissa normalization interval selectors (values 0 through 3). */
typedef enum
{
  _MM_MANT_NORM_1_2     = 0,  /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2    = 1,  /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1    = 2,  /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5 = 3   /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;
161 
/* Mantissa sign-handling selectors (values 0 through 2). */
typedef enum
{
  _MM_MANT_SIGN_src  = 0,  /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero = 1,  /* sign = 0             */
  _MM_MANT_SIGN_nan  = 2   /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
168 
/* Default attributes for the functions in this file.
 *   __DEFAULT_FN_ATTRS512: target "avx512f,evex512", minimum vector width 512.
 *   __DEFAULT_FN_ATTRS128: target "avx512f,no-evex512", minimum vector width 128.
 *   __DEFAULT_FN_ATTRS:    target "avx512f,no-evex512", no vector-width attribute.
 * All three also apply __always_inline__ and __nodebug__. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512")))
177 
178 /* Create vectors with repeated elements */
179 
180 static  __inline __m512i __DEFAULT_FN_ATTRS512
181 _mm512_setzero_si512(void)
182 {
183   return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
184 }
185 
186 #define _mm512_setzero_epi32 _mm512_setzero_si512
187 
188 static __inline__ __m512d __DEFAULT_FN_ATTRS512
189 _mm512_undefined_pd(void)
190 {
191   return (__m512d)__builtin_ia32_undef512();
192 }
193 
194 static __inline__ __m512 __DEFAULT_FN_ATTRS512
195 _mm512_undefined(void)
196 {
197   return (__m512)__builtin_ia32_undef512();
198 }
199 
200 static __inline__ __m512 __DEFAULT_FN_ATTRS512
201 _mm512_undefined_ps(void)
202 {
203   return (__m512)__builtin_ia32_undef512();
204 }
205 
206 static __inline__ __m512i __DEFAULT_FN_ATTRS512
207 _mm512_undefined_epi32(void)
208 {
209   return (__m512i)__builtin_ia32_undef512();
210 }
211 
212 static __inline__ __m512i __DEFAULT_FN_ATTRS512
213 _mm512_broadcastd_epi32 (__m128i __A)
214 {
215   return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
216                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
217 }
218 
219 static __inline__ __m512i __DEFAULT_FN_ATTRS512
220 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
221 {
222   return (__m512i)__builtin_ia32_selectd_512(__M,
223                                              (__v16si) _mm512_broadcastd_epi32(__A),
224                                              (__v16si) __O);
225 }
226 
227 static __inline__ __m512i __DEFAULT_FN_ATTRS512
228 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
229 {
230   return (__m512i)__builtin_ia32_selectd_512(__M,
231                                              (__v16si) _mm512_broadcastd_epi32(__A),
232                                              (__v16si) _mm512_setzero_si512());
233 }
234 
235 static __inline__ __m512i __DEFAULT_FN_ATTRS512
236 _mm512_broadcastq_epi64 (__m128i __A)
237 {
238   return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
239                                           0, 0, 0, 0, 0, 0, 0, 0);
240 }
241 
242 static __inline__ __m512i __DEFAULT_FN_ATTRS512
243 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
244 {
245   return (__m512i)__builtin_ia32_selectq_512(__M,
246                                              (__v8di) _mm512_broadcastq_epi64(__A),
247                                              (__v8di) __O);
248 
249 }
250 
251 static __inline__ __m512i __DEFAULT_FN_ATTRS512
252 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
253 {
254   return (__m512i)__builtin_ia32_selectq_512(__M,
255                                              (__v8di) _mm512_broadcastq_epi64(__A),
256                                              (__v8di) _mm512_setzero_si512());
257 }
258 
259 
260 static __inline __m512 __DEFAULT_FN_ATTRS512
261 _mm512_setzero_ps(void)
262 {
263   return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
264                                  0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
265 }
266 
267 #define _mm512_setzero _mm512_setzero_ps
268 
269 static  __inline __m512d __DEFAULT_FN_ATTRS512
270 _mm512_setzero_pd(void)
271 {
272   return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
273 }
274 
275 static __inline __m512 __DEFAULT_FN_ATTRS512
276 _mm512_set1_ps(float __w)
277 {
278   return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
279                                  __w, __w, __w, __w, __w, __w, __w, __w  };
280 }
281 
282 static __inline __m512d __DEFAULT_FN_ATTRS512
283 _mm512_set1_pd(double __w)
284 {
285   return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
286 }
287 
288 static __inline __m512i __DEFAULT_FN_ATTRS512
289 _mm512_set1_epi8(char __w)
290 {
291   return __extension__ (__m512i)(__v64qi){
292     __w, __w, __w, __w, __w, __w, __w, __w,
293     __w, __w, __w, __w, __w, __w, __w, __w,
294     __w, __w, __w, __w, __w, __w, __w, __w,
295     __w, __w, __w, __w, __w, __w, __w, __w,
296     __w, __w, __w, __w, __w, __w, __w, __w,
297     __w, __w, __w, __w, __w, __w, __w, __w,
298     __w, __w, __w, __w, __w, __w, __w, __w,
299     __w, __w, __w, __w, __w, __w, __w, __w  };
300 }
301 
302 static __inline __m512i __DEFAULT_FN_ATTRS512
303 _mm512_set1_epi16(short __w)
304 {
305   return __extension__ (__m512i)(__v32hi){
306     __w, __w, __w, __w, __w, __w, __w, __w,
307     __w, __w, __w, __w, __w, __w, __w, __w,
308     __w, __w, __w, __w, __w, __w, __w, __w,
309     __w, __w, __w, __w, __w, __w, __w, __w };
310 }
311 
312 static __inline __m512i __DEFAULT_FN_ATTRS512
313 _mm512_set1_epi32(int __s)
314 {
315   return __extension__ (__m512i)(__v16si){
316     __s, __s, __s, __s, __s, __s, __s, __s,
317     __s, __s, __s, __s, __s, __s, __s, __s };
318 }
319 
320 static __inline __m512i __DEFAULT_FN_ATTRS512
321 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
322 {
323   return (__m512i)__builtin_ia32_selectd_512(__M,
324                                              (__v16si)_mm512_set1_epi32(__A),
325                                              (__v16si)_mm512_setzero_si512());
326 }
327 
328 static __inline __m512i __DEFAULT_FN_ATTRS512
329 _mm512_set1_epi64(long long __d)
330 {
331   return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
332 }
333 
334 static __inline __m512i __DEFAULT_FN_ATTRS512
335 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
336 {
337   return (__m512i)__builtin_ia32_selectq_512(__M,
338                                              (__v8di)_mm512_set1_epi64(__A),
339                                              (__v8di)_mm512_setzero_si512());
340 }
341 
342 static __inline__ __m512 __DEFAULT_FN_ATTRS512
343 _mm512_broadcastss_ps(__m128 __A)
344 {
345   return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
346                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
347 }
348 
349 static __inline __m512i __DEFAULT_FN_ATTRS512
350 _mm512_set4_epi32 (int __A, int __B, int __C, int __D)
351 {
352   return __extension__ (__m512i)(__v16si)
353    { __D, __C, __B, __A, __D, __C, __B, __A,
354      __D, __C, __B, __A, __D, __C, __B, __A };
355 }
356 
357 static __inline __m512i __DEFAULT_FN_ATTRS512
358 _mm512_set4_epi64 (long long __A, long long __B, long long __C,
359        long long __D)
360 {
361   return __extension__ (__m512i) (__v8di)
362    { __D, __C, __B, __A, __D, __C, __B, __A };
363 }
364 
365 static __inline __m512d __DEFAULT_FN_ATTRS512
366 _mm512_set4_pd (double __A, double __B, double __C, double __D)
367 {
368   return __extension__ (__m512d)
369    { __D, __C, __B, __A, __D, __C, __B, __A };
370 }
371 
372 static __inline __m512 __DEFAULT_FN_ATTRS512
373 _mm512_set4_ps (float __A, float __B, float __C, float __D)
374 {
375   return __extension__ (__m512)
376    { __D, __C, __B, __A, __D, __C, __B, __A,
377      __D, __C, __B, __A, __D, __C, __B, __A };
378 }
379 
/* Reversed-argument forms of the _mm512_set4_* functions: each forwards its
 * four arguments to the matching _mm512_set4_* in the opposite order. */
#define _mm512_setr4_epi32(a0, a1, a2, a3) _mm512_set4_epi32((a3), (a2), (a1), (a0))

#define _mm512_setr4_epi64(a0, a1, a2, a3) _mm512_set4_epi64((a3), (a2), (a1), (a0))

#define _mm512_setr4_pd(a0, a1, a2, a3) _mm512_set4_pd((a3), (a2), (a1), (a0))

#define _mm512_setr4_ps(a0, a1, a2, a3) _mm512_set4_ps((a3), (a2), (a1), (a0))
391 
392 static __inline__ __m512d __DEFAULT_FN_ATTRS512
393 _mm512_broadcastsd_pd(__m128d __A)
394 {
395   return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
396                                           0, 0, 0, 0, 0, 0, 0, 0);
397 }
398 
399 /* Cast between vector types */
400 
401 static __inline __m512d __DEFAULT_FN_ATTRS512
402 _mm512_castpd256_pd512(__m256d __a)
403 {
404   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
405                                  1, 2, 3, 4, 5, 6, 7);
406 }
407 
408 static __inline __m512 __DEFAULT_FN_ATTRS512
409 _mm512_castps256_ps512(__m256 __a)
410 {
411   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
412                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
413 }
414 
415 static __inline __m128d __DEFAULT_FN_ATTRS512
416 _mm512_castpd512_pd128(__m512d __a)
417 {
418   return __builtin_shufflevector(__a, __a, 0, 1);
419 }
420 
421 static __inline __m256d __DEFAULT_FN_ATTRS512
422 _mm512_castpd512_pd256 (__m512d __A)
423 {
424   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
425 }
426 
427 static __inline __m128 __DEFAULT_FN_ATTRS512
428 _mm512_castps512_ps128(__m512 __a)
429 {
430   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
431 }
432 
433 static __inline __m256 __DEFAULT_FN_ATTRS512
434 _mm512_castps512_ps256 (__m512 __A)
435 {
436   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
437 }
438 
439 static __inline __m512 __DEFAULT_FN_ATTRS512
440 _mm512_castpd_ps (__m512d __A)
441 {
442   return (__m512) (__A);
443 }
444 
445 static __inline __m512i __DEFAULT_FN_ATTRS512
446 _mm512_castpd_si512 (__m512d __A)
447 {
448   return (__m512i) (__A);
449 }
450 
451 static __inline__ __m512d __DEFAULT_FN_ATTRS512
452 _mm512_castpd128_pd512 (__m128d __A)
453 {
454   __m256d __B = __builtin_nondeterministic_value(__B);
455   return __builtin_shufflevector(
456       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
457       __B, 0, 1, 2, 3, 4, 5, 6, 7);
458 }
459 
460 static __inline __m512d __DEFAULT_FN_ATTRS512
461 _mm512_castps_pd (__m512 __A)
462 {
463   return (__m512d) (__A);
464 }
465 
466 static __inline __m512i __DEFAULT_FN_ATTRS512
467 _mm512_castps_si512 (__m512 __A)
468 {
469   return (__m512i) (__A);
470 }
471 
472 static __inline__ __m512 __DEFAULT_FN_ATTRS512
473 _mm512_castps128_ps512 (__m128 __A)
474 {
475   __m256 __B = __builtin_nondeterministic_value(__B);
476   return __builtin_shufflevector(
477       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
478       __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
479 }
480 
481 static __inline__ __m512i __DEFAULT_FN_ATTRS512
482 _mm512_castsi128_si512 (__m128i __A)
483 {
484   __m256i __B = __builtin_nondeterministic_value(__B);
485   return __builtin_shufflevector(
486       __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
487       __B, 0, 1, 2, 3, 4, 5, 6, 7);
488 }
489 
490 static __inline__ __m512i __DEFAULT_FN_ATTRS512
491 _mm512_castsi256_si512 (__m256i __A)
492 {
493    return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
494 }
495 
496 static __inline __m512 __DEFAULT_FN_ATTRS512
497 _mm512_castsi512_ps (__m512i __A)
498 {
499   return (__m512) (__A);
500 }
501 
502 static __inline __m512d __DEFAULT_FN_ATTRS512
503 _mm512_castsi512_pd (__m512i __A)
504 {
505   return (__m512d) (__A);
506 }
507 
508 static __inline __m128i __DEFAULT_FN_ATTRS512
509 _mm512_castsi512_si128 (__m512i __A)
510 {
511   return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
512 }
513 
514 static __inline __m256i __DEFAULT_FN_ATTRS512
515 _mm512_castsi512_si256 (__m512i __A)
516 {
517   return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
518 }
519 
520 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
521 _mm512_int2mask(int __a)
522 {
523   return (__mmask16)__a;
524 }
525 
526 static __inline__ int __DEFAULT_FN_ATTRS
527 _mm512_mask2int(__mmask16 __a)
528 {
529   return (int)__a;
530 }
531 
532 /// Constructs a 512-bit floating-point vector of [8 x double] from a
533 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
534 ///    contain the value of the source vector. The upper 384 bits are set
535 ///    to zero.
536 ///
537 /// \headerfile <x86intrin.h>
538 ///
539 /// This intrinsic has no corresponding instruction.
540 ///
541 /// \param __a
542 ///    A 128-bit vector of [2 x double].
543 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
544 ///    contain the value of the parameter. The upper 384 bits are set to zero.
545 static __inline __m512d __DEFAULT_FN_ATTRS512
546 _mm512_zextpd128_pd512(__m128d __a)
547 {
548   return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
549 }
550 
551 /// Constructs a 512-bit floating-point vector of [8 x double] from a
552 ///    256-bit floating-point vector of [4 x double]. The lower 256 bits
553 ///    contain the value of the source vector. The upper 256 bits are set
554 ///    to zero.
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic has no corresponding instruction.
559 ///
560 /// \param __a
561 ///    A 256-bit vector of [4 x double].
562 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
563 ///    contain the value of the parameter. The upper 256 bits are set to zero.
564 static __inline __m512d __DEFAULT_FN_ATTRS512
565 _mm512_zextpd256_pd512(__m256d __a)
566 {
567   return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
568 }
569 
570 /// Constructs a 512-bit floating-point vector of [16 x float] from a
571 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
572 ///    the value of the source vector. The upper 384 bits are set to zero.
573 ///
574 /// \headerfile <x86intrin.h>
575 ///
576 /// This intrinsic has no corresponding instruction.
577 ///
578 /// \param __a
579 ///    A 128-bit vector of [4 x float].
580 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
581 ///    contain the value of the parameter. The upper 384 bits are set to zero.
582 static __inline __m512 __DEFAULT_FN_ATTRS512
583 _mm512_zextps128_ps512(__m128 __a)
584 {
585   return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
586 }
587 
588 /// Constructs a 512-bit floating-point vector of [16 x float] from a
589 ///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
590 ///    the value of the source vector. The upper 256 bits are set to zero.
591 ///
592 /// \headerfile <x86intrin.h>
593 ///
594 /// This intrinsic has no corresponding instruction.
595 ///
596 /// \param __a
597 ///    A 256-bit vector of [8 x float].
598 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
599 ///    contain the value of the parameter. The upper 256 bits are set to zero.
600 static __inline __m512 __DEFAULT_FN_ATTRS512
601 _mm512_zextps256_ps512(__m256 __a)
602 {
603   return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
604 }
605 
606 /// Constructs a 512-bit integer vector from a 128-bit integer vector.
607 ///    The lower 128 bits contain the value of the source vector. The upper
608 ///    384 bits are set to zero.
609 ///
610 /// \headerfile <x86intrin.h>
611 ///
612 /// This intrinsic has no corresponding instruction.
613 ///
614 /// \param __a
615 ///    A 128-bit integer vector.
616 /// \returns A 512-bit integer vector. The lower 128 bits contain the value of
617 ///    the parameter. The upper 384 bits are set to zero.
618 static __inline __m512i __DEFAULT_FN_ATTRS512
619 _mm512_zextsi128_si512(__m128i __a)
620 {
621   return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
622 }
623 
624 /// Constructs a 512-bit integer vector from a 256-bit integer vector.
625 ///    The lower 256 bits contain the value of the source vector. The upper
626 ///    256 bits are set to zero.
627 ///
628 /// \headerfile <x86intrin.h>
629 ///
630 /// This intrinsic has no corresponding instruction.
631 ///
632 /// \param __a
633 ///    A 256-bit integer vector.
634 /// \returns A 512-bit integer vector. The lower 256 bits contain the value of
635 ///    the parameter. The upper 256 bits are set to zero.
636 static __inline __m512i __DEFAULT_FN_ATTRS512
637 _mm512_zextsi256_si512(__m256i __a)
638 {
639   return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
640 }
641 
642 /* Bitwise operators */
643 static __inline__ __m512i __DEFAULT_FN_ATTRS512
644 _mm512_and_epi32(__m512i __a, __m512i __b)
645 {
646   return (__m512i)((__v16su)__a & (__v16su)__b);
647 }
648 
649 static __inline__ __m512i __DEFAULT_FN_ATTRS512
650 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
651 {
652   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
653                 (__v16si) _mm512_and_epi32(__a, __b),
654                 (__v16si) __src);
655 }
656 
657 static __inline__ __m512i __DEFAULT_FN_ATTRS512
658 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
659 {
660   return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
661                                          __k, __a, __b);
662 }
663 
664 static __inline__ __m512i __DEFAULT_FN_ATTRS512
665 _mm512_and_epi64(__m512i __a, __m512i __b)
666 {
667   return (__m512i)((__v8du)__a & (__v8du)__b);
668 }
669 
670 static __inline__ __m512i __DEFAULT_FN_ATTRS512
671 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
672 {
673     return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
674                 (__v8di) _mm512_and_epi64(__a, __b),
675                 (__v8di) __src);
676 }
677 
678 static __inline__ __m512i __DEFAULT_FN_ATTRS512
679 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
680 {
681   return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
682                                          __k, __a, __b);
683 }
684 
685 static __inline__ __m512i __DEFAULT_FN_ATTRS512
686 _mm512_andnot_si512 (__m512i __A, __m512i __B)
687 {
688   return (__m512i)(~(__v8du)__A & (__v8du)__B);
689 }
690 
691 static __inline__ __m512i __DEFAULT_FN_ATTRS512
692 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
693 {
694   return (__m512i)(~(__v16su)__A & (__v16su)__B);
695 }
696 
697 static __inline__ __m512i __DEFAULT_FN_ATTRS512
698 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
699 {
700   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
701                                          (__v16si)_mm512_andnot_epi32(__A, __B),
702                                          (__v16si)__W);
703 }
704 
705 static __inline__ __m512i __DEFAULT_FN_ATTRS512
706 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
707 {
708   return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
709                                            __U, __A, __B);
710 }
711 
712 static __inline__ __m512i __DEFAULT_FN_ATTRS512
713 _mm512_andnot_epi64(__m512i __A, __m512i __B)
714 {
715   return (__m512i)(~(__v8du)__A & (__v8du)__B);
716 }
717 
718 static __inline__ __m512i __DEFAULT_FN_ATTRS512
719 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
720 {
721   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
722                                           (__v8di)_mm512_andnot_epi64(__A, __B),
723                                           (__v8di)__W);
724 }
725 
726 static __inline__ __m512i __DEFAULT_FN_ATTRS512
727 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
728 {
729   return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
730                                            __U, __A, __B);
731 }
732 
733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
734 _mm512_or_epi32(__m512i __a, __m512i __b)
735 {
736   return (__m512i)((__v16su)__a | (__v16su)__b);
737 }
738 
739 static __inline__ __m512i __DEFAULT_FN_ATTRS512
740 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
741 {
742   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
743                                              (__v16si)_mm512_or_epi32(__a, __b),
744                                              (__v16si)__src);
745 }
746 
747 static __inline__ __m512i __DEFAULT_FN_ATTRS512
748 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
749 {
750   return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
751 }
752 
753 static __inline__ __m512i __DEFAULT_FN_ATTRS512
754 _mm512_or_epi64(__m512i __a, __m512i __b)
755 {
756   return (__m512i)((__v8du)__a | (__v8du)__b);
757 }
758 
759 static __inline__ __m512i __DEFAULT_FN_ATTRS512
760 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
761 {
762   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
763                                              (__v8di)_mm512_or_epi64(__a, __b),
764                                              (__v8di)__src);
765 }
766 
767 static __inline__ __m512i __DEFAULT_FN_ATTRS512
768 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
769 {
770   return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
771 }
772 
773 static __inline__ __m512i __DEFAULT_FN_ATTRS512
774 _mm512_xor_epi32(__m512i __a, __m512i __b)
775 {
776   return (__m512i)((__v16su)__a ^ (__v16su)__b);
777 }
778 
779 static __inline__ __m512i __DEFAULT_FN_ATTRS512
780 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
781 {
782   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
783                                             (__v16si)_mm512_xor_epi32(__a, __b),
784                                             (__v16si)__src);
785 }
786 
787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
788 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
789 {
790   return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
791 }
792 
793 static __inline__ __m512i __DEFAULT_FN_ATTRS512
794 _mm512_xor_epi64(__m512i __a, __m512i __b)
795 {
796   return (__m512i)((__v8du)__a ^ (__v8du)__b);
797 }
798 
799 static __inline__ __m512i __DEFAULT_FN_ATTRS512
800 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
801 {
802   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
803                                              (__v8di)_mm512_xor_epi64(__a, __b),
804                                              (__v8di)__src);
805 }
806 
807 static __inline__ __m512i __DEFAULT_FN_ATTRS512
808 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
809 {
810   return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
811 }
812 
813 static __inline__ __m512i __DEFAULT_FN_ATTRS512
814 _mm512_and_si512(__m512i __a, __m512i __b)
815 {
816   return (__m512i)((__v8du)__a & (__v8du)__b);
817 }
818 
819 static __inline__ __m512i __DEFAULT_FN_ATTRS512
820 _mm512_or_si512(__m512i __a, __m512i __b)
821 {
822   return (__m512i)((__v8du)__a | (__v8du)__b);
823 }
824 
825 static __inline__ __m512i __DEFAULT_FN_ATTRS512
826 _mm512_xor_si512(__m512i __a, __m512i __b)
827 {
828   return (__m512i)((__v8du)__a ^ (__v8du)__b);
829 }
830 
831 /* Arithmetic */
832 
833 static __inline __m512d __DEFAULT_FN_ATTRS512
834 _mm512_add_pd(__m512d __a, __m512d __b)
835 {
836   return (__m512d)((__v8df)__a + (__v8df)__b);
837 }
838 
839 static __inline __m512 __DEFAULT_FN_ATTRS512
840 _mm512_add_ps(__m512 __a, __m512 __b)
841 {
842   return (__m512)((__v16sf)__a + (__v16sf)__b);
843 }
844 
845 static __inline __m512d __DEFAULT_FN_ATTRS512
846 _mm512_mul_pd(__m512d __a, __m512d __b)
847 {
848   return (__m512d)((__v8df)__a * (__v8df)__b);
849 }
850 
851 static __inline __m512 __DEFAULT_FN_ATTRS512
852 _mm512_mul_ps(__m512 __a, __m512 __b)
853 {
854   return (__m512)((__v16sf)__a * (__v16sf)__b);
855 }
856 
857 static __inline __m512d __DEFAULT_FN_ATTRS512
858 _mm512_sub_pd(__m512d __a, __m512d __b)
859 {
860   return (__m512d)((__v8df)__a - (__v8df)__b);
861 }
862 
863 static __inline __m512 __DEFAULT_FN_ATTRS512
864 _mm512_sub_ps(__m512 __a, __m512 __b)
865 {
866   return (__m512)((__v16sf)__a - (__v16sf)__b);
867 }
868 
869 static __inline__ __m512i __DEFAULT_FN_ATTRS512
870 _mm512_add_epi64 (__m512i __A, __m512i __B)
871 {
872   return (__m512i) ((__v8du) __A + (__v8du) __B);
873 }
874 
875 static __inline__ __m512i __DEFAULT_FN_ATTRS512
876 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
877 {
878   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
879                                              (__v8di)_mm512_add_epi64(__A, __B),
880                                              (__v8di)__W);
881 }
882 
883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
884 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
885 {
886   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
887                                              (__v8di)_mm512_add_epi64(__A, __B),
888                                              (__v8di)_mm512_setzero_si512());
889 }
890 
891 static __inline__ __m512i __DEFAULT_FN_ATTRS512
892 _mm512_sub_epi64 (__m512i __A, __m512i __B)
893 {
894   return (__m512i) ((__v8du) __A - (__v8du) __B);
895 }
896 
897 static __inline__ __m512i __DEFAULT_FN_ATTRS512
898 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
899 {
900   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
901                                              (__v8di)_mm512_sub_epi64(__A, __B),
902                                              (__v8di)__W);
903 }
904 
905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
906 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
907 {
908   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
909                                              (__v8di)_mm512_sub_epi64(__A, __B),
910                                              (__v8di)_mm512_setzero_si512());
911 }
912 
913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
914 _mm512_add_epi32 (__m512i __A, __m512i __B)
915 {
916   return (__m512i) ((__v16su) __A + (__v16su) __B);
917 }
918 
919 static __inline__ __m512i __DEFAULT_FN_ATTRS512
920 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
921 {
922   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
923                                              (__v16si)_mm512_add_epi32(__A, __B),
924                                              (__v16si)__W);
925 }
926 
927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
928 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
929 {
930   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
931                                              (__v16si)_mm512_add_epi32(__A, __B),
932                                              (__v16si)_mm512_setzero_si512());
933 }
934 
935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
936 _mm512_sub_epi32 (__m512i __A, __m512i __B)
937 {
938   return (__m512i) ((__v16su) __A - (__v16su) __B);
939 }
940 
941 static __inline__ __m512i __DEFAULT_FN_ATTRS512
942 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
943 {
944   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
945                                              (__v16si)_mm512_sub_epi32(__A, __B),
946                                              (__v16si)__W);
947 }
948 
949 static __inline__ __m512i __DEFAULT_FN_ATTRS512
950 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
951 {
952   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
953                                              (__v16si)_mm512_sub_epi32(__A, __B),
954                                              (__v16si)_mm512_setzero_si512());
955 }
956 
/* Forwards A and B (as __v8df) to __builtin_ia32_maxpd512, with R cast to int
 * as the final operand. */
#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

/* _mm512_max_round_pd result and W passed through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_max_round_pd((A), (B), (R)), \
      (__v8df)(W)))

/* As _mm512_mask_max_round_pd, with _mm512_setzero_pd() in place of W. */
#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_max_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
970 
971 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
972 _mm512_max_pd(__m512d __A, __m512d __B)
973 {
974   return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
975                                            _MM_FROUND_CUR_DIRECTION);
976 }
977 
978 static __inline__ __m512d __DEFAULT_FN_ATTRS512
979 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
980 {
981   return (__m512d)__builtin_ia32_selectpd_512(__U,
982                                               (__v8df)_mm512_max_pd(__A, __B),
983                                               (__v8df)__W);
984 }
985 
986 static __inline__ __m512d __DEFAULT_FN_ATTRS512
987 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
988 {
989   return (__m512d)__builtin_ia32_selectpd_512(__U,
990                                               (__v8df)_mm512_max_pd(__A, __B),
991                                               (__v8df)_mm512_setzero_pd());
992 }
993 
/* Forwards A and B (as __v16sf) to __builtin_ia32_maxps512, with R cast to
 * int as the final operand. */
#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

/* _mm512_max_round_ps result and W passed through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
      (__v16sf)(W)))

/* As _mm512_mask_max_round_ps, with _mm512_setzero_ps() in place of W. */
#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
1007 
1008 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1009 _mm512_max_ps(__m512 __A, __m512 __B)
1010 {
1011   return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
1012                                           _MM_FROUND_CUR_DIRECTION);
1013 }
1014 
1015 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1016 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1017 {
1018   return (__m512)__builtin_ia32_selectps_512(__U,
1019                                              (__v16sf)_mm512_max_ps(__A, __B),
1020                                              (__v16sf)__W);
1021 }
1022 
1023 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1024 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
1025 {
1026   return (__m512)__builtin_ia32_selectps_512(__U,
1027                                              (__v16sf)_mm512_max_ps(__A, __B),
1028                                              (__v16sf)_mm512_setzero_ps());
1029 }
1030 
1031 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1032 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1033   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1034                 (__v4sf) __B,
1035                 (__v4sf) __W,
1036                 (__mmask8) __U,
1037                 _MM_FROUND_CUR_DIRECTION);
1038 }
1039 
1040 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1041 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1042   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1043                 (__v4sf) __B,
1044                 (__v4sf)  _mm_setzero_ps (),
1045                 (__mmask8) __U,
1046                 _MM_FROUND_CUR_DIRECTION);
1047 }
1048 
/* Calls __builtin_ia32_maxss_round_mask with _mm_setzero_ps() as the third
 * operand, (__mmask8)-1 as the mask, and R cast to int. */
#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

/* Calls __builtin_ia32_maxss_round_mask with W as the third operand and U as
 * the mask. */
#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))

/* As _mm_mask_max_round_ss, with _mm_setzero_ps() in place of W. */
#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
1066 
1067 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1068 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1069   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1070                 (__v2df) __B,
1071                 (__v2df) __W,
1072                 (__mmask8) __U,
1073                 _MM_FROUND_CUR_DIRECTION);
1074 }
1075 
1076 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1077 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1078   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1079                 (__v2df) __B,
1080                 (__v2df)  _mm_setzero_pd (),
1081                 (__mmask8) __U,
1082                 _MM_FROUND_CUR_DIRECTION);
1083 }
1084 
/* Calls __builtin_ia32_maxsd_round_mask with _mm_setzero_pd() as the third
 * operand, (__mmask8)-1 as the mask, and R cast to int. */
#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

/* Calls __builtin_ia32_maxsd_round_mask with W as the third operand and U as
 * the mask. */
#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))

/* As _mm_mask_max_round_sd, with _mm_setzero_pd() in place of W. */
#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
1102 
1103 static __inline __m512i
1104 __DEFAULT_FN_ATTRS512
1105 _mm512_max_epi32(__m512i __A, __m512i __B)
1106 {
1107   return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
1108 }
1109 
1110 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1111 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1112 {
1113   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1114                                             (__v16si)_mm512_max_epi32(__A, __B),
1115                                             (__v16si)__W);
1116 }
1117 
1118 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1119 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
1120 {
1121   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1122                                             (__v16si)_mm512_max_epi32(__A, __B),
1123                                             (__v16si)_mm512_setzero_si512());
1124 }
1125 
1126 static __inline __m512i __DEFAULT_FN_ATTRS512
1127 _mm512_max_epu32(__m512i __A, __m512i __B)
1128 {
1129   return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
1130 }
1131 
1132 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1133 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1134 {
1135   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1136                                             (__v16si)_mm512_max_epu32(__A, __B),
1137                                             (__v16si)__W);
1138 }
1139 
1140 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1141 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
1142 {
1143   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1144                                             (__v16si)_mm512_max_epu32(__A, __B),
1145                                             (__v16si)_mm512_setzero_si512());
1146 }
1147 
1148 static __inline __m512i __DEFAULT_FN_ATTRS512
1149 _mm512_max_epi64(__m512i __A, __m512i __B)
1150 {
1151   return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
1152 }
1153 
1154 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1155 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1156 {
1157   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1158                                              (__v8di)_mm512_max_epi64(__A, __B),
1159                                              (__v8di)__W);
1160 }
1161 
1162 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1163 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
1164 {
1165   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1166                                              (__v8di)_mm512_max_epi64(__A, __B),
1167                                              (__v8di)_mm512_setzero_si512());
1168 }
1169 
1170 static __inline __m512i __DEFAULT_FN_ATTRS512
1171 _mm512_max_epu64(__m512i __A, __m512i __B)
1172 {
1173   return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
1174 }
1175 
1176 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1177 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1178 {
1179   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1180                                              (__v8di)_mm512_max_epu64(__A, __B),
1181                                              (__v8di)__W);
1182 }
1183 
1184 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1185 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
1186 {
1187   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1188                                              (__v8di)_mm512_max_epu64(__A, __B),
1189                                              (__v8di)_mm512_setzero_si512());
1190 }
1191 
/* Forwards A and B (as __v8df) to __builtin_ia32_minpd512, with R cast to int
 * as the final operand. */
#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

/* _mm512_min_round_pd result and W passed through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_min_round_pd((A), (B), (R)), \
      (__v8df)(W)))

/* As _mm512_mask_min_round_pd, with _mm512_setzero_pd() in place of W. */
#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_min_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
1205 
1206 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1207 _mm512_min_pd(__m512d __A, __m512d __B)
1208 {
1209   return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
1210                                            _MM_FROUND_CUR_DIRECTION);
1211 }
1212 
1213 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1214 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
1215 {
1216   return (__m512d)__builtin_ia32_selectpd_512(__U,
1217                                               (__v8df)_mm512_min_pd(__A, __B),
1218                                               (__v8df)__W);
1219 }
1220 
1221 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1222 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
1223 {
1224   return (__m512d)__builtin_ia32_selectpd_512(__U,
1225                                               (__v8df)_mm512_min_pd(__A, __B),
1226                                               (__v8df)_mm512_setzero_pd());
1227 }
1228 
/* Forwards A and B (as __v16sf) to __builtin_ia32_minps512, with R cast to
 * int as the final operand. */
#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

/* _mm512_min_round_ps result and W passed through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
      (__v16sf)(W)))

/* As _mm512_mask_min_round_ps, with _mm512_setzero_ps() in place of W. */
#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
1242 
1243 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1244 _mm512_min_ps(__m512 __A, __m512 __B)
1245 {
1246   return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
1247                                           _MM_FROUND_CUR_DIRECTION);
1248 }
1249 
1250 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1251 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1252 {
1253   return (__m512)__builtin_ia32_selectps_512(__U,
1254                                              (__v16sf)_mm512_min_ps(__A, __B),
1255                                              (__v16sf)__W);
1256 }
1257 
1258 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1259 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
1260 {
1261   return (__m512)__builtin_ia32_selectps_512(__U,
1262                                              (__v16sf)_mm512_min_ps(__A, __B),
1263                                              (__v16sf)_mm512_setzero_ps());
1264 }
1265 
1266 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1267 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1268   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1269                 (__v4sf) __B,
1270                 (__v4sf) __W,
1271                 (__mmask8) __U,
1272                 _MM_FROUND_CUR_DIRECTION);
1273 }
1274 
1275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1276 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1277   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1278                 (__v4sf) __B,
1279                 (__v4sf)  _mm_setzero_ps (),
1280                 (__mmask8) __U,
1281                 _MM_FROUND_CUR_DIRECTION);
1282 }
1283 
/* Calls __builtin_ia32_minss_round_mask with _mm_setzero_ps() as the third
 * operand, (__mmask8)-1 as the mask, and R cast to int. */
#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

/* Calls __builtin_ia32_minss_round_mask with W as the third operand and U as
 * the mask. */
#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))

/* As _mm_mask_min_round_ss, with _mm_setzero_ps() in place of W. */
#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
1301 
1302 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1303 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1304   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1305                 (__v2df) __B,
1306                 (__v2df) __W,
1307                 (__mmask8) __U,
1308                 _MM_FROUND_CUR_DIRECTION);
1309 }
1310 
1311 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1312 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1313   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1314                 (__v2df) __B,
1315                 (__v2df)  _mm_setzero_pd (),
1316                 (__mmask8) __U,
1317                 _MM_FROUND_CUR_DIRECTION);
1318 }
1319 
/* Calls __builtin_ia32_minsd_round_mask with _mm_setzero_pd() as the third
 * operand, (__mmask8)-1 as the mask, and R cast to int. */
#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

/* Calls __builtin_ia32_minsd_round_mask with W as the third operand and U as
 * the mask. */
#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))

/* As _mm_mask_min_round_sd, with _mm_setzero_pd() in place of W. */
#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
1337 
1338 static __inline __m512i
1339 __DEFAULT_FN_ATTRS512
1340 _mm512_min_epi32(__m512i __A, __m512i __B)
1341 {
1342   return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
1343 }
1344 
1345 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1346 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1347 {
1348   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1349                                             (__v16si)_mm512_min_epi32(__A, __B),
1350                                             (__v16si)__W);
1351 }
1352 
1353 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1354 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
1355 {
1356   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1357                                             (__v16si)_mm512_min_epi32(__A, __B),
1358                                             (__v16si)_mm512_setzero_si512());
1359 }
1360 
1361 static __inline __m512i __DEFAULT_FN_ATTRS512
1362 _mm512_min_epu32(__m512i __A, __m512i __B)
1363 {
1364   return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
1365 }
1366 
1367 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1368 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1369 {
1370   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1371                                             (__v16si)_mm512_min_epu32(__A, __B),
1372                                             (__v16si)__W);
1373 }
1374 
1375 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1376 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
1377 {
1378   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1379                                             (__v16si)_mm512_min_epu32(__A, __B),
1380                                             (__v16si)_mm512_setzero_si512());
1381 }
1382 
1383 static __inline __m512i __DEFAULT_FN_ATTRS512
1384 _mm512_min_epi64(__m512i __A, __m512i __B)
1385 {
1386   return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
1387 }
1388 
1389 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1390 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1391 {
1392   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1393                                              (__v8di)_mm512_min_epi64(__A, __B),
1394                                              (__v8di)__W);
1395 }
1396 
1397 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1398 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
1399 {
1400   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1401                                              (__v8di)_mm512_min_epi64(__A, __B),
1402                                              (__v8di)_mm512_setzero_si512());
1403 }
1404 
1405 static __inline __m512i __DEFAULT_FN_ATTRS512
1406 _mm512_min_epu64(__m512i __A, __m512i __B)
1407 {
1408   return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
1409 }
1410 
1411 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1412 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1413 {
1414   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1415                                              (__v8di)_mm512_min_epu64(__A, __B),
1416                                              (__v8di)__W);
1417 }
1418 
1419 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1420 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
1421 {
1422   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1423                                              (__v8di)_mm512_min_epu64(__A, __B),
1424                                              (__v8di)_mm512_setzero_si512());
1425 }
1426 
1427 static __inline __m512i __DEFAULT_FN_ATTRS512
1428 _mm512_mul_epi32(__m512i __X, __m512i __Y)
1429 {
1430   return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
1431 }
1432 
1433 static __inline __m512i __DEFAULT_FN_ATTRS512
1434 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
1435 {
1436   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1437                                              (__v8di)_mm512_mul_epi32(__X, __Y),
1438                                              (__v8di)__W);
1439 }
1440 
1441 static __inline __m512i __DEFAULT_FN_ATTRS512
1442 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
1443 {
1444   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1445                                              (__v8di)_mm512_mul_epi32(__X, __Y),
1446                                              (__v8di)_mm512_setzero_si512 ());
1447 }
1448 
1449 static __inline __m512i __DEFAULT_FN_ATTRS512
1450 _mm512_mul_epu32(__m512i __X, __m512i __Y)
1451 {
1452   return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
1453 }
1454 
1455 static __inline __m512i __DEFAULT_FN_ATTRS512
1456 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
1457 {
1458   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1459                                              (__v8di)_mm512_mul_epu32(__X, __Y),
1460                                              (__v8di)__W);
1461 }
1462 
1463 static __inline __m512i __DEFAULT_FN_ATTRS512
1464 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
1465 {
1466   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1467                                              (__v8di)_mm512_mul_epu32(__X, __Y),
1468                                              (__v8di)_mm512_setzero_si512 ());
1469 }
1470 
1471 static __inline __m512i __DEFAULT_FN_ATTRS512
1472 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
1473 {
1474   return (__m512i) ((__v16su) __A * (__v16su) __B);
1475 }
1476 
1477 static __inline __m512i __DEFAULT_FN_ATTRS512
1478 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
1479 {
1480   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1481                                              (__v16si)_mm512_mullo_epi32(__A, __B),
1482                                              (__v16si)_mm512_setzero_si512());
1483 }
1484 
1485 static __inline __m512i __DEFAULT_FN_ATTRS512
1486 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1487 {
1488   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1489                                              (__v16si)_mm512_mullo_epi32(__A, __B),
1490                                              (__v16si)__W);
1491 }
1492 
1493 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1494 _mm512_mullox_epi64 (__m512i __A, __m512i __B) {
1495   return (__m512i) ((__v8du) __A * (__v8du) __B);
1496 }
1497 
1498 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1499 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
1500   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1501                                              (__v8di)_mm512_mullox_epi64(__A, __B),
1502                                              (__v8di)__W);
1503 }
1504 
/* Forwards A (as __v8df) to __builtin_ia32_sqrtpd512, with R cast to int as
 * the final operand. */
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512( \
      (__v8df)(__m512d)(A), (int)(R)))

/* _mm512_sqrt_round_pd result and W passed through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sqrt_round_pd((A), (R)), \
      (__v8df)(__m512d)(W)))

/* As _mm512_mask_sqrt_round_pd, with _mm512_setzero_pd() in place of W. */
#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sqrt_round_pd((A), (R)), \
      (__v8df)_mm512_setzero_pd()))
1517 
1518 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1519 _mm512_sqrt_pd(__m512d __A)
1520 {
1521   return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
1522                                            _MM_FROUND_CUR_DIRECTION);
1523 }
1524 
1525 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1526 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
1527 {
1528   return (__m512d)__builtin_ia32_selectpd_512(__U,
1529                                               (__v8df)_mm512_sqrt_pd(__A),
1530                                               (__v8df)__W);
1531 }
1532 
1533 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1534 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
1535 {
1536   return (__m512d)__builtin_ia32_selectpd_512(__U,
1537                                               (__v8df)_mm512_sqrt_pd(__A),
1538                                               (__v8df)_mm512_setzero_pd());
1539 }
1540 
/* Packed-float square root macros; R is forwarded as the int argument of
 * __builtin_ia32_sqrtps512.  The mask/maskz forms blend the result with W or
 * with _mm512_setzero_ps() through __builtin_ia32_selectps_512 under U. */
#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512( \
      (__v16sf)(__m512)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
      (__v16sf)_mm512_setzero_ps()))
1553 
1554 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1555 _mm512_sqrt_ps(__m512 __A)
1556 {
1557   return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
1558                                           _MM_FROUND_CUR_DIRECTION);
1559 }
1560 
1561 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1562 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
1563 {
1564   return (__m512)__builtin_ia32_selectps_512(__U,
1565                                              (__v16sf)_mm512_sqrt_ps(__A),
1566                                              (__v16sf)__W);
1567 }
1568 
1569 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1570 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
1571 {
1572   return (__m512)__builtin_ia32_selectps_512(__U,
1573                                              (__v16sf)_mm512_sqrt_ps(__A),
1574                                              (__v16sf)_mm512_setzero_ps());
1575 }
1576 
1577 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1578 _mm512_rsqrt14_pd(__m512d __A)
1579 {
1580   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1581                  (__v8df)
1582                  _mm512_setzero_pd (),
1583                  (__mmask8) -1);}
1584 
1585 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1586 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
1587 {
1588   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1589                   (__v8df) __W,
1590                   (__mmask8) __U);
1591 }
1592 
1593 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1594 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
1595 {
1596   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1597                   (__v8df)
1598                   _mm512_setzero_pd (),
1599                   (__mmask8) __U);
1600 }
1601 
1602 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1603 _mm512_rsqrt14_ps(__m512 __A)
1604 {
1605   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1606                 (__v16sf)
1607                 _mm512_setzero_ps (),
1608                 (__mmask16) -1);
1609 }
1610 
1611 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1612 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
1613 {
1614   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1615                  (__v16sf) __W,
1616                  (__mmask16) __U);
1617 }
1618 
1619 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1620 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
1621 {
1622   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1623                  (__v16sf)
1624                  _mm512_setzero_ps (),
1625                  (__mmask16) __U);
1626 }
1627 
1628 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
1629 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
1630 {
1631   return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1632              (__v4sf) __B,
1633              (__v4sf)
1634              _mm_setzero_ps (),
1635              (__mmask8) -1);
1636 }
1637 
1638 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1639 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
1640 {
1641  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1642           (__v4sf) __B,
1643           (__v4sf) __W,
1644           (__mmask8) __U);
1645 }
1646 
1647 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1648 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
1649 {
1650  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1651           (__v4sf) __B,
1652           (__v4sf) _mm_setzero_ps (),
1653           (__mmask8) __U);
1654 }
1655 
1656 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
1657 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
1658 {
1659   return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
1660               (__v2df) __B,
1661               (__v2df)
1662               _mm_setzero_pd (),
1663               (__mmask8) -1);
1664 }
1665 
1666 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1667 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
1668 {
1669  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
1670           (__v2df) __B,
1671           (__v2df) __W,
1672           (__mmask8) __U);
1673 }
1674 
1675 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1676 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
1677 {
1678  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
1679           (__v2df) __B,
1680           (__v2df) _mm_setzero_pd (),
1681           (__mmask8) __U);
1682 }
1683 
1684 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1685 _mm512_rcp14_pd(__m512d __A)
1686 {
1687   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1688                (__v8df)
1689                _mm512_setzero_pd (),
1690                (__mmask8) -1);
1691 }
1692 
1693 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1694 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
1695 {
1696   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1697                 (__v8df) __W,
1698                 (__mmask8) __U);
1699 }
1700 
1701 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1702 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
1703 {
1704   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1705                 (__v8df)
1706                 _mm512_setzero_pd (),
1707                 (__mmask8) __U);
1708 }
1709 
1710 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1711 _mm512_rcp14_ps(__m512 __A)
1712 {
1713   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1714               (__v16sf)
1715               _mm512_setzero_ps (),
1716               (__mmask16) -1);
1717 }
1718 
1719 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1720 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
1721 {
1722   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1723                    (__v16sf) __W,
1724                    (__mmask16) __U);
1725 }
1726 
1727 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1728 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
1729 {
1730   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1731                    (__v16sf)
1732                    _mm512_setzero_ps (),
1733                    (__mmask16) __U);
1734 }
1735 
1736 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
1737 _mm_rcp14_ss(__m128 __A, __m128 __B)
1738 {
1739   return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1740                  (__v4sf) __B,
1741                  (__v4sf)
1742                  _mm_setzero_ps (),
1743                  (__mmask8) -1);
1744 }
1745 
1746 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1747 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
1748 {
1749  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1750           (__v4sf) __B,
1751           (__v4sf) __W,
1752           (__mmask8) __U);
1753 }
1754 
1755 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1756 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
1757 {
1758  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1759           (__v4sf) __B,
1760           (__v4sf) _mm_setzero_ps (),
1761           (__mmask8) __U);
1762 }
1763 
1764 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
1765 _mm_rcp14_sd(__m128d __A, __m128d __B)
1766 {
1767   return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
1768             (__v2df) __B,
1769             (__v2df)
1770             _mm_setzero_pd (),
1771             (__mmask8) -1);
1772 }
1773 
1774 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1775 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
1776 {
1777  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
1778           (__v2df) __B,
1779           (__v2df) __W,
1780           (__mmask8) __U);
1781 }
1782 
1783 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1784 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
1785 {
1786  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
1787           (__v2df) __B,
1788           (__v2df) _mm_setzero_pd (),
1789           (__mmask8) __U);
1790 }
1791 
1792 static __inline __m512 __DEFAULT_FN_ATTRS512
1793 _mm512_floor_ps(__m512 __A)
1794 {
1795   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1796                                                   _MM_FROUND_FLOOR,
1797                                                   (__v16sf) __A, (unsigned short)-1,
1798                                                   _MM_FROUND_CUR_DIRECTION);
1799 }
1800 
1801 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1802 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
1803 {
1804   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1805                    _MM_FROUND_FLOOR,
1806                    (__v16sf) __W, __U,
1807                    _MM_FROUND_CUR_DIRECTION);
1808 }
1809 
1810 static __inline __m512d __DEFAULT_FN_ATTRS512
1811 _mm512_floor_pd(__m512d __A)
1812 {
1813   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1814                                                    _MM_FROUND_FLOOR,
1815                                                    (__v8df) __A, (unsigned char)-1,
1816                                                    _MM_FROUND_CUR_DIRECTION);
1817 }
1818 
1819 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1820 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
1821 {
1822   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1823                 _MM_FROUND_FLOOR,
1824                 (__v8df) __W, __U,
1825                 _MM_FROUND_CUR_DIRECTION);
1826 }
1827 
1828 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1829 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
1830 {
1831   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1832                    _MM_FROUND_CEIL,
1833                    (__v16sf) __W, __U,
1834                    _MM_FROUND_CUR_DIRECTION);
1835 }
1836 
1837 static __inline __m512 __DEFAULT_FN_ATTRS512
1838 _mm512_ceil_ps(__m512 __A)
1839 {
1840   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1841                                                   _MM_FROUND_CEIL,
1842                                                   (__v16sf) __A, (unsigned short)-1,
1843                                                   _MM_FROUND_CUR_DIRECTION);
1844 }
1845 
1846 static __inline __m512d __DEFAULT_FN_ATTRS512
1847 _mm512_ceil_pd(__m512d __A)
1848 {
1849   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1850                                                    _MM_FROUND_CEIL,
1851                                                    (__v8df) __A, (unsigned char)-1,
1852                                                    _MM_FROUND_CUR_DIRECTION);
1853 }
1854 
1855 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1856 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
1857 {
1858   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1859                 _MM_FROUND_CEIL,
1860                 (__v8df) __W, __U,
1861                 _MM_FROUND_CUR_DIRECTION);
1862 }
1863 
1864 static __inline __m512i __DEFAULT_FN_ATTRS512
1865 _mm512_abs_epi64(__m512i __A)
1866 {
1867   return (__m512i)__builtin_elementwise_abs((__v8di)__A);
1868 }
1869 
1870 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1871 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
1872 {
1873   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1874                                              (__v8di)_mm512_abs_epi64(__A),
1875                                              (__v8di)__W);
1876 }
1877 
1878 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1879 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
1880 {
1881   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1882                                              (__v8di)_mm512_abs_epi64(__A),
1883                                              (__v8di)_mm512_setzero_si512());
1884 }
1885 
1886 static __inline __m512i __DEFAULT_FN_ATTRS512
1887 _mm512_abs_epi32(__m512i __A)
1888 {
1889   return (__m512i)__builtin_elementwise_abs((__v16si) __A);
1890 }
1891 
1892 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1893 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
1894 {
1895   return (__m512i)__builtin_ia32_selectd_512(__U,
1896                                              (__v16si)_mm512_abs_epi32(__A),
1897                                              (__v16si)__W);
1898 }
1899 
1900 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1901 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
1902 {
1903   return (__m512i)__builtin_ia32_selectd_512(__U,
1904                                              (__v16si)_mm512_abs_epi32(__A),
1905                                              (__v16si)_mm512_setzero_si512());
1906 }
1907 
1908 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1909 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1910   __A = _mm_add_ss(__A, __B);
1911   return __builtin_ia32_selectss_128(__U, __A, __W);
1912 }
1913 
1914 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1915 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1916   __A = _mm_add_ss(__A, __B);
1917   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
1918 }
1919 
/* Scalar-float add macros over __builtin_ia32_addss_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_ps() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_ps() and U. */
#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
1937 
1938 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1939 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1940   __A = _mm_add_sd(__A, __B);
1941   return __builtin_ia32_selectsd_128(__U, __A, __W);
1942 }
1943 
1944 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1945 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1946   __A = _mm_add_sd(__A, __B);
1947   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
1948 }
/* Scalar-double add macros over __builtin_ia32_addsd_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_pd() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_pd() and U. */
#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
1966 
1967 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1968 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
1969   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
1970                                               (__v8df)_mm512_add_pd(__A, __B),
1971                                               (__v8df)__W);
1972 }
1973 
1974 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1975 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
1976   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
1977                                               (__v8df)_mm512_add_pd(__A, __B),
1978                                               (__v8df)_mm512_setzero_pd());
1979 }
1980 
1981 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1982 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
1983   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
1984                                              (__v16sf)_mm512_add_ps(__A, __B),
1985                                              (__v16sf)__W);
1986 }
1987 
1988 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1989 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
1990   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
1991                                              (__v16sf)_mm512_add_ps(__A, __B),
1992                                              (__v16sf)_mm512_setzero_ps());
1993 }
1994 
/* Packed-double add macros; R is forwarded as the int argument of
 * __builtin_ia32_addpd512.  The mask/maskz forms blend the sum with W or with
 * _mm512_setzero_pd() through __builtin_ia32_selectpd_512 under U. */
#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_add_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))

#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_add_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2008 
/* Packed-float add macros; R is forwarded as the int argument of
 * __builtin_ia32_addps512.  The mask/maskz forms blend the sum with W or with
 * _mm512_setzero_ps() through __builtin_ia32_selectps_512 under U. */
#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2022 
2023 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2024 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2025   __A = _mm_sub_ss(__A, __B);
2026   return __builtin_ia32_selectss_128(__U, __A, __W);
2027 }
2028 
2029 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2030 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2031   __A = _mm_sub_ss(__A, __B);
2032   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2033 }
/* Scalar-float subtract macros over __builtin_ia32_subss_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_ps() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_ps() and U. */
#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
2051 
2052 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2053 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2054   __A = _mm_sub_sd(__A, __B);
2055   return __builtin_ia32_selectsd_128(__U, __A, __W);
2056 }
2057 
2058 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2059 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2060   __A = _mm_sub_sd(__A, __B);
2061   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2062 }
2063 
/* Scalar-double subtract macros over __builtin_ia32_subsd_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_pd() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_pd() and U. */
#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
2081 
2082 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2083 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2084   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2085                                               (__v8df)_mm512_sub_pd(__A, __B),
2086                                               (__v8df)__W);
2087 }
2088 
2089 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2090 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2091   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2092                                               (__v8df)_mm512_sub_pd(__A, __B),
2093                                               (__v8df)_mm512_setzero_pd());
2094 }
2095 
2096 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2097 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2098   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2099                                              (__v16sf)_mm512_sub_ps(__A, __B),
2100                                              (__v16sf)__W);
2101 }
2102 
2103 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2104 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2105   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2106                                              (__v16sf)_mm512_sub_ps(__A, __B),
2107                                              (__v16sf)_mm512_setzero_ps());
2108 }
2109 
/* Packed-double subtract macros; R is forwarded as the int argument of
 * __builtin_ia32_subpd512.  The mask/maskz forms blend the difference with W
 * or with _mm512_setzero_pd() through __builtin_ia32_selectpd_512 under U. */
#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))

#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2123 
/* Packed-float subtract macros; R is forwarded as the int argument of
 * __builtin_ia32_subps512.  The mask/maskz forms blend the difference with W
 * or with _mm512_setzero_ps() through __builtin_ia32_selectps_512 under U. */
#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2137 
2138 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2139 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2140   __A = _mm_mul_ss(__A, __B);
2141   return __builtin_ia32_selectss_128(__U, __A, __W);
2142 }
2143 
2144 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2145 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2146   __A = _mm_mul_ss(__A, __B);
2147   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2148 }
/* Scalar-float multiply macros over __builtin_ia32_mulss_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_ps() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_ps() and U. */
#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
2166 
2167 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2168 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2169   __A = _mm_mul_sd(__A, __B);
2170   return __builtin_ia32_selectsd_128(__U, __A, __W);
2171 }
2172 
2173 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2174 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2175   __A = _mm_mul_sd(__A, __B);
2176   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2177 }
2178 
/* Scalar-double multiply macros over __builtin_ia32_mulsd_round_mask; R is
 * forwarded as its int argument.  The plain form passes _mm_setzero_pd() and
 * (__mmask8)-1, the mask form passes W and U, the maskz form passes
 * _mm_setzero_pd() and U. */
#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
2196 
2197 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2198 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2199   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2200                                               (__v8df)_mm512_mul_pd(__A, __B),
2201                                               (__v8df)__W);
2202 }
2203 
2204 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2205 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2206   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2207                                               (__v8df)_mm512_mul_pd(__A, __B),
2208                                               (__v8df)_mm512_setzero_pd());
2209 }
2210 
2211 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2212 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2213   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2214                                              (__v16sf)_mm512_mul_ps(__A, __B),
2215                                              (__v16sf)__W);
2216 }
2217 
2218 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2219 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2220   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2221                                              (__v16sf)_mm512_mul_ps(__A, __B),
2222                                              (__v16sf)_mm512_setzero_ps());
2223 }
2224 
/* Packed-double multiply macros; R is forwarded as the int argument of
 * __builtin_ia32_mulpd512.  The mask/maskz forms blend the product with W or
 * with _mm512_setzero_pd() through __builtin_ia32_selectpd_512 under U. */
#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))

#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2238 
/* Packed-float multiply macros; R is forwarded as the int argument of
 * __builtin_ia32_mulps512.  The mask/maskz forms blend the product with W or
 * with _mm512_setzero_ps() through __builtin_ia32_selectps_512 under U. */
#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2252 
2253 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2254 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2255   __A = _mm_div_ss(__A, __B);
2256   return __builtin_ia32_selectss_128(__U, __A, __W);
2257 }
2258 
2259 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2260 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2261   __A = _mm_div_ss(__A, __B);
2262   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2263 }
2264 
/* Expands to __builtin_ia32_divss_round_mask on A and B with a zero vector
 * as the third operand, an all-ones mask, and R as the trailing int. */
#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
2270 
/* Expands to __builtin_ia32_divss_round_mask on A and B with W as the third
 * operand, U as the mask, and R as the trailing int. */
#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
2276 
/* Expands to __builtin_ia32_divss_round_mask on A and B with a zero vector
 * as the third operand, U as the mask, and R as the trailing int. */
#define _mm_maskz_div_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
2282 
2283 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2284 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2285   __A = _mm_div_sd(__A, __B);
2286   return __builtin_ia32_selectsd_128(__U, __A, __W);
2287 }
2288 
2289 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2290 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2291   __A = _mm_div_sd(__A, __B);
2292   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2293 }
2294 
/* Expands to __builtin_ia32_divsd_round_mask on A and B with a zero vector
 * as the third operand, an all-ones mask, and R as the trailing int. */
#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
2300 
/* Expands to __builtin_ia32_divsd_round_mask on A and B with W as the third
 * operand, U as the mask, and R as the trailing int. */
#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
2306 
/* Expands to __builtin_ia32_divsd_round_mask on A and B with a zero vector
 * as the third operand, U as the mask, and R as the trailing int. */
#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
2312 
2313 static __inline __m512d __DEFAULT_FN_ATTRS512
2314 _mm512_div_pd(__m512d __a, __m512d __b)
2315 {
2316   return (__m512d)((__v8df)__a/(__v8df)__b);
2317 }
2318 
2319 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2320 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2321   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2322                                               (__v8df)_mm512_div_pd(__A, __B),
2323                                               (__v8df)__W);
2324 }
2325 
2326 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2327 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2328   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2329                                               (__v8df)_mm512_div_pd(__A, __B),
2330                                               (__v8df)_mm512_setzero_pd());
2331 }
2332 
2333 static __inline __m512 __DEFAULT_FN_ATTRS512
2334 _mm512_div_ps(__m512 __a, __m512 __b)
2335 {
2336   return (__m512)((__v16sf)__a/(__v16sf)__b);
2337 }
2338 
2339 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2340 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2341   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2342                                              (__v16sf)_mm512_div_ps(__A, __B),
2343                                              (__v16sf)__W);
2344 }
2345 
2346 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2347 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2348   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2349                                              (__v16sf)_mm512_div_ps(__A, __B),
2350                                              (__v16sf)_mm512_setzero_ps());
2351 }
2352 
/* Expands to __builtin_ia32_divpd512 on A and B (viewed as __v8df), with R
 * forwarded as the trailing int operand; the result is cast to __m512d. */
#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
2356 
/* Masked form: __builtin_ia32_selectpd_512 chooses, under mask U, between
 * the result of _mm512_div_round_pd(A, B, R) and W. */
#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_div_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))
2361 
/* Zero-masked form: __builtin_ia32_selectpd_512 chooses, under mask U,
 * between the result of _mm512_div_round_pd(A, B, R) and a zero vector. */
#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_div_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2366 
/* Expands to __builtin_ia32_divps512 on A and B (viewed as __v16sf), with R
 * forwarded as the trailing int operand; the result is cast to __m512. */
#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
2370 
/* Masked form: __builtin_ia32_selectps_512 chooses, under mask U, between
 * the result of _mm512_div_round_ps(A, B, R) and W. */
#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))
2375 
/* Zero-masked form: __builtin_ia32_selectps_512 chooses, under mask U,
 * between the result of _mm512_div_round_ps(A, B, R) and a zero vector. */
#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2380 
/* Expands to __builtin_ia32_rndscaleps_mask on vector A and int immediate
 * imm, with _mm512_undefined_ps() as the third operand, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the trailing operand. */
#define _mm512_roundscale_ps(A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
2386 
/* Argument order is (W, U, A, imm): A and imm are the first two builtin
 * operands, W the third, U the mask; the trailing operand is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_roundscale_ps(W, U, A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)(__m512)(W), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
2391 
/* Argument order is (U, A, imm): A and imm are the first two builtin
 * operands, a zero vector the third, U the mask; the trailing operand is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_roundscale_ps(U, A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
2397 
/* Argument order is (W, U, A, imm, R): A and imm are the first two builtin
 * operands, W the third, U the mask, and R the trailing int operand. */
#define _mm512_mask_roundscale_round_ps(W, U, A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))
2402 
/* Argument order is (U, A, imm, R): A and imm are the first two builtin
 * operands, a zero vector the third, U the mask, and R the trailing int. */
#define _mm512_maskz_roundscale_round_ps(U, A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))
2407 
/* Expands to __builtin_ia32_rndscaleps_mask on A and imm, with
 * _mm512_undefined_ps() as the third operand, an all-ones mask, and R as the
 * trailing int operand. */
#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, (int)(R)))
2412 
/* Expands to __builtin_ia32_rndscalepd_mask on vector A and int immediate
 * imm, with _mm512_undefined_pd() as the third operand, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the trailing operand. */
#define _mm512_roundscale_pd(A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
2418 
/* Argument order is (W, U, A, imm): A and imm are the first two builtin
 * operands, W the third, U the mask; the trailing operand is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_roundscale_pd(W, U, A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)(__m512d)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
2423 
/* Argument order is (U, A, imm): A and imm are the first two builtin
 * operands, a zero vector the third, U the mask; the trailing operand is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_roundscale_pd(U, A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
2429 
/* Argument order is (W, U, A, imm, R): A and imm are the first two builtin
 * operands, W the third, U the mask, and R the trailing int operand. */
#define _mm512_mask_roundscale_round_pd(W, U, A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)(__m512d)(W), \
      (__mmask8)(U), (int)(R)))
2434 
/* Argument order is (U, A, imm, R): A and imm are the first two builtin
 * operands, a zero vector the third, U the mask, and R the trailing int. */
#define _mm512_maskz_roundscale_round_pd(U, A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
2439 
/* Expands to __builtin_ia32_rndscalepd_mask on A and imm, with
 * _mm512_undefined_pd() as the third operand, an all-ones mask, and R as the
 * trailing int operand. */
#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, (int)(R)))
2444 
/* Forwards A, B, C to __builtin_ia32_vfmaddpd512_mask with an all-ones mask
 * and R as the trailing int operand. */
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))
2450 
2451 
/* Forwards A, B, C to __builtin_ia32_vfmaddpd512_mask with mask U and R as
 * the trailing int operand. */
#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2457 
2458 
/* Forwards A, B, C to __builtin_ia32_vfmaddpd512_mask3 with mask U and R as
 * the trailing int operand. */
#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2464 
2465 
/* Forwards A, B, C to __builtin_ia32_vfmaddpd512_maskz with mask U and R as
 * the trailing int operand. */
#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2471 
2472 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddpd512_mask with an
 * all-ones mask and R as the trailing int operand. */
#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))
2478 
2479 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddpd512_mask with
 * mask U and R as the trailing int operand. */
#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2485 
2486 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddpd512_maskz with
 * mask U and R as the trailing int operand. */
#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2492 
2493 
/* Forwards the negated A, plus B and C, to __builtin_ia32_vfmaddpd512_mask
 * with an all-ones mask and R as the trailing int operand. */
#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))
2499 
2500 
/* Forwards the negated A, plus B and C, to __builtin_ia32_vfmaddpd512_mask3
 * with mask U and R as the trailing int operand. */
#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2506 
2507 
/* Forwards the negated A, plus B and C, to __builtin_ia32_vfmaddpd512_maskz
 * with mask U and R as the trailing int operand. */
#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2513 
2514 
/* Forwards the negated A, B, and the negated C to
 * __builtin_ia32_vfmaddpd512_mask with an all-ones mask and R as the
 * trailing int operand. */
#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))
2520 
2521 
/* Forwards the negated A, B, and the negated C to
 * __builtin_ia32_vfmaddpd512_maskz with mask U and R as the trailing int
 * operand. */
#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2527 
2528 
2529 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2530 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
2531 {
2532   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2533                                                     (__v8df) __B,
2534                                                     (__v8df) __C,
2535                                                     (__mmask8) -1,
2536                                                     _MM_FROUND_CUR_DIRECTION);
2537 }
2538 
2539 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2540 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2541 {
2542   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2543                                                     (__v8df) __B,
2544                                                     (__v8df) __C,
2545                                                     (__mmask8) __U,
2546                                                     _MM_FROUND_CUR_DIRECTION);
2547 }
2548 
2549 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2550 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2551 {
2552   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
2553                                                      (__v8df) __B,
2554                                                      (__v8df) __C,
2555                                                      (__mmask8) __U,
2556                                                      _MM_FROUND_CUR_DIRECTION);
2557 }
2558 
2559 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2560 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2561 {
2562   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
2563                                                      (__v8df) __B,
2564                                                      (__v8df) __C,
2565                                                      (__mmask8) __U,
2566                                                      _MM_FROUND_CUR_DIRECTION);
2567 }
2568 
2569 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2570 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
2571 {
2572   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2573                                                     (__v8df) __B,
2574                                                     -(__v8df) __C,
2575                                                     (__mmask8) -1,
2576                                                     _MM_FROUND_CUR_DIRECTION);
2577 }
2578 
2579 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2580 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2581 {
2582   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2583                                                     (__v8df) __B,
2584                                                     -(__v8df) __C,
2585                                                     (__mmask8) __U,
2586                                                     _MM_FROUND_CUR_DIRECTION);
2587 }
2588 
2589 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2590 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2591 {
2592   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
2593                                                      (__v8df) __B,
2594                                                      -(__v8df) __C,
2595                                                      (__mmask8) __U,
2596                                                      _MM_FROUND_CUR_DIRECTION);
2597 }
2598 
2599 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2600 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
2601 {
2602   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2603                                                     -(__v8df) __B,
2604                                                     (__v8df) __C,
2605                                                     (__mmask8) -1,
2606                                                     _MM_FROUND_CUR_DIRECTION);
2607 }
2608 
2609 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2610 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2611 {
2612   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
2613                                                      (__v8df) __B,
2614                                                      (__v8df) __C,
2615                                                      (__mmask8) __U,
2616                                                      _MM_FROUND_CUR_DIRECTION);
2617 }
2618 
2619 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2620 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2621 {
2622   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
2623                                                      (__v8df) __B,
2624                                                      (__v8df) __C,
2625                                                      (__mmask8) __U,
2626                                                      _MM_FROUND_CUR_DIRECTION);
2627 }
2628 
2629 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2630 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
2631 {
2632   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2633                                                     -(__v8df) __B,
2634                                                     -(__v8df) __C,
2635                                                     (__mmask8) -1,
2636                                                     _MM_FROUND_CUR_DIRECTION);
2637 }
2638 
2639 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2640 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2641 {
2642   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
2643                                                      (__v8df) __B,
2644                                                      -(__v8df) __C,
2645                                                      (__mmask8) __U,
2646                                                      _MM_FROUND_CUR_DIRECTION);
2647 }
2648 
/* Forwards A, B, C to __builtin_ia32_vfmaddps512_mask with an all-ones mask
 * and R as the trailing int operand. */
#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))
2654 
2655 
/* Forwards A, B, C to __builtin_ia32_vfmaddps512_mask with mask U and R as
 * the trailing int operand. */
#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2661 
2662 
/* Forwards A, B, C to __builtin_ia32_vfmaddps512_mask3 with mask U and R as
 * the trailing int operand. */
#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2668 
2669 
/* Forwards A, B, C to __builtin_ia32_vfmaddps512_maskz with mask U and R as
 * the trailing int operand. */
#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2675 
2676 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddps512_mask with an
 * all-ones mask and R as the trailing int operand. */
#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))
2682 
2683 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddps512_mask with
 * mask U and R as the trailing int operand. */
#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2689 
2690 
/* Forwards A, B and the negated C to __builtin_ia32_vfmaddps512_maskz with
 * mask U and R as the trailing int operand. */
#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2696 
2697 
/* Forwards A, the negated B, and C to __builtin_ia32_vfmaddps512_mask with
 * an all-ones mask and R as the trailing int operand. */
#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))
2703 
2704 
/* Forwards the negated A, plus B and C, to __builtin_ia32_vfmaddps512_mask3
 * with mask U and R as the trailing int operand. */
#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2710 
2711 
/* Forwards the negated A, plus B and C, to __builtin_ia32_vfmaddps512_maskz
 * with mask U and R as the trailing int operand. */
#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2717 
2718 
/* Forwards A, the negated B, and the negated C to
 * __builtin_ia32_vfmaddps512_mask with an all-ones mask and R as the
 * trailing int operand. */
#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))
2724 
2725 
/* Forwards the negated A, B, and the negated C to
 * __builtin_ia32_vfmaddps512_maskz with mask U and R as the trailing int
 * operand. */
#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2731 
2732 
2733 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2734 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
2735 {
2736   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2737                                                    (__v16sf) __B,
2738                                                    (__v16sf) __C,
2739                                                    (__mmask16) -1,
2740                                                    _MM_FROUND_CUR_DIRECTION);
2741 }
2742 
2743 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2744 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
2745 {
2746   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2747                                                    (__v16sf) __B,
2748                                                    (__v16sf) __C,
2749                                                    (__mmask16) __U,
2750                                                    _MM_FROUND_CUR_DIRECTION);
2751 }
2752 
2753 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2754 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
2755 {
2756   return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
2757                                                     (__v16sf) __B,
2758                                                     (__v16sf) __C,
2759                                                     (__mmask16) __U,
2760                                                     _MM_FROUND_CUR_DIRECTION);
2761 }
2762 
2763 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2764 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2765 {
2766   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
2767                                                     (__v16sf) __B,
2768                                                     (__v16sf) __C,
2769                                                     (__mmask16) __U,
2770                                                     _MM_FROUND_CUR_DIRECTION);
2771 }
2772 
2773 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2774 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
2775 {
2776   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2777                                                    (__v16sf) __B,
2778                                                    -(__v16sf) __C,
2779                                                    (__mmask16) -1,
2780                                                    _MM_FROUND_CUR_DIRECTION);
2781 }
2782 
2783 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2784 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
2785 {
2786   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2787                                                    (__v16sf) __B,
2788                                                    -(__v16sf) __C,
2789                                                    (__mmask16) __U,
2790                                                    _MM_FROUND_CUR_DIRECTION);
2791 }
2792 
2793 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2794 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2795 {
2796   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
2797                                                     (__v16sf) __B,
2798                                                     -(__v16sf) __C,
2799                                                     (__mmask16) __U,
2800                                                     _MM_FROUND_CUR_DIRECTION);
2801 }
2802 
2803 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2804 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
2805 {
2806   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2807                                                    -(__v16sf) __B,
2808                                                    (__v16sf) __C,
2809                                                    (__mmask16) -1,
2810                                                    _MM_FROUND_CUR_DIRECTION);
2811 }
2812 
2813 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2814 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
2815 {
2816   return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
2817                                                     (__v16sf) __B,
2818                                                     (__v16sf) __C,
2819                                                     (__mmask16) __U,
2820                                                     _MM_FROUND_CUR_DIRECTION);
2821 }
2822 
2823 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2824 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2825 {
2826   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
2827                                                     (__v16sf) __B,
2828                                                     (__v16sf) __C,
2829                                                     (__mmask16) __U,
2830                                                     _MM_FROUND_CUR_DIRECTION);
2831 }
2832 
2833 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2834 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
2835 {
2836   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2837                                                    -(__v16sf) __B,
2838                                                    -(__v16sf) __C,
2839                                                    (__mmask16) -1,
2840                                                    _MM_FROUND_CUR_DIRECTION);
2841 }
2842 
2843 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2844 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2845 {
2846   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
2847                                                     (__v16sf) __B,
2848                                                     -(__v16sf) __C,
2849                                                     (__mmask16) __U,
2850                                                     _MM_FROUND_CUR_DIRECTION);
2851 }
2852 
/* Forwards A, B, C to __builtin_ia32_vfmaddsubpd512_mask with an all-ones
 * mask and R as the trailing int operand. */
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))
2858 
2859 
/* Forwards A, B, C to __builtin_ia32_vfmaddsubpd512_mask with mask U and R
 * as the trailing int operand. */
#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2865 
2866 
/* Forwards A, B, C to __builtin_ia32_vfmaddsubpd512_mask3 with mask U and R
 * as the trailing int operand. */
#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2872 
2873 
2874 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
2875   ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
2876                                                 (__v8df)(__m512d)(B), \
2877                                                 (__v8df)(__m512d)(C), \
2878                                                 (__mmask8)(U), (int)(R)))
2879 
2880 
/* fmsubadd on packed doubles with an explicit rounding argument R.
 * Expressed through the fmaddsub builtins by negating C; A and B are passed
 * unchanged.  The unmasked form supplies an all-ones __mmask8. */
#define _mm512_fmsubadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2900 
2901 
2902 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2903 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
2904 {
2905   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2906                                                       (__v8df) __B,
2907                                                       (__v8df) __C,
2908                                                       (__mmask8) -1,
2909                                                       _MM_FROUND_CUR_DIRECTION);
2910 }
2911 
2912 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2913 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2914 {
2915   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2916                                                       (__v8df) __B,
2917                                                       (__v8df) __C,
2918                                                       (__mmask8) __U,
2919                                                       _MM_FROUND_CUR_DIRECTION);
2920 }
2921 
2922 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2923 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2924 {
2925   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
2926                                                        (__v8df) __B,
2927                                                        (__v8df) __C,
2928                                                        (__mmask8) __U,
2929                                                        _MM_FROUND_CUR_DIRECTION);
2930 }
2931 
2932 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2933 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2934 {
2935   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
2936                                                        (__v8df) __B,
2937                                                        (__v8df) __C,
2938                                                        (__mmask8) __U,
2939                                                        _MM_FROUND_CUR_DIRECTION);
2940 }
2941 
2942 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2943 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
2944 {
2945   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2946                                                        (__v8df) __B,
2947                                                        -(__v8df) __C,
2948                                                        (__mmask8) -1,
2949                                                        _MM_FROUND_CUR_DIRECTION);
2950 }
2951 
2952 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2953 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2954 {
2955   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2956                                                        (__v8df) __B,
2957                                                        -(__v8df) __C,
2958                                                        (__mmask8) __U,
2959                                                        _MM_FROUND_CUR_DIRECTION);
2960 }
2961 
2962 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2963 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2964 {
2965   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
2966                                                         (__v8df) __B,
2967                                                         -(__v8df) __C,
2968                                                         (__mmask8) __U,
2969                                                         _MM_FROUND_CUR_DIRECTION);
2970 }
2971 
/* fmaddsub on packed floats with an explicit rounding argument R.
 * Each variant forwards A, B and C unchanged to the vfmaddsubps512 builtin
 * of the same flavour (_mask, _mask3 or _maskz).  The unmasked form supplies
 * an all-ones __mmask16; the others pass U. */
#define _mm512_fmaddsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask3( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2998 
2999 
/* fmsubadd on packed floats with an explicit rounding argument R.
 * Expressed through the fmaddsub builtins by negating C; A and B are passed
 * unchanged.  The unmasked form supplies an all-ones __mmask16. */
#define _mm512_fmsubadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
3019 
3020 
3021 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3022 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
3023 {
3024   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3025                                                       (__v16sf) __B,
3026                                                       (__v16sf) __C,
3027                                                       (__mmask16) -1,
3028                                                       _MM_FROUND_CUR_DIRECTION);
3029 }
3030 
3031 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3032 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3033 {
3034   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3035                                                       (__v16sf) __B,
3036                                                       (__v16sf) __C,
3037                                                       (__mmask16) __U,
3038                                                       _MM_FROUND_CUR_DIRECTION);
3039 }
3040 
3041 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3042 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3043 {
3044   return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
3045                                                        (__v16sf) __B,
3046                                                        (__v16sf) __C,
3047                                                        (__mmask16) __U,
3048                                                        _MM_FROUND_CUR_DIRECTION);
3049 }
3050 
3051 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3052 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3053 {
3054   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3055                                                        (__v16sf) __B,
3056                                                        (__v16sf) __C,
3057                                                        (__mmask16) __U,
3058                                                        _MM_FROUND_CUR_DIRECTION);
3059 }
3060 
3061 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3062 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
3063 {
3064   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3065                                                       (__v16sf) __B,
3066                                                       -(__v16sf) __C,
3067                                                       (__mmask16) -1,
3068                                                       _MM_FROUND_CUR_DIRECTION);
3069 }
3070 
3071 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3072 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3073 {
3074   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3075                                                       (__v16sf) __B,
3076                                                       -(__v16sf) __C,
3077                                                       (__mmask16) __U,
3078                                                       _MM_FROUND_CUR_DIRECTION);
3079 }
3080 
3081 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3082 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3083 {
3084   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3085                                                        (__v16sf) __B,
3086                                                        -(__v16sf) __C,
3087                                                        (__mmask16) __U,
3088                                                        _MM_FROUND_CUR_DIRECTION);
3089 }
3090 
/* mask3 form of fmsub on packed doubles with explicit rounding argument R:
 * A, B, C and mask U go straight to the vfmsubpd512 _mask3 builtin. */
#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
3096 
3097 
3098 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3099 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3100 {
3101   return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
3102                                                     (__v8df) __B,
3103                                                     (__v8df) __C,
3104                                                     (__mmask8) __U,
3105                                                     _MM_FROUND_CUR_DIRECTION);
3106 }
3107 
/* mask3 form of fmsub on packed floats with explicit rounding argument R:
 * A, B, C and mask U go straight to the vfmsubps512 _mask3 builtin. */
#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
3113 
3114 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3115 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3116 {
3117   return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
3118                                                    (__v16sf) __B,
3119                                                    (__v16sf) __C,
3120                                                    (__mmask16) __U,
3121                                                    _MM_FROUND_CUR_DIRECTION);
3122 }
3123 
/* mask3 form of fmsubadd on packed doubles with explicit rounding argument
 * R: A, B, C and mask U go straight to the vfmsubaddpd512 _mask3 builtin. */
#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
3129 
3130 
3131 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3132 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3133 {
3134   return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
3135                                                        (__v8df) __B,
3136                                                        (__v8df) __C,
3137                                                        (__mmask8) __U,
3138                                                        _MM_FROUND_CUR_DIRECTION);
3139 }
3140 
/* mask3 form of fmsubadd on packed floats with explicit rounding argument
 * R: A, B, C and mask U go straight to the vfmsubaddps512 _mask3 builtin. */
#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubaddps512_mask3( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
3146 
3147 
3148 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3149 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3150 {
3151   return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
3152                                                       (__v16sf) __B,
3153                                                       (__v16sf) __C,
3154                                                       (__mmask16) __U,
3155                                                       _MM_FROUND_CUR_DIRECTION);
3156 }
3157 
/* Masked fnmadd on packed doubles with explicit rounding argument R.
 * Expressed through the fmadd mask builtin by negating B; A and C are passed
 * unchanged and U is the mask. */
#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), -(__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
3163 
3164 
3165 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3166 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
3167 {
3168   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
3169                                                     -(__v8df) __B,
3170                                                     (__v8df) __C,
3171                                                     (__mmask8) __U,
3172                                                     _MM_FROUND_CUR_DIRECTION);
3173 }
3174 
/* Masked fnmadd on packed floats with explicit rounding argument R.
 * Expressed through the fmadd mask builtin by negating B; A and C are passed
 * unchanged and U is the mask. */
#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
3180 
3181 
3182 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3183 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3184 {
3185   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
3186                                                    -(__v16sf) __B,
3187                                                    (__v16sf) __C,
3188                                                    (__mmask16) __U,
3189                                                    _MM_FROUND_CUR_DIRECTION);
3190 }
3191 
/* fnmsub on packed doubles with explicit rounding argument R.
 *   mask  form: fmadd mask builtin with B and C both negated.
 *   mask3 form: fmsub _mask3 builtin with A negated.
 * U is the mask in both. */
#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), -(__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
3204 
3205 
3206 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3207 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
3208 {
3209   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
3210                                                     -(__v8df) __B,
3211                                                     -(__v8df) __C,
3212                                                     (__mmask8) __U,
3213                                                     _MM_FROUND_CUR_DIRECTION);
3214 }
3215 
3216 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3217 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3218 {
3219   return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
3220                                                      (__v8df) __B,
3221                                                      (__v8df) __C,
3222                                                      (__mmask8) __U,
3223                                                      _MM_FROUND_CUR_DIRECTION);
3224 }
3225 
/* fnmsub on packed floats with explicit rounding argument R.
 *   mask  form: fmadd mask builtin with B and C both negated.
 *   mask3 form: fmsub _mask3 builtin with A negated.
 * U is the mask in both. */
#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
3238 
3239 
3240 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3241 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3242 {
3243   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
3244                                                    -(__v16sf) __B,
3245                                                    -(__v16sf) __C,
3246                                                    (__mmask16) __U,
3247                                                    _MM_FROUND_CUR_DIRECTION);
3248 }
3249 
3250 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3251 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3252 {
3253   return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
3254                                                     (__v16sf) __B,
3255                                                     (__v16sf) __C,
3256                                                     (__mmask16) __U,
3257                                                     _MM_FROUND_CUR_DIRECTION);
3258 }
3259 
3260 
3261 
3262 /* Vector permutations */
3263 
3264 static __inline __m512i __DEFAULT_FN_ATTRS512
3265 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
3266 {
3267   return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
3268                                                 (__v16si) __B);
3269 }
3270 
3271 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3272 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
3273                                __m512i __B)
3274 {
3275   return (__m512i)__builtin_ia32_selectd_512(__U,
3276                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3277                               (__v16si)__A);
3278 }
3279 
3280 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3281 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
3282                                 __m512i __B)
3283 {
3284   return (__m512i)__builtin_ia32_selectd_512(__U,
3285                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3286                               (__v16si)__I);
3287 }
3288 
3289 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3290 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
3291                                 __m512i __B)
3292 {
3293   return (__m512i)__builtin_ia32_selectd_512(__U,
3294                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3295                               (__v16si)_mm512_setzero_si512());
3296 }
3297 
3298 static __inline __m512i __DEFAULT_FN_ATTRS512
3299 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
3300 {
3301   return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
3302                                                 (__v8di) __B);
3303 }
3304 
3305 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3306 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
3307                                __m512i __B)
3308 {
3309   return (__m512i)__builtin_ia32_selectq_512(__U,
3310                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3311                                (__v8di)__A);
3312 }
3313 
3314 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3315 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
3316                                 __m512i __B)
3317 {
3318   return (__m512i)__builtin_ia32_selectq_512(__U,
3319                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3320                                (__v8di)__I);
3321 }
3322 
3323 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3324 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
3325                                 __m512i __B)
3326 {
3327   return (__m512i)__builtin_ia32_selectq_512(__U,
3328                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3329                                (__v8di)_mm512_setzero_si512());
3330 }
3331 
/* alignr on 64-bit elements.  The base macro passes A, B and the immediate
 * to the alignq512 builtin.  The mask/maskz forms run the base macro and
 * then select, under the 8-bit mask U, between that result and W (mask) or
 * an all-zero vector (maskz). */
#define _mm512_alignr_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_alignq512( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
      (__v8di)(__m512i)(W)))

#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
      (__v8di)_mm512_setzero_si512()))
3345 
/* alignr on 32-bit elements.  The base macro passes A, B and the immediate
 * to the alignd512 builtin.  The mask/maskz forms run the base macro and
 * then select, under the 16-bit mask U, between that result and W (mask) or
 * an all-zero vector (maskz). */
#define _mm512_alignr_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_alignd512( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
      (__v16si)(__m512i)(W)))

#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
      (__v16si)_mm512_setzero_si512()))
3359 /* Vector Extract */
3360 
/* Extract a 256-bit group of doubles from a 512-bit vector, selected by the
 * immediate.  All three forms call the extractf64x4 mask builtin and differ
 * only in the third and fourth arguments:
 *   plain: _mm256_undefined_pd() with an all-ones mask
 *   mask:  W with mask U
 *   maskz: _mm256_setzero_pd() with mask U */
#define _mm512_extractf64x4_pd(A, I) \
  ((__m256d)__builtin_ia32_extractf64x4_mask( \
      (__v8df)(__m512d)(A), (int)(I), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
3375 
/* Extract a 128-bit group of floats from a 512-bit vector, selected by the
 * immediate.  All three forms call the extractf32x4 mask builtin and differ
 * only in the third and fourth arguments:
 *   plain: _mm_undefined_ps() with an all-ones mask
 *   mask:  W with mask U
 *   maskz: _mm_setzero_ps() with mask U */
#define _mm512_extractf32x4_ps(A, I) \
  ((__m128)__builtin_ia32_extractf32x4_mask( \
      (__v16sf)(__m512)(A), (int)(I), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
3390 
3391 /* Vector Blend */
3392 
3393 static __inline __m512d __DEFAULT_FN_ATTRS512
3394 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
3395 {
3396   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
3397                  (__v8df) __W,
3398                  (__v8df) __A);
3399 }
3400 
3401 static __inline __m512 __DEFAULT_FN_ATTRS512
3402 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
3403 {
3404   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
3405                 (__v16sf) __W,
3406                 (__v16sf) __A);
3407 }
3408 
3409 static __inline __m512i __DEFAULT_FN_ATTRS512
3410 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
3411 {
3412   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
3413                 (__v8di) __W,
3414                 (__v8di) __A);
3415 }
3416 
3417 static __inline __m512i __DEFAULT_FN_ATTRS512
3418 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
3419 {
3420   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
3421                 (__v16si) __W,
3422                 (__v16si) __A);
3423 }
3424 
3425 /* Compare */
3426 
/* Generic packed-float compares producing a __mmask16.
 * P is the comparison predicate and R the rounding argument; both are
 * forwarded as int to the cmpps512 mask builtin.  The unmasked form uses an
 * all-ones input mask, the masked form uses U.  The non-round variants are
 * the round variants with R fixed to _MM_FROUND_CUR_DIRECTION. */
#define _mm512_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(P), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(P), \
      (__mmask16)(U), (int)(R)))

#define _mm512_cmp_ps_mask(A, B, P) \
  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3441 
/* Named-predicate shorthands for the packed-float compares.  Each pair is
 * _mm512_cmp_ps_mask / _mm512_mask_cmp_ps_mask with the predicate fixed:
 *   eq -> _CMP_EQ_OQ     lt  -> _CMP_LT_OS    le  -> _CMP_LE_OS
 *   unord -> _CMP_UNORD_Q
 *   neq -> _CMP_NEQ_UQ   nlt -> _CMP_NLT_US   nle -> _CMP_NLE_US
 *   ord -> _CMP_ORD_Q */
#define _mm512_cmpeq_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
3481 
/* Generic packed-double compares producing a __mmask8.
 * P is the comparison predicate and R the rounding argument; both are
 * forwarded as int to the cmppd512 mask builtin.  The unmasked form uses an
 * all-ones input mask, the masked form uses U.  The non-round variants are
 * the round variants with R fixed to _MM_FROUND_CUR_DIRECTION. */
#define _mm512_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(P), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(P), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_pd_mask(A, B, P) \
  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3496 
/* Named-predicate shorthands for the packed-double compares.  Each pair is
 * _mm512_cmp_pd_mask / _mm512_mask_cmp_pd_mask with the predicate fixed:
 *   eq -> _CMP_EQ_OQ     lt  -> _CMP_LT_OS    le  -> _CMP_LE_OS
 *   unord -> _CMP_UNORD_Q
 *   neq -> _CMP_NEQ_UQ   nlt -> _CMP_NLT_US   nle -> _CMP_NLE_US
 *   ord -> _CMP_ORD_Q */
#define _mm512_cmpeq_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
3536 
3537 /* Conversion */
3538 
/* Wrappers around __builtin_ia32_cvttps2udq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16sf and the result to __m512i. */

/* Unmasked: all-ones mask, pass-through from _mm512_undefined_epi32(). */
#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: an all-zero pass-through vector, U the mask. */
#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(U), (int)(R)))
3553 
3554 
3555 static __inline __m512i __DEFAULT_FN_ATTRS512
3556 _mm512_cvttps_epu32(__m512 __A)
3557 {
3558   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3559                   (__v16si)
3560                   _mm512_setzero_si512 (),
3561                   (__mmask16) -1,
3562                   _MM_FROUND_CUR_DIRECTION);
3563 }
3564 
3565 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3566 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
3567 {
3568   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3569                    (__v16si) __W,
3570                    (__mmask16) __U,
3571                    _MM_FROUND_CUR_DIRECTION);
3572 }
3573 
3574 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3575 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
3576 {
3577   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3578                    (__v16si) _mm512_setzero_si512 (),
3579                    (__mmask16) __U,
3580                    _MM_FROUND_CUR_DIRECTION);
3581 }
3582 
/* Wrappers around __builtin_ia32_cvtdq2ps512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16si and the result to __m512. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \
      (int)(R)))
3597 
/* Wrappers around __builtin_ia32_cvtudq2ps512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16si and the result to __m512. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \
      (int)(R)))
3612 
3613 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3614 _mm512_cvtepu32_ps (__m512i __A)
3615 {
3616   return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
3617 }
3618 
3619 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3620 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3621 {
3622   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3623                                              (__v16sf)_mm512_cvtepu32_ps(__A),
3624                                              (__v16sf)__W);
3625 }
3626 
3627 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3628 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
3629 {
3630   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3631                                              (__v16sf)_mm512_cvtepu32_ps(__A),
3632                                              (__v16sf)_mm512_setzero_ps());
3633 }
3634 
3635 static __inline __m512d __DEFAULT_FN_ATTRS512
3636 _mm512_cvtepi32_pd(__m256i __A)
3637 {
3638   return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
3639 }
3640 
3641 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3642 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3643 {
3644   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3645                                               (__v8df)_mm512_cvtepi32_pd(__A),
3646                                               (__v8df)__W);
3647 }
3648 
3649 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3650 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
3651 {
3652   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3653                                               (__v8df)_mm512_cvtepi32_pd(__A),
3654                                               (__v8df)_mm512_setzero_pd());
3655 }
3656 
3657 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3658 _mm512_cvtepi32lo_pd(__m512i __A)
3659 {
3660   return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
3661 }
3662 
3663 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3664 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
3665 {
3666   return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
3667 }
3668 
3669 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3670 _mm512_cvtepi32_ps (__m512i __A)
3671 {
3672   return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
3673 }
3674 
3675 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3676 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3677 {
3678   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3679                                              (__v16sf)_mm512_cvtepi32_ps(__A),
3680                                              (__v16sf)__W);
3681 }
3682 
3683 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3684 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
3685 {
3686   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3687                                              (__v16sf)_mm512_cvtepi32_ps(__A),
3688                                              (__v16sf)_mm512_setzero_ps());
3689 }
3690 
3691 static __inline __m512d __DEFAULT_FN_ATTRS512
3692 _mm512_cvtepu32_pd(__m256i __A)
3693 {
3694   return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
3695 }
3696 
3697 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3698 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3699 {
3700   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3701                                               (__v8df)_mm512_cvtepu32_pd(__A),
3702                                               (__v8df)__W);
3703 }
3704 
3705 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3706 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
3707 {
3708   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3709                                               (__v8df)_mm512_cvtepu32_pd(__A),
3710                                               (__v8df)_mm512_setzero_pd());
3711 }
3712 
3713 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3714 _mm512_cvtepu32lo_pd(__m512i __A)
3715 {
3716   return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
3717 }
3718 
3719 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3720 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
3721 {
3722   return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
3723 }
3724 
/* Wrappers around __builtin_ia32_cvtpd2ps512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v8df and the result to __m256. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundpd_ps(A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \
      (int)(R)))
3739 
3740 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3741 _mm512_cvtpd_ps (__m512d __A)
3742 {
3743   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3744                 (__v8sf) _mm256_undefined_ps (),
3745                 (__mmask8) -1,
3746                 _MM_FROUND_CUR_DIRECTION);
3747 }
3748 
3749 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3750 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
3751 {
3752   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3753                 (__v8sf) __W,
3754                 (__mmask8) __U,
3755                 _MM_FROUND_CUR_DIRECTION);
3756 }
3757 
3758 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3759 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
3760 {
3761   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3762                 (__v8sf) _mm256_setzero_ps (),
3763                 (__mmask8) __U,
3764                 _MM_FROUND_CUR_DIRECTION);
3765 }
3766 
3767 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3768 _mm512_cvtpd_pslo (__m512d __A)
3769 {
3770   return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
3771                 (__v8sf) _mm256_setzero_ps (),
3772                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3773 }
3774 
3775 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3776 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
3777 {
3778   return (__m512) __builtin_shufflevector (
3779                 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
3780                                                __U, __A),
3781                 (__v8sf) _mm256_setzero_ps (),
3782                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3783 }
3784 
/* Wrappers around __builtin_ia32_vcvtps2ph512_mask.  A is cast to __v16sf,
 * I is forwarded as the builtin's (int) immediate operand and the result is
 * cast to __m256i.
 *
 * The masked forms take the pass-through vector first and the mask second
 * (W, U), matching the parameter naming of the other masked macros in this
 * file; the argument positions callers use are unchanged. */

/* Unmasked: all-ones mask, pass-through from _mm256_undefined_si256(). */
#define _mm512_cvt_roundps_ph(A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_undefined_si256(), \
                                             (__mmask16)-1))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundps_ph(W, U, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(W), \
                                             (__mmask16)(U)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundps_ph(U, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
                                             (__mmask16)(U)))

/* The non-"round" names are plain aliases of the macros above. */
#define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
3803 
/* Wrappers around __builtin_ia32_vcvtph2ps512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16hi and the result to __m512. */

/* Unmasked: all-ones mask, pass-through from _mm512_undefined_ps(). */
#define _mm512_cvt_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)(__m512)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \
      (int)(R)))
3818 
3819 
3820 static  __inline __m512 __DEFAULT_FN_ATTRS512
3821 _mm512_cvtph_ps(__m256i __A)
3822 {
3823   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3824                 (__v16sf)
3825                 _mm512_setzero_ps (),
3826                 (__mmask16) -1,
3827                 _MM_FROUND_CUR_DIRECTION);
3828 }
3829 
3830 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3831 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
3832 {
3833   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3834                  (__v16sf) __W,
3835                  (__mmask16) __U,
3836                  _MM_FROUND_CUR_DIRECTION);
3837 }
3838 
3839 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3840 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
3841 {
3842   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3843                  (__v16sf) _mm512_setzero_ps (),
3844                  (__mmask16) __U,
3845                  _MM_FROUND_CUR_DIRECTION);
3846 }
3847 
/* Wrappers around __builtin_ia32_cvttpd2dq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v8df and the result to __m256i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvtt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
3862 
3863 static __inline __m256i __DEFAULT_FN_ATTRS512
3864 _mm512_cvttpd_epi32(__m512d __a)
3865 {
3866   return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
3867                                                    (__v8si)_mm256_setzero_si256(),
3868                                                    (__mmask8) -1,
3869                                                     _MM_FROUND_CUR_DIRECTION);
3870 }
3871 
3872 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3873 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
3874 {
3875   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3876                   (__v8si) __W,
3877                   (__mmask8) __U,
3878                   _MM_FROUND_CUR_DIRECTION);
3879 }
3880 
3881 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3882 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
3883 {
3884   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3885                   (__v8si) _mm256_setzero_si256 (),
3886                   (__mmask8) __U,
3887                   _MM_FROUND_CUR_DIRECTION);
3888 }
3889 
/* Wrappers around __builtin_ia32_cvttps2dq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16sf and the result to __m512i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvtt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)(U), \
      (int)(R)))
3904 
3905 static __inline __m512i __DEFAULT_FN_ATTRS512
3906 _mm512_cvttps_epi32(__m512 __a)
3907 {
3908   return (__m512i)
3909     __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
3910                                      (__v16si) _mm512_setzero_si512 (),
3911                                      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
3912 }
3913 
3914 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3915 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
3916 {
3917   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3918                   (__v16si) __W,
3919                   (__mmask16) __U,
3920                   _MM_FROUND_CUR_DIRECTION);
3921 }
3922 
3923 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3924 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
3925 {
3926   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3927                   (__v16si) _mm512_setzero_si512 (),
3928                   (__mmask16) __U,
3929                   _MM_FROUND_CUR_DIRECTION);
3930 }
3931 
/* Wrappers around __builtin_ia32_cvtps2dq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16sf and the result to __m512i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)(U), \
      (int)(R)))
3946 
3947 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3948 _mm512_cvtps_epi32 (__m512 __A)
3949 {
3950   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3951                  (__v16si) _mm512_undefined_epi32 (),
3952                  (__mmask16) -1,
3953                  _MM_FROUND_CUR_DIRECTION);
3954 }
3955 
3956 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3957 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
3958 {
3959   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3960                  (__v16si) __W,
3961                  (__mmask16) __U,
3962                  _MM_FROUND_CUR_DIRECTION);
3963 }
3964 
3965 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3966 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
3967 {
3968   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3969                  (__v16si)
3970                  _mm512_setzero_si512 (),
3971                  (__mmask16) __U,
3972                  _MM_FROUND_CUR_DIRECTION);
3973 }
3974 
/* Wrappers around __builtin_ia32_cvtpd2dq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v8df and the result to __m256i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
3989 
3990 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3991 _mm512_cvtpd_epi32 (__m512d __A)
3992 {
3993   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3994                  (__v8si)
3995                  _mm256_undefined_si256 (),
3996                  (__mmask8) -1,
3997                  _MM_FROUND_CUR_DIRECTION);
3998 }
3999 
4000 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4001 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
4002 {
4003   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
4004                  (__v8si) __W,
4005                  (__mmask8) __U,
4006                  _MM_FROUND_CUR_DIRECTION);
4007 }
4008 
4009 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4010 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
4011 {
4012   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
4013                  (__v8si)
4014                  _mm256_setzero_si256 (),
4015                  (__mmask8) __U,
4016                  _MM_FROUND_CUR_DIRECTION);
4017 }
4018 
/* Wrappers around __builtin_ia32_cvtps2udq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v16sf and the result to __m512i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
      (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)(U), \
      (int)(R)))
4033 
4034 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4035 _mm512_cvtps_epu32 ( __m512 __A)
4036 {
4037   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
4038                   (__v16si)\
4039                   _mm512_undefined_epi32 (),
4040                   (__mmask16) -1,\
4041                   _MM_FROUND_CUR_DIRECTION);
4042 }
4043 
4044 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4045 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
4046 {
4047   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4048                   (__v16si) __W,
4049                   (__mmask16) __U,
4050                   _MM_FROUND_CUR_DIRECTION);
4051 }
4052 
4053 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4054 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
4055 {
4056   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4057                   (__v16si)
4058                   _mm512_setzero_si512 (),
4059                   (__mmask16) __U ,
4060                   _MM_FROUND_CUR_DIRECTION);
4061 }
4062 
/* Wrappers around __builtin_ia32_cvtpd2udq512_mask with an explicit R
 * argument, forwarded as the builtin's last (int) operand.  A is cast to
 * __v8df and the result to __m256i. */

/* Unmasked: all-ones mask, zeroed pass-through vector. */
#define _mm512_cvt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

/* Masked: W is the pass-through vector, U the mask. */
#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked: zeroed pass-through vector, U the mask. */
#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
4077 
4078 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4079 _mm512_cvtpd_epu32 (__m512d __A)
4080 {
4081   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4082                   (__v8si)
4083                   _mm256_undefined_si256 (),
4084                   (__mmask8) -1,
4085                   _MM_FROUND_CUR_DIRECTION);
4086 }
4087 
4088 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4089 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
4090 {
4091   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4092                   (__v8si) __W,
4093                   (__mmask8) __U,
4094                   _MM_FROUND_CUR_DIRECTION);
4095 }
4096 
4097 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4098 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
4099 {
4100   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4101                   (__v8si)
4102                   _mm256_setzero_si256 (),
4103                   (__mmask8) __U,
4104                   _MM_FROUND_CUR_DIRECTION);
4105 }
4106 
4107 static __inline__ double __DEFAULT_FN_ATTRS512
4108 _mm512_cvtsd_f64(__m512d __a)
4109 {
4110   return __a[0];
4111 }
4112 
4113 static __inline__ float __DEFAULT_FN_ATTRS512
4114 _mm512_cvtss_f32(__m512 __a)
4115 {
4116   return __a[0];
4117 }
4118 
4119 /* Unpack and Interleave */
4120 
4121 static __inline __m512d __DEFAULT_FN_ATTRS512
4122 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
4123 {
4124   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4125                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4126 }
4127 
4128 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4129 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
4130 {
4131   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4132                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
4133                                            (__v8df)__W);
4134 }
4135 
4136 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4137 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
4138 {
4139   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4140                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
4141                                            (__v8df)_mm512_setzero_pd());
4142 }
4143 
4144 static __inline __m512d __DEFAULT_FN_ATTRS512
4145 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
4146 {
4147   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4148                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4149 }
4150 
4151 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4152 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
4153 {
4154   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4155                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
4156                                            (__v8df)__W);
4157 }
4158 
4159 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4160 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
4161 {
4162   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4163                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
4164                                            (__v8df)_mm512_setzero_pd());
4165 }
4166 
4167 static __inline __m512 __DEFAULT_FN_ATTRS512
4168 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
4169 {
4170   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4171                                          2,    18,    3,    19,
4172                                          2+4,  18+4,  3+4,  19+4,
4173                                          2+8,  18+8,  3+8,  19+8,
4174                                          2+12, 18+12, 3+12, 19+12);
4175 }
4176 
4177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4178 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
4179 {
4180   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4181                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
4182                                           (__v16sf)__W);
4183 }
4184 
4185 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4186 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
4187 {
4188   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4189                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
4190                                           (__v16sf)_mm512_setzero_ps());
4191 }
4192 
4193 static __inline __m512 __DEFAULT_FN_ATTRS512
4194 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
4195 {
4196   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4197                                          0,    16,    1,    17,
4198                                          0+4,  16+4,  1+4,  17+4,
4199                                          0+8,  16+8,  1+8,  17+8,
4200                                          0+12, 16+12, 1+12, 17+12);
4201 }
4202 
4203 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4204 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
4205 {
4206   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4207                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
4208                                           (__v16sf)__W);
4209 }
4210 
4211 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4212 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
4213 {
4214   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4215                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
4216                                           (__v16sf)_mm512_setzero_ps());
4217 }
4218 
4219 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4220 _mm512_unpackhi_epi32(__m512i __A, __m512i __B)
4221 {
4222   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4223                                           2,    18,    3,    19,
4224                                           2+4,  18+4,  3+4,  19+4,
4225                                           2+8,  18+8,  3+8,  19+8,
4226                                           2+12, 18+12, 3+12, 19+12);
4227 }
4228 
4229 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4230 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4231 {
4232   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4233                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
4234                                        (__v16si)__W);
4235 }
4236 
/* Zero-masking form of _mm512_unpackhi_epi32: hands the unpack result of
 * (__A, __B) and an all-zero vector (_mm512_setzero_si512()) to
 * __builtin_ia32_selectd_512 under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4237 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4238 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
4239 {
4240   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4241                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
4242                                        (__v16si)_mm512_setzero_si512());
4243 }
4244 
/* Interleaves the lower two 32-bit elements of every four-element group of
 * __A and __B.  In the __builtin_shufflevector index list, 0-15 address
 * elements of __A and 16-31 elements of __B; each row (group offset +0, +4,
 * +8, +12) emits A[0], B[0], A[1], B[1] of that group. */
4245 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4246 _mm512_unpacklo_epi32(__m512i __A, __m512i __B)
4247 {
4248   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4249                                           0,    16,    1,    17,
4250                                           0+4,  16+4,  1+4,  17+4,
4251                                           0+8,  16+8,  1+8,  17+8,
4252                                           0+12, 16+12, 1+12, 17+12);
4253 }
4254 
/* Merge-masking form of _mm512_unpacklo_epi32: hands the unpack result of
 * (__A, __B) and the pass-through vector __W to __builtin_ia32_selectd_512
 * under the 16-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the computed element and a clear bit keeps __W's -- confirm. */
4255 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4256 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4257 {
4258   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4259                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
4260                                        (__v16si)__W);
4261 }
4262 
/* Zero-masking form of _mm512_unpacklo_epi32: hands the unpack result of
 * (__A, __B) and an all-zero vector (_mm512_setzero_si512()) to
 * __builtin_ia32_selectd_512 under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4263 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4264 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
4265 {
4266   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4267                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
4268                                        (__v16si)_mm512_setzero_si512());
4269 }
4270 
/* Interleaves the odd-indexed 64-bit elements of __A and __B.  Shuffle
 * indices 0-7 address __A and 8-15 address __B, so the result is
 * A[1], B[1], A[3], B[3], A[5], B[5], A[7], B[7]. */
4271 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4272 _mm512_unpackhi_epi64(__m512i __A, __m512i __B)
4273 {
4274   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4275                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4276 }
4277 
/* Merge-masking form of _mm512_unpackhi_epi64: hands the unpack result of
 * (__A, __B) and the pass-through vector __W to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the computed element and a clear bit keeps __W's -- confirm. */
4278 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4279 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4280 {
4281   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4282                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
4283                                         (__v8di)__W);
4284 }
4285 
/* Zero-masking form of _mm512_unpackhi_epi64: hands the unpack result of
 * (__A, __B) and an all-zero vector (_mm512_setzero_si512()) to
 * __builtin_ia32_selectq_512 under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4286 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4287 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
4288 {
4289   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4290                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
4291                                         (__v8di)_mm512_setzero_si512());
4292 }
4293 
/* Interleaves the even-indexed 64-bit elements of __A and __B.  Shuffle
 * indices 0-7 address __A and 8-15 address __B, so the result is
 * A[0], B[0], A[2], B[2], A[4], B[4], A[6], B[6]. */
4294 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4295 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
4296 {
4297   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4298                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4299 }
4300 
/* Merge-masking form of _mm512_unpacklo_epi64: hands the unpack result of
 * (__A, __B) and the pass-through vector __W to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the computed element and a clear bit keeps __W's -- confirm. */
4301 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4302 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4303 {
4304   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4305                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
4306                                         (__v8di)__W);
4307 }
4308 
/* Zero-masking form of _mm512_unpacklo_epi64: hands the unpack result of
 * (__A, __B) and an all-zero vector (_mm512_setzero_si512()) to
 * __builtin_ia32_selectq_512 under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4309 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4310 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
4311 {
4312   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4313                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
4314                                         (__v8di)_mm512_setzero_si512());
4315 }
4316 
4317 
4318 /* SIMD load ops */
4319 
/* Unaligned 512-bit integer load from __P.  The pointer is reinterpreted as a
 * packed, may_alias struct whose only member has type __m512i_u (the 64-byte
 * vector typedef declared with 1-byte alignment), so the read assumes no
 * particular alignment of __P and is marked as allowed to alias other types. */
4320 static __inline __m512i __DEFAULT_FN_ATTRS512
4321 _mm512_loadu_si512 (void const *__P)
4322 {
4323   struct __loadu_si512 {
4324     __m512i_u __v;
4325   } __attribute__((__packed__, __may_alias__));
4326   return ((const struct __loadu_si512*)__P)->__v;
4327 }
4328 
/* Unaligned 512-bit integer load from __P, named for 32-bit element data.
 * Reads through a packed, may_alias struct wrapping the 1-byte-aligned
 * __m512i_u type, so no alignment of __P is assumed; the element width does
 * not affect the load itself. */
4329 static __inline __m512i __DEFAULT_FN_ATTRS512
4330 _mm512_loadu_epi32 (void const *__P)
4331 {
4332   struct __loadu_epi32 {
4333     __m512i_u __v;
4334   } __attribute__((__packed__, __may_alias__));
4335   return ((const struct __loadu_epi32*)__P)->__v;
4336 }
4337 
/* Merge-masked unaligned load of 32-bit elements: forwards __P (as
 * const int *), the pass-through vector __W and the 16-bit mask __U to
 * __builtin_ia32_loaddqusi512_mask.
 * NOTE(review): the builtin is opaque here; presumably elements with a clear
 * mask bit are taken from __W instead of memory -- confirm. */
4338 static __inline __m512i __DEFAULT_FN_ATTRS512
4339 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
4340 {
4341   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
4342                   (__v16si) __W,
4343                   (__mmask16) __U);
4344 }
4345 
4346 
/* Zero-masked unaligned load of 32-bit elements: same builtin as the
 * merge-masked load, but the pass-through operand is an all-zero vector
 * (_mm512_setzero_si512()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4347 static __inline __m512i __DEFAULT_FN_ATTRS512
4348 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
4349 {
4350   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
4351                                                      (__v16si)
4352                                                      _mm512_setzero_si512 (),
4353                                                      (__mmask16) __U);
4354 }
4355 
/* Unaligned 512-bit integer load from __P, named for 64-bit element data.
 * Reads through a packed, may_alias struct wrapping the 1-byte-aligned
 * __m512i_u type, so no alignment of __P is assumed; the element width does
 * not affect the load itself. */
4356 static __inline __m512i __DEFAULT_FN_ATTRS512
4357 _mm512_loadu_epi64 (void const *__P)
4358 {
4359   struct __loadu_epi64 {
4360     __m512i_u __v;
4361   } __attribute__((__packed__, __may_alias__));
4362   return ((const struct __loadu_epi64*)__P)->__v;
4363 }
4364 
/* Merge-masked unaligned load of 64-bit elements: forwards __P (as
 * const long long *), the pass-through vector __W and the 8-bit mask __U to
 * __builtin_ia32_loaddqudi512_mask.
 * NOTE(review): the builtin is opaque here; presumably elements with a clear
 * mask bit are taken from __W instead of memory -- confirm. */
4365 static __inline __m512i __DEFAULT_FN_ATTRS512
4366 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
4367 {
4368   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
4369                   (__v8di) __W,
4370                   (__mmask8) __U);
4371 }
4372 
/* Zero-masked unaligned load of 64-bit elements: same builtin as the
 * merge-masked load, but the pass-through operand is an all-zero vector
 * (_mm512_setzero_si512()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4373 static __inline __m512i __DEFAULT_FN_ATTRS512
4374 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
4375 {
4376   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
4377                                                      (__v8di)
4378                                                      _mm512_setzero_si512 (),
4379                                                      (__mmask8) __U);
4380 }
4381 
/* Merge-masked unaligned load of single-precision elements: forwards __P (as
 * const float *), the pass-through vector __W and the 16-bit mask __U to
 * __builtin_ia32_loadups512_mask.
 * NOTE(review): the builtin is opaque here; presumably elements with a clear
 * mask bit are taken from __W instead of memory -- confirm. */
4382 static __inline __m512 __DEFAULT_FN_ATTRS512
4383 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
4384 {
4385   return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
4386                    (__v16sf) __W,
4387                    (__mmask16) __U);
4388 }
4389 
/* Zero-masked unaligned load of single-precision elements: same builtin as
 * the merge-masked load, but the pass-through operand is an all-zero vector
 * (_mm512_setzero_ps()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4390 static __inline __m512 __DEFAULT_FN_ATTRS512
4391 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
4392 {
4393   return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
4394                                                   (__v16sf)
4395                                                   _mm512_setzero_ps (),
4396                                                   (__mmask16) __U);
4397 }
4398 
/* Merge-masked unaligned load of double-precision elements: forwards __P (as
 * const double *), the pass-through vector __W and the 8-bit mask __U to
 * __builtin_ia32_loadupd512_mask.
 * NOTE(review): the builtin is opaque here; presumably elements with a clear
 * mask bit are taken from __W instead of memory -- confirm. */
4399 static __inline __m512d __DEFAULT_FN_ATTRS512
4400 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
4401 {
4402   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
4403                 (__v8df) __W,
4404                 (__mmask8) __U);
4405 }
4406 
/* Zero-masked unaligned load of double-precision elements: same builtin as
 * the merge-masked load, but the pass-through operand is an all-zero vector
 * (_mm512_setzero_pd()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4407 static __inline __m512d __DEFAULT_FN_ATTRS512
4408 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
4409 {
4410   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
4411                                                    (__v8df)
4412                                                    _mm512_setzero_pd (),
4413                                                    (__mmask8) __U);
4414 }
4415 
/* Unaligned load of a 512-bit double-precision vector from __p.  Reads
 * through a packed, may_alias struct wrapping __m512d_u (the 64-byte double
 * vector typedef declared with 1-byte alignment), so no alignment of __p is
 * assumed. */
4416 static __inline __m512d __DEFAULT_FN_ATTRS512
4417 _mm512_loadu_pd(void const *__p)
4418 {
4419   struct __loadu_pd {
4420     __m512d_u __v;
4421   } __attribute__((__packed__, __may_alias__));
4422   return ((const struct __loadu_pd*)__p)->__v;
4423 }
4424 
/* Unaligned load of a 512-bit single-precision vector from __p.  Reads
 * through a packed, may_alias struct wrapping __m512_u (the 64-byte float
 * vector typedef declared with 1-byte alignment), so no alignment of __p is
 * assumed. */
4425 static __inline __m512 __DEFAULT_FN_ATTRS512
4426 _mm512_loadu_ps(void const *__p)
4427 {
4428   struct __loadu_ps {
4429     __m512_u __v;
4430   } __attribute__((__packed__, __may_alias__));
4431   return ((const struct __loadu_ps*)__p)->__v;
4432 }
4433 
/* Aligned load of a 512-bit single-precision vector.  __p is dereferenced as
 * a const __m512 *, and __m512 is declared with 64-byte alignment, so the
 * caller must supply a 64-byte-aligned address. */
4434 static __inline __m512 __DEFAULT_FN_ATTRS512
4435 _mm512_load_ps(void const *__p)
4436 {
4437   return *(const __m512*)__p;
4438 }
4439 
/* Merge-masked aligned load of single-precision elements: forwards __P (as
 * const __v16sf *), the pass-through vector __W and the 16-bit mask __U to
 * __builtin_ia32_loadaps512_mask.
 * NOTE(review): the builtin is opaque here; presumably it requires a 64-byte
 * aligned __P and takes clear-bit elements from __W -- confirm. */
4440 static __inline __m512 __DEFAULT_FN_ATTRS512
4441 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
4442 {
4443   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
4444                    (__v16sf) __W,
4445                    (__mmask16) __U);
4446 }
4447 
/* Zero-masked aligned load of single-precision elements: same builtin as the
 * merge-masked aligned load, but the pass-through operand is an all-zero
 * vector (_mm512_setzero_ps()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4448 static __inline __m512 __DEFAULT_FN_ATTRS512
4449 _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
4450 {
4451   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
4452                                                   (__v16sf)
4453                                                   _mm512_setzero_ps (),
4454                                                   (__mmask16) __U);
4455 }
4456 
/* Aligned load of a 512-bit double-precision vector.  __p is dereferenced as
 * a const __m512d *, and __m512d is declared with 64-byte alignment, so the
 * caller must supply a 64-byte-aligned address. */
4457 static __inline __m512d __DEFAULT_FN_ATTRS512
4458 _mm512_load_pd(void const *__p)
4459 {
4460   return *(const __m512d*)__p;
4461 }
4462 
/* Merge-masked aligned load of double-precision elements: forwards __P (as
 * const __v8df *), the pass-through vector __W and the 8-bit mask __U to
 * __builtin_ia32_loadapd512_mask.
 * NOTE(review): the builtin is opaque here; presumably it requires a 64-byte
 * aligned __P and takes clear-bit elements from __W -- confirm. */
4463 static __inline __m512d __DEFAULT_FN_ATTRS512
4464 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
4465 {
4466   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
4467                           (__v8df) __W,
4468                           (__mmask8) __U);
4469 }
4470 
/* Zero-masked aligned load of double-precision elements: same builtin as the
 * merge-masked aligned load, but the pass-through operand is an all-zero
 * vector (_mm512_setzero_pd()).
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4471 static __inline __m512d __DEFAULT_FN_ATTRS512
4472 _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
4473 {
4474   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
4475                                                    (__v8df)
4476                                                    _mm512_setzero_pd (),
4477                                                    (__mmask8) __U);
4478 }
4479 
/* Aligned load of a 512-bit integer vector.  __P is dereferenced as a
 * const __m512i *, and __m512i is declared with 64-byte alignment, so the
 * caller must supply a 64-byte-aligned address. */
4480 static __inline __m512i __DEFAULT_FN_ATTRS512
4481 _mm512_load_si512 (void const *__P)
4482 {
4483   return *(const __m512i *) __P;
4484 }
4485 
/* Aligned load of a 512-bit integer vector, named for 32-bit element data.
 * The body is a plain dereference of __P as const __m512i * (64-byte aligned
 * type); the element width plays no part in the load. */
4486 static __inline __m512i __DEFAULT_FN_ATTRS512
4487 _mm512_load_epi32 (void const *__P)
4488 {
4489   return *(const __m512i *) __P;
4490 }
4491 
/* Aligned load of a 512-bit integer vector, named for 64-bit element data.
 * The body is a plain dereference of __P as const __m512i * (64-byte aligned
 * type); the element width plays no part in the load. */
4492 static __inline __m512i __DEFAULT_FN_ATTRS512
4493 _mm512_load_epi64 (void const *__P)
4494 {
4495   return *(const __m512i *) __P;
4496 }
4497 
4498 /* SIMD store ops */
4499 
/* Unaligned store of the 512-bit integer vector __A to __P, named for 64-bit
 * element data.  Writes through a packed, may_alias struct wrapping the
 * 1-byte-aligned __m512i_u type, so no alignment of __P is assumed. */
4500 static __inline void __DEFAULT_FN_ATTRS512
4501 _mm512_storeu_epi64 (void *__P, __m512i __A)
4502 {
4503   struct __storeu_epi64 {
4504     __m512i_u __v;
4505   } __attribute__((__packed__, __may_alias__));
4506   ((struct __storeu_epi64*)__P)->__v = __A;
4507 }
4508 
/* Masked unaligned store of 64-bit elements: forwards __P (as long long *),
 * the source vector __A and the 8-bit mask __U to
 * __builtin_ia32_storedqudi512_mask.
 * NOTE(review): the builtin is opaque here; presumably only elements whose
 * mask bit is set are written to memory -- confirm. */
4509 static __inline void __DEFAULT_FN_ATTRS512
4510 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
4511 {
4512   __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
4513                                      (__mmask8) __U);
4514 }
4515 
/* Unaligned store of the 512-bit integer vector __A to __P.  Writes through
 * a packed, may_alias struct wrapping the 1-byte-aligned __m512i_u type, so
 * no alignment of __P is assumed. */
4516 static __inline void __DEFAULT_FN_ATTRS512
4517 _mm512_storeu_si512 (void *__P, __m512i __A)
4518 {
4519   struct __storeu_si512 {
4520     __m512i_u __v;
4521   } __attribute__((__packed__, __may_alias__));
4522   ((struct __storeu_si512*)__P)->__v = __A;
4523 }
4524 
/* Unaligned store of the 512-bit integer vector __A to __P, named for 32-bit
 * element data.  Writes through a packed, may_alias struct wrapping the
 * 1-byte-aligned __m512i_u type, so no alignment of __P is assumed. */
4525 static __inline void __DEFAULT_FN_ATTRS512
4526 _mm512_storeu_epi32 (void *__P, __m512i __A)
4527 {
4528   struct __storeu_epi32 {
4529     __m512i_u __v;
4530   } __attribute__((__packed__, __may_alias__));
4531   ((struct __storeu_epi32*)__P)->__v = __A;
4532 }
4533 
/* Masked unaligned store of 32-bit elements: forwards __P (as int *), the
 * source vector __A and the 16-bit mask __U to
 * __builtin_ia32_storedqusi512_mask.
 * NOTE(review): the builtin is opaque here; presumably only elements whose
 * mask bit is set are written to memory -- confirm. */
4534 static __inline void __DEFAULT_FN_ATTRS512
4535 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
4536 {
4537   __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
4538                                      (__mmask16) __U);
4539 }
4540 
/* Masked unaligned store of double-precision elements: forwards __P (as
 * double *), the source vector __A and the 8-bit mask __U to
 * __builtin_ia32_storeupd512_mask.
 * NOTE(review): the builtin is opaque here; presumably only elements whose
 * mask bit is set are written to memory -- confirm. */
4541 static __inline void __DEFAULT_FN_ATTRS512
4542 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
4543 {
4544   __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
4545 }
4546 
/* Unaligned store of the 512-bit double-precision vector __A to __P.  Writes
 * through a packed, may_alias struct wrapping the 1-byte-aligned __m512d_u
 * type, so no alignment of __P is assumed. */
4547 static __inline void __DEFAULT_FN_ATTRS512
4548 _mm512_storeu_pd(void *__P, __m512d __A)
4549 {
4550   struct __storeu_pd {
4551     __m512d_u __v;
4552   } __attribute__((__packed__, __may_alias__));
4553   ((struct __storeu_pd*)__P)->__v = __A;
4554 }
4555 
/* Masked unaligned store of single-precision elements: forwards __P (as
 * float *), the source vector __A and the 16-bit mask __U to
 * __builtin_ia32_storeups512_mask.
 * NOTE(review): the builtin is opaque here; presumably only elements whose
 * mask bit is set are written to memory -- confirm. */
4556 static __inline void __DEFAULT_FN_ATTRS512
4557 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
4558 {
4559   __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
4560                                    (__mmask16) __U);
4561 }
4562 
/* Unaligned store of the 512-bit single-precision vector __A to __P.  Writes
 * through a packed, may_alias struct wrapping the 1-byte-aligned __m512_u
 * type, so no alignment of __P is assumed. */
4563 static __inline void __DEFAULT_FN_ATTRS512
4564 _mm512_storeu_ps(void *__P, __m512 __A)
4565 {
4566   struct __storeu_ps {
4567     __m512_u __v;
4568   } __attribute__((__packed__, __may_alias__));
4569   ((struct __storeu_ps*)__P)->__v = __A;
4570 }
4571 
/* Masked aligned store of double-precision elements: forwards __P (as
 * __v8df *), the source vector __A and the 8-bit mask __U to
 * __builtin_ia32_storeapd512_mask.
 * NOTE(review): the builtin is opaque here; presumably it requires a 64-byte
 * aligned __P and writes only elements whose mask bit is set -- confirm. */
4572 static __inline void __DEFAULT_FN_ATTRS512
4573 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
4574 {
4575   __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
4576 }
4577 
/* Aligned store of the 512-bit double-precision vector __A.  __P is written
 * through as a __m512d *, and __m512d is declared with 64-byte alignment, so
 * the caller must supply a 64-byte-aligned address. */
4578 static __inline void __DEFAULT_FN_ATTRS512
4579 _mm512_store_pd(void *__P, __m512d __A)
4580 {
4581   *(__m512d*)__P = __A;
4582 }
4583 
/* Masked aligned store of single-precision elements: forwards __P (as
 * __v16sf *), the source vector __A and the 16-bit mask __U to
 * __builtin_ia32_storeaps512_mask.
 * NOTE(review): the builtin is opaque here; presumably it requires a 64-byte
 * aligned __P and writes only elements whose mask bit is set -- confirm. */
4584 static __inline void __DEFAULT_FN_ATTRS512
4585 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
4586 {
4587   __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
4588                                    (__mmask16) __U);
4589 }
4590 
/* Aligned store of the 512-bit single-precision vector __A.  __P is written
 * through as a __m512 *, and __m512 is declared with 64-byte alignment, so
 * the caller must supply a 64-byte-aligned address. */
4591 static __inline void __DEFAULT_FN_ATTRS512
4592 _mm512_store_ps(void *__P, __m512 __A)
4593 {
4594   *(__m512*)__P = __A;
4595 }
4596 
/* Aligned store of the 512-bit integer vector __A.  __P is written through
 * as a __m512i *, and __m512i is declared with 64-byte alignment, so the
 * caller must supply a 64-byte-aligned address. */
4597 static __inline void __DEFAULT_FN_ATTRS512
4598 _mm512_store_si512 (void *__P, __m512i __A)
4599 {
4600   *(__m512i *) __P = __A;
4601 }
4602 
/* Aligned store of the 512-bit integer vector __A, named for 32-bit element
 * data.  The body is a plain write through __P as __m512i * (64-byte aligned
 * type); the element width plays no part in the store. */
4603 static __inline void __DEFAULT_FN_ATTRS512
4604 _mm512_store_epi32 (void *__P, __m512i __A)
4605 {
4606   *(__m512i *) __P = __A;
4607 }
4608 
/* Aligned store of the 512-bit integer vector __A, named for 64-bit element
 * data.  The body is a plain write through __P as __m512i * (64-byte aligned
 * type); the element width plays no part in the store. */
4609 static __inline void __DEFAULT_FN_ATTRS512
4610 _mm512_store_epi64 (void *__P, __m512i __A)
4611 {
4612   *(__m512i *) __P = __A;
4613 }
4614 
4615 /* Mask ops */
4616 
/* Returns __builtin_ia32_knothi(__M) for a 16-bit mask.  Operates on a mask
 * value only, and accordingly uses __DEFAULT_FN_ATTRS rather than the
 * 512-bit-vector attribute set used by the functions around it.
 * NOTE(review): presumably the bitwise complement of __M (KNOTW) -- the
 * builtin is opaque here. */
4617 static __inline __mmask16 __DEFAULT_FN_ATTRS
4618 _mm512_knot(__mmask16 __M)
4619 {
4620   return __builtin_ia32_knothi(__M);
4621 }
4622 
4623 /* Integer compare */
4624 
/* Named comparisons on 32-bit elements (epi32 family).  Each macro forwards
 * to _mm512_cmp_epi32_mask with a fixed _MM_CMPINT_* predicate (EQ, GE, GT,
 * LE, LT, NE); the _mask_ forms forward to _mm512_mask_cmp_epi32_mask and
 * pass the extra mask operand k through as the first argument. */
4625 #define _mm512_cmpeq_epi32_mask(A, B) \
4626     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
4627 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
4628     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
4629 #define _mm512_cmpge_epi32_mask(A, B) \
4630     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
4631 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \
4632     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
4633 #define _mm512_cmpgt_epi32_mask(A, B) \
4634     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
4635 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
4636     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
4637 #define _mm512_cmple_epi32_mask(A, B) \
4638     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
4639 #define _mm512_mask_cmple_epi32_mask(k, A, B) \
4640     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
4641 #define _mm512_cmplt_epi32_mask(A, B) \
4642     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
4643 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \
4644     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
4645 #define _mm512_cmpneq_epi32_mask(A, B) \
4646     _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
4647 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
4648     _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
4649 
/* Named comparisons on 32-bit elements (epu32 family).  Each macro forwards
 * to _mm512_cmp_epu32_mask with a fixed _MM_CMPINT_* predicate (EQ, GE, GT,
 * LE, LT, NE); the _mask_ forms forward to _mm512_mask_cmp_epu32_mask and
 * pass the extra mask operand k through as the first argument. */
4650 #define _mm512_cmpeq_epu32_mask(A, B) \
4651     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
4652 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
4653     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
4654 #define _mm512_cmpge_epu32_mask(A, B) \
4655     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
4656 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \
4657     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
4658 #define _mm512_cmpgt_epu32_mask(A, B) \
4659     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
4660 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
4661     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
4662 #define _mm512_cmple_epu32_mask(A, B) \
4663     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
4664 #define _mm512_mask_cmple_epu32_mask(k, A, B) \
4665     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
4666 #define _mm512_cmplt_epu32_mask(A, B) \
4667     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
4668 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \
4669     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
4670 #define _mm512_cmpneq_epu32_mask(A, B) \
4671     _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
4672 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
4673     _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
4674 
/* Named comparisons on 64-bit elements (epi64 family).  Each macro forwards
 * to _mm512_cmp_epi64_mask with a fixed _MM_CMPINT_* predicate (EQ, GE, GT,
 * LE, LT, NE); the _mask_ forms forward to _mm512_mask_cmp_epi64_mask and
 * pass the extra mask operand k through as the first argument. */
4675 #define _mm512_cmpeq_epi64_mask(A, B) \
4676     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
4677 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
4678     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
4679 #define _mm512_cmpge_epi64_mask(A, B) \
4680     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
4681 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \
4682     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
4683 #define _mm512_cmpgt_epi64_mask(A, B) \
4684     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
4685 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
4686     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
4687 #define _mm512_cmple_epi64_mask(A, B) \
4688     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
4689 #define _mm512_mask_cmple_epi64_mask(k, A, B) \
4690     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
4691 #define _mm512_cmplt_epi64_mask(A, B) \
4692     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
4693 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \
4694     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
4695 #define _mm512_cmpneq_epi64_mask(A, B) \
4696     _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
4697 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
4698     _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
4699 
/* Named comparisons on 64-bit elements (epu64 family).  Each macro forwards
 * to _mm512_cmp_epu64_mask with a fixed _MM_CMPINT_* predicate (EQ, GE, GT,
 * LE, LT, NE); the _mask_ forms forward to _mm512_mask_cmp_epu64_mask and
 * pass the extra mask operand k through as the first argument. */
4700 #define _mm512_cmpeq_epu64_mask(A, B) \
4701     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
4702 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
4703     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
4704 #define _mm512_cmpge_epu64_mask(A, B) \
4705     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
4706 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \
4707     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
4708 #define _mm512_cmpgt_epu64_mask(A, B) \
4709     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
4710 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
4711     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
4712 #define _mm512_cmple_epu64_mask(A, B) \
4713     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
4714 #define _mm512_mask_cmple_epu64_mask(k, A, B) \
4715     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
4716 #define _mm512_cmplt_epu64_mask(A, B) \
4717     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
4718 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \
4719     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
4720 #define _mm512_cmpneq_epu64_mask(A, B) \
4721     _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
4722 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
4723     _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
4724 
/* Widens the sixteen 8-bit elements of __A to sixteen 32-bit elements with
 * sign extension, via an element-wise __builtin_convertvector from the
 * explicitly signed __v16qs view to __v16si. */
4725 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4726 _mm512_cvtepi8_epi32(__m128i __A)
4727 {
4728   /* This function always performs a signed extension, but __v16qi is a char
4729      which may be signed or unsigned, so use __v16qs. */
4730   return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
4731 }
4732 
/* Merge-masking form of _mm512_cvtepi8_epi32: hands the converted vector and
 * the pass-through vector __W to __builtin_ia32_selectd_512 under the 16-bit
 * mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4734 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
4735 {
4736   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4737                                              (__v16si)_mm512_cvtepi8_epi32(__A),
4738                                              (__v16si)__W);
4739 }
4740 
/* Zero-masking form of _mm512_cvtepi8_epi32: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectd_512
 * under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4741 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4742 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
4743 {
4744   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4745                                              (__v16si)_mm512_cvtepi8_epi32(__A),
4746                                              (__v16si)_mm512_setzero_si512());
4747 }
4748 
/* Widens the low eight 8-bit elements of __A to eight 64-bit elements with
 * sign extension.  The inner __builtin_shufflevector (indices 0-7) extracts
 * the first eight bytes of the signed __v16qs view; the outer
 * __builtin_convertvector then converts them element-wise to __v8di.  The
 * upper eight bytes of __A are not used. */
4749 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4750 _mm512_cvtepi8_epi64(__m128i __A)
4751 {
4752   /* This function always performs a signed extension, but __v16qi is a char
4753      which may be signed or unsigned, so use __v16qs. */
4754   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4755 }
4756 
/* Merge-masking form of _mm512_cvtepi8_epi64: hands the converted vector and
 * the pass-through vector __W to __builtin_ia32_selectq_512 under the 8-bit
 * mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4757 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4758 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4759 {
4760   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4761                                              (__v8di)_mm512_cvtepi8_epi64(__A),
4762                                              (__v8di)__W);
4763 }
4764 
/* Zero-masking form of _mm512_cvtepi8_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4765 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4766 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
4767 {
4768   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4769                                              (__v8di)_mm512_cvtepi8_epi64(__A),
4770                                              (__v8di)_mm512_setzero_si512 ());
4771 }
4772 
/* Widens the eight 32-bit elements of __X to eight 64-bit elements via an
 * element-wise __builtin_convertvector from __v8si to __v8di.
 * NOTE(review): __v8si is declared outside this file; assuming it is a
 * signed int vector, the conversion sign-extends. */
4773 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4774 _mm512_cvtepi32_epi64(__m256i __X)
4775 {
4776   return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
4777 }
4778 
/* Merge-masking form of _mm512_cvtepi32_epi64: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectq_512 under the
 * 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4779 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4780 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
4781 {
4782   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4783                                              (__v8di)_mm512_cvtepi32_epi64(__X),
4784                                              (__v8di)__W);
4785 }
4786 
/* Zero-masking form of _mm512_cvtepi32_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4788 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
4789 {
4790   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4791                                              (__v8di)_mm512_cvtepi32_epi64(__X),
4792                                              (__v8di)_mm512_setzero_si512());
4793 }
4794 
/* Widens the sixteen 16-bit elements of __A to sixteen 32-bit elements via
 * an element-wise __builtin_convertvector from __v16hi to __v16si.
 * NOTE(review): __v16hi is declared outside this file; assuming it is a
 * signed short vector, the conversion sign-extends. */
4795 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4796 _mm512_cvtepi16_epi32(__m256i __A)
4797 {
4798   return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
4799 }
4800 
/* Merge-masking form of _mm512_cvtepi16_epi32: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectd_512 under the
 * 16-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4801 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4802 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
4803 {
4804   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4805                                             (__v16si)_mm512_cvtepi16_epi32(__A),
4806                                             (__v16si)__W);
4807 }
4808 
/* Zero-masking form of _mm512_cvtepi16_epi32: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectd_512
 * under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4809 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4810 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
4811 {
4812   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4813                                             (__v16si)_mm512_cvtepi16_epi32(__A),
4814                                             (__v16si)_mm512_setzero_si512 ());
4815 }
4816 
/* Widens the eight 16-bit elements of __A to eight 64-bit elements via an
 * element-wise __builtin_convertvector from __v8hi to __v8di.
 * NOTE(review): __v8hi is declared outside this file; assuming it is a
 * signed short vector, the conversion sign-extends. */
4817 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4818 _mm512_cvtepi16_epi64(__m128i __A)
4819 {
4820   return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
4821 }
4822 
/* Merge-masking form of _mm512_cvtepi16_epi64: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectq_512 under the
 * 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4823 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4824 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4825 {
4826   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4827                                              (__v8di)_mm512_cvtepi16_epi64(__A),
4828                                              (__v8di)__W);
4829 }
4830 
/* Zero-masking form of _mm512_cvtepi16_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4831 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4832 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
4833 {
4834   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4835                                              (__v8di)_mm512_cvtepi16_epi64(__A),
4836                                              (__v8di)_mm512_setzero_si512());
4837 }
4838 
/* Widens the sixteen 8-bit elements of __A to sixteen 32-bit elements via an
 * element-wise __builtin_convertvector from __v16qu to __v16si.
 * NOTE(review): __v16qu is declared outside this file; assuming it is an
 * unsigned char vector, the conversion zero-extends. */
4839 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4840 _mm512_cvtepu8_epi32(__m128i __A)
4841 {
4842   return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
4843 }
4844 
/* Merge-masking form of _mm512_cvtepu8_epi32: hands the converted vector and
 * the pass-through vector __W to __builtin_ia32_selectd_512 under the 16-bit
 * mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4845 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4846 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
4847 {
4848   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4849                                              (__v16si)_mm512_cvtepu8_epi32(__A),
4850                                              (__v16si)__W);
4851 }
4852 
/* Zero-masking form of _mm512_cvtepu8_epi32: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectd_512
 * under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4853 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4854 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
4855 {
4856   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4857                                              (__v16si)_mm512_cvtepu8_epi32(__A),
4858                                              (__v16si)_mm512_setzero_si512());
4859 }
4860 
/* Widens the low eight 8-bit elements of __A to eight 64-bit elements.  The
 * inner __builtin_shufflevector (indices 0-7) extracts the first eight bytes
 * of the __v16qu view; the outer __builtin_convertvector converts them
 * element-wise to __v8di.  The upper eight bytes of __A are not used.
 * NOTE(review): __v16qu is declared outside this file; assuming it is an
 * unsigned char vector, the conversion zero-extends. */
4861 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4862 _mm512_cvtepu8_epi64(__m128i __A)
4863 {
4864   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4865 }
4866 
/* Merge-masking form of _mm512_cvtepu8_epi64: hands the converted vector and
 * the pass-through vector __W to __builtin_ia32_selectq_512 under the 8-bit
 * mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4867 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4868 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4869 {
4870   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4871                                              (__v8di)_mm512_cvtepu8_epi64(__A),
4872                                              (__v8di)__W);
4873 }
4874 
/* Zero-masking form of _mm512_cvtepu8_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4875 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4876 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
4877 {
4878   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4879                                              (__v8di)_mm512_cvtepu8_epi64(__A),
4880                                              (__v8di)_mm512_setzero_si512());
4881 }
4882 
/* Widens the eight 32-bit elements of __X to eight 64-bit elements via an
 * element-wise __builtin_convertvector from __v8su to __v8di.
 * NOTE(review): __v8su is declared outside this file; assuming it is an
 * unsigned int vector, the conversion zero-extends. */
4883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4884 _mm512_cvtepu32_epi64(__m256i __X)
4885 {
4886   return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
4887 }
4888 
/* Merge-masking form of _mm512_cvtepu32_epi64: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectq_512 under the
 * 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4890 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
4891 {
4892   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4893                                              (__v8di)_mm512_cvtepu32_epi64(__X),
4894                                              (__v8di)__W);
4895 }
4896 
/* Zero-masking form of _mm512_cvtepu32_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4897 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4898 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
4899 {
4900   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4901                                              (__v8di)_mm512_cvtepu32_epi64(__X),
4902                                              (__v8di)_mm512_setzero_si512());
4903 }
4904 
/* Widens the sixteen 16-bit elements of __A to sixteen 32-bit elements via
 * an element-wise __builtin_convertvector from __v16hu to __v16si.
 * NOTE(review): __v16hu is declared outside this file; assuming it is an
 * unsigned short vector, the conversion zero-extends. */
4905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4906 _mm512_cvtepu16_epi32(__m256i __A)
4907 {
4908   return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
4909 }
4910 
/* Merge-masking form of _mm512_cvtepu16_epi32: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectd_512 under the
 * 16-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4911 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4912 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
4913 {
4914   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4915                                             (__v16si)_mm512_cvtepu16_epi32(__A),
4916                                             (__v16si)__W);
4917 }
4918 
/* Zero-masking form of _mm512_cvtepu16_epi32: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectd_512
 * under the 16-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4919 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4920 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
4921 {
4922   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4923                                             (__v16si)_mm512_cvtepu16_epi32(__A),
4924                                             (__v16si)_mm512_setzero_si512());
4925 }
4926 
/* Widens the eight 16-bit elements of __A to eight 64-bit elements via an
 * element-wise __builtin_convertvector from __v8hu to __v8di.
 * NOTE(review): __v8hu is declared outside this file; assuming it is an
 * unsigned short vector, the conversion zero-extends. */
4927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4928 _mm512_cvtepu16_epi64(__m128i __A)
4929 {
4930   return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
4931 }
4932 
/* Merge-masking form of _mm512_cvtepu16_epi64: hands the converted vector
 * and the pass-through vector __W to __builtin_ia32_selectq_512 under the
 * 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the converted element and a clear bit keeps __W's -- confirm. */
4933 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4934 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4935 {
4936   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4937                                              (__v8di)_mm512_cvtepu16_epi64(__A),
4938                                              (__v8di)__W);
4939 }
4940 
/* Zero-masking form of _mm512_cvtepu16_epi64: hands the converted vector and
 * an all-zero vector (_mm512_setzero_si512()) to __builtin_ia32_selectq_512
 * under the 8-bit mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4941 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4942 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
4943 {
4944   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4945                                              (__v8di)_mm512_cvtepu16_epi64(__A),
4946                                              (__v8di)_mm512_setzero_si512());
4947 }
4948 
/* Forwards __A and __B, both viewed as sixteen 32-bit elements, to
 * __builtin_ia32_prorvd512 and returns the result.
 * NOTE(review): the builtin is opaque here; presumably each element of __A
 * is rotated right by the count in the matching element of __B (VPRORVD). */
4949 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4950 _mm512_rorv_epi32 (__m512i __A, __m512i __B)
4951 {
4952   return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
4953 }
4954 
/* Merge-masking form of _mm512_rorv_epi32: hands the result of
 * _mm512_rorv_epi32(__A, __B) and the pass-through vector __W to
 * __builtin_ia32_selectd_512 under the 16-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the computed element and a clear bit keeps __W's -- confirm. */
4955 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4956 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4957 {
4958   return (__m512i)__builtin_ia32_selectd_512(__U,
4959                                            (__v16si)_mm512_rorv_epi32(__A, __B),
4960                                            (__v16si)__W);
4961 }
4962 
/* Zero-masking form of _mm512_rorv_epi32: hands the result of
 * _mm512_rorv_epi32(__A, __B) and an all-zero vector
 * (_mm512_setzero_si512()) to __builtin_ia32_selectd_512 under the 16-bit
 * mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4963 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4964 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
4965 {
4966   return (__m512i)__builtin_ia32_selectd_512(__U,
4967                                            (__v16si)_mm512_rorv_epi32(__A, __B),
4968                                            (__v16si)_mm512_setzero_si512());
4969 }
4970 
/* Forwards __A and __B, both viewed as eight 64-bit elements, to
 * __builtin_ia32_prorvq512 and returns the result.
 * NOTE(review): the builtin is opaque here; presumably each element of __A
 * is rotated right by the count in the matching element of __B (VPRORVQ). */
4971 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4972 _mm512_rorv_epi64 (__m512i __A, __m512i __B)
4973 {
4974   return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
4975 }
4976 
/* Merge-masking form of _mm512_rorv_epi64: hands the result of
 * _mm512_rorv_epi64(__A, __B) and the pass-through vector __W to
 * __builtin_ia32_selectq_512 under the 8-bit mask __U.
 * NOTE(review): the builtin is opaque here; by Intel's convention a set mask
 * bit keeps the computed element and a clear bit keeps __W's -- confirm. */
4977 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4978 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4979 {
4980   return (__m512i)__builtin_ia32_selectq_512(__U,
4981                                             (__v8di)_mm512_rorv_epi64(__A, __B),
4982                                             (__v8di)__W);
4983 }
4984 
/* Zero-masking form of _mm512_rorv_epi64: hands the result of
 * _mm512_rorv_epi64(__A, __B) and an all-zero vector
 * (_mm512_setzero_si512()) to __builtin_ia32_selectq_512 under the 8-bit
 * mask __U.
 * NOTE(review): presumably elements with a clear mask bit become zero. */
4985 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4986 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
4987 {
4988   return (__m512i)__builtin_ia32_selectq_512(__U,
4989                                             (__v8di)_mm512_rorv_epi64(__A, __B),
4990                                             (__v8di)_mm512_setzero_si512());
4991 }
4992 
4993 
4994 
/* Generic unmasked integer compares.  Each macro casts a and b to the
 * element view for its width (__v16si for the 32-bit forms, __v8di for the
 * 64-bit forms), passes predicate p as an int, and supplies an all-ones mask
 * ((__mmask16)-1 or (__mmask8)-1) as the builtin's last operand.  The epi
 * forms call the cmp*512 builtins and the epu forms the ucmp*512 builtins.
 * NOTE(review): presumably cmp = signed and ucmp = unsigned comparison, and
 * p must be a compile-time constant -- neither is visible in this file. */
4995 #define _mm512_cmp_epi32_mask(a, b, p) \
4996   ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
4997                                           (__v16si)(__m512i)(b), (int)(p), \
4998                                           (__mmask16)-1))
4999 
5000 #define _mm512_cmp_epu32_mask(a, b, p) \
5001   ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
5002                                            (__v16si)(__m512i)(b), (int)(p), \
5003                                            (__mmask16)-1))
5004 
5005 #define _mm512_cmp_epi64_mask(a, b, p) \
5006   ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
5007                                          (__v8di)(__m512i)(b), (int)(p), \
5008                                          (__mmask8)-1))
5009 
5010 #define _mm512_cmp_epu64_mask(a, b, p) \
5011   ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
5012                                           (__v8di)(__m512i)(b), (int)(p), \
5013                                           (__mmask8)-1))
5014 
/* Generic masked integer compares.  Each macro calls the same builtin as its
 * unmasked counterpart, but passes the caller's mask m (cast to __mmask16 or
 * __mmask8) as the last operand instead of an all-ones mask.
 * NOTE(review): the builtins are opaque here; presumably result bits whose
 * bit in m is clear are returned as zero -- confirm. */
5015 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
5016   ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
5017                                           (__v16si)(__m512i)(b), (int)(p), \
5018                                           (__mmask16)(m)))
5019 
5020 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
5021   ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
5022                                            (__v16si)(__m512i)(b), (int)(p), \
5023                                            (__mmask16)(m)))
5024 
5025 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
5026   ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
5027                                          (__v8di)(__m512i)(b), (int)(p), \
5028                                          (__mmask8)(m)))
5029 
5030 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
5031   ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
5032                                           (__v8di)(__m512i)(b), (int)(p), \
5033                                           (__mmask8)(m)))
5034 
/* _mm512_rol_epi32 forwards vector A (as __v16si) and the int B to
 * __builtin_ia32_prold512.  The mask/maskz forms pass that result to
 * __builtin_ia32_selectd_512 under mask U, with W or the vector returned by
 * _mm512_setzero_si512() as the other operand. */
#define _mm512_rol_epi32(A, B) \
  ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(A), (int)(B)))

#define _mm512_mask_rol_epi32(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), (__v16si)_mm512_rol_epi32((A), (B)), \
      (__v16si)(__m512i)(W)))

#define _mm512_maskz_rol_epi32(U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), (__v16si)_mm512_rol_epi32((A), (B)), \
      (__v16si)_mm512_setzero_si512()))
5047 
/* _mm512_rol_epi64 forwards vector A (as __v8di) and the int B to
 * __builtin_ia32_prolq512.  The mask/maskz forms pass that result to
 * __builtin_ia32_selectq_512 under mask U, with W or the vector returned by
 * _mm512_setzero_si512() as the other operand. */
#define _mm512_rol_epi64(A, B) \
  ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(A), (int)(B)))

#define _mm512_mask_rol_epi64(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), (__v8di)_mm512_rol_epi64((A), (B)), \
      (__v8di)(__m512i)(W)))

#define _mm512_maskz_rol_epi64(U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), (__v8di)_mm512_rol_epi64((A), (B)), \
      (__v8di)_mm512_setzero_si512()))
5060 
5061 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5062 _mm512_rolv_epi32 (__m512i __A, __m512i __B)
5063 {
5064   return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
5065 }
5066 
5067 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5068 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
5069 {
5070   return (__m512i)__builtin_ia32_selectd_512(__U,
5071                                            (__v16si)_mm512_rolv_epi32(__A, __B),
5072                                            (__v16si)__W);
5073 }
5074 
5075 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5076 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
5077 {
5078   return (__m512i)__builtin_ia32_selectd_512(__U,
5079                                            (__v16si)_mm512_rolv_epi32(__A, __B),
5080                                            (__v16si)_mm512_setzero_si512());
5081 }
5082 
5083 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5084 _mm512_rolv_epi64 (__m512i __A, __m512i __B)
5085 {
5086   return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
5087 }
5088 
5089 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5090 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
5091 {
5092   return (__m512i)__builtin_ia32_selectq_512(__U,
5093                                             (__v8di)_mm512_rolv_epi64(__A, __B),
5094                                             (__v8di)__W);
5095 }
5096 
5097 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5098 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
5099 {
5100   return (__m512i)__builtin_ia32_selectq_512(__U,
5101                                             (__v8di)_mm512_rolv_epi64(__A, __B),
5102                                             (__v8di)_mm512_setzero_si512());
5103 }
5104 
/* _mm512_ror_epi32 forwards vector A (as __v16si) and the int B to
 * __builtin_ia32_prord512.  The mask/maskz forms pass that result to
 * __builtin_ia32_selectd_512 under mask U, with W or the vector returned by
 * _mm512_setzero_si512() as the other operand. */
#define _mm512_ror_epi32(A, B) \
  ((__m512i)__builtin_ia32_prord512( \
      (__v16si)(__m512i)(A), (int)(B)))

#define _mm512_mask_ror_epi32(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), (__v16si)_mm512_ror_epi32((A), (B)), \
      (__v16si)(__m512i)(W)))

#define _mm512_maskz_ror_epi32(U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), (__v16si)_mm512_ror_epi32((A), (B)), \
      (__v16si)_mm512_setzero_si512()))
5117 
/* _mm512_ror_epi64 forwards vector A (as __v8di) and the int B to
 * __builtin_ia32_prorq512.  The mask/maskz forms pass that result to
 * __builtin_ia32_selectq_512 under mask U, with W or the vector returned by
 * _mm512_setzero_si512() as the other operand. */
#define _mm512_ror_epi64(A, B) \
  ((__m512i)__builtin_ia32_prorq512( \
      (__v8di)(__m512i)(A), (int)(B)))

#define _mm512_mask_ror_epi64(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), (__v8di)_mm512_ror_epi64((A), (B)), \
      (__v8di)(__m512i)(W)))

#define _mm512_maskz_ror_epi64(U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), (__v8di)_mm512_ror_epi64((A), (B)), \
      (__v8di)_mm512_setzero_si512()))
5130 
5131 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5132 _mm512_slli_epi32(__m512i __A, unsigned int __B)
5133 {
5134   return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
5135 }
5136 
5137 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5138 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
5139                        unsigned int __B)
5140 {
5141   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5142                                          (__v16si)_mm512_slli_epi32(__A, __B),
5143                                          (__v16si)__W);
5144 }
5145 
5146 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5147 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
5148   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5149                                          (__v16si)_mm512_slli_epi32(__A, __B),
5150                                          (__v16si)_mm512_setzero_si512());
5151 }
5152 
5153 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5154 _mm512_slli_epi64(__m512i __A, unsigned int __B)
5155 {
5156   return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
5157 }
5158 
5159 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5160 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
5161 {
5162   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5163                                           (__v8di)_mm512_slli_epi64(__A, __B),
5164                                           (__v8di)__W);
5165 }
5166 
5167 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5168 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
5169 {
5170   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5171                                           (__v8di)_mm512_slli_epi64(__A, __B),
5172                                           (__v8di)_mm512_setzero_si512());
5173 }
5174 
5175 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5176 _mm512_srli_epi32(__m512i __A, unsigned int __B)
5177 {
5178   return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
5179 }
5180 
5181 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5182 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
5183                        unsigned int __B)
5184 {
5185   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5186                                          (__v16si)_mm512_srli_epi32(__A, __B),
5187                                          (__v16si)__W);
5188 }
5189 
5190 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5191 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
5192   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5193                                          (__v16si)_mm512_srli_epi32(__A, __B),
5194                                          (__v16si)_mm512_setzero_si512());
5195 }
5196 
5197 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5198 _mm512_srli_epi64(__m512i __A, unsigned int __B)
5199 {
5200   return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
5201 }
5202 
5203 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5204 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
5205                        unsigned int __B)
5206 {
5207   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5208                                           (__v8di)_mm512_srli_epi64(__A, __B),
5209                                           (__v8di)__W);
5210 }
5211 
5212 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5213 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
5214                         unsigned int __B)
5215 {
5216   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5217                                           (__v8di)_mm512_srli_epi64(__A, __B),
5218                                           (__v8di)_mm512_setzero_si512());
5219 }
5220 
5221 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5222 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
5223 {
5224   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5225               (__v16si) __W,
5226               (__mmask16) __U);
5227 }
5228 
5229 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5230 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
5231 {
5232   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5233               (__v16si)
5234               _mm512_setzero_si512 (),
5235               (__mmask16) __U);
5236 }
5237 
5238 static __inline__ void __DEFAULT_FN_ATTRS512
5239 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
5240 {
5241   __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
5242           (__mmask16) __U);
5243 }
5244 
5245 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5246 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
5247 {
5248   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5249                  (__v16si) __A,
5250                  (__v16si) __W);
5251 }
5252 
5253 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5254 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
5255 {
5256   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5257                  (__v16si) __A,
5258                  (__v16si) _mm512_setzero_si512 ());
5259 }
5260 
5261 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5262 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
5263 {
5264   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5265                  (__v8di) __A,
5266                  (__v8di) __W);
5267 }
5268 
5269 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5270 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
5271 {
5272   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5273                  (__v8di) __A,
5274                  (__v8di) _mm512_setzero_si512 ());
5275 }
5276 
5277 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5278 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
5279 {
5280   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5281               (__v8di) __W,
5282               (__mmask8) __U);
5283 }
5284 
5285 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5286 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
5287 {
5288   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5289               (__v8di)
5290               _mm512_setzero_si512 (),
5291               (__mmask8) __U);
5292 }
5293 
5294 static __inline__ void __DEFAULT_FN_ATTRS512
5295 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
5296 {
5297   __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
5298           (__mmask8) __U);
5299 }
5300 
5301 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5302 _mm512_movedup_pd (__m512d __A)
5303 {
5304   return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
5305                                           0, 0, 2, 2, 4, 4, 6, 6);
5306 }
5307 
5308 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5309 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
5310 {
5311   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
5312                                               (__v8df)_mm512_movedup_pd(__A),
5313                                               (__v8df)__W);
5314 }
5315 
5316 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5317 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
5318 {
5319   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
5320                                               (__v8df)_mm512_movedup_pd(__A),
5321                                               (__v8df)_mm512_setzero_pd());
5322 }
5323 
/* Packed-double fixupimm wrappers.  The plain and mask forms expand to
 * __builtin_ia32_fixupimmpd512_mask; the maskz forms expand to
 * __builtin_ia32_fixupimmpd512_maskz.  Forms without a mask argument pass
 * (__mmask8)-1, and forms without a rounding argument R pass
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_fixupimm_round_pd(A, B, C, I, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_pd(A, U, B, C, I, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm512_fixupimm_pd(A, B, C, I) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_pd(A, U, B, C, I) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, I, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fixupimm_pd(U, A, B, C, I) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
5363 
/* Packed-float fixupimm wrappers.  The plain and mask forms expand to
 * __builtin_ia32_fixupimmps512_mask; the maskz forms expand to
 * __builtin_ia32_fixupimmps512_maskz.  Forms without a mask argument pass
 * (__mmask16)-1, and forms without a rounding argument R pass
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_fixupimm_round_ps(A, B, C, I, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_ps(A, U, B, C, I, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)(U), (int)(R)))

#define _mm512_fixupimm_ps(A, B, C, I) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_ps(A, U, B, C, I) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, I, R) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fixupimm_ps(U, A, B, C, I) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(I), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
5403 
/* 128-bit double fixupimm wrappers.  The plain and mask forms expand to
 * __builtin_ia32_fixupimmsd_mask; the maskz forms expand to
 * __builtin_ia32_fixupimmsd_maskz.  Forms without a mask argument pass
 * (__mmask8)-1, and forms without a rounding argument R pass
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_fixupimm_round_sd(A, B, C, I, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_sd(A, U, B, C, I, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_sd(A, B, C, I) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_sd(A, U, B, C, I) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_sd(U, A, B, C, I, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_sd(U, A, B, C, I) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
5442 
/* 128-bit float fixupimm wrappers.  The plain and mask forms expand to
 * __builtin_ia32_fixupimmss_mask; the maskz forms expand to
 * __builtin_ia32_fixupimmss_maskz.  Forms without a mask argument pass
 * (__mmask8)-1, and forms without a rounding argument R pass
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_fixupimm_round_ss(A, B, C, I, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_ss(A, U, B, C, I, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_ss(A, B, C, I) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_ss(A, U, B, C, I) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_ss(U, A, B, C, I, R) \
  ((__m128)__builtin_ia32_fixupimmss_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_ss(U, A, B, C, I) \
  ((__m128)__builtin_ia32_fixupimmss_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), \
      (int)(I), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
5481 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B with the vector
 * returned by _mm_setzero_pd() as third operand, an all-ones mask and
 * rounding argument R. */
#define _mm_getexp_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
5487 
5488 
5489 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5490 _mm_getexp_sd (__m128d __A, __m128d __B)
5491 {
5492   return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
5493                  (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
5494 }
5495 
5496 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5497 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
5498 {
5499  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
5500           (__v2df) __B,
5501           (__v2df) __W,
5502           (__mmask8) __U,
5503           _MM_FROUND_CUR_DIRECTION);
5504 }
5505 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B with W as third
 * operand, mask U and rounding argument R. */
#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
5511 
5512 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5513 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
5514 {
5515  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
5516           (__v2df) __B,
5517           (__v2df) _mm_setzero_pd (),
5518           (__mmask8) __U,
5519           _MM_FROUND_CUR_DIRECTION);
5520 }
5521 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B with the vector
 * returned by _mm_setzero_pd() as third operand, mask U and rounding
 * argument R. */
#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
5527 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B with the vector
 * returned by _mm_setzero_ps() as third operand, an all-ones mask and
 * rounding argument R. */
#define _mm_getexp_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
5533 
5534 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5535 _mm_getexp_ss (__m128 __A, __m128 __B)
5536 {
5537   return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5538                 (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
5539 }
5540 
5541 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5542 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
5543 {
5544  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5545           (__v4sf) __B,
5546           (__v4sf) __W,
5547           (__mmask8) __U,
5548           _MM_FROUND_CUR_DIRECTION);
5549 }
5550 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B with W as third
 * operand, mask U and rounding argument R. */
#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
5556 
5557 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5558 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
5559 {
5560  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5561           (__v4sf) __B,
5562           (__v4sf) _mm_setzero_ps (),
5563           (__mmask8) __U,
5564           _MM_FROUND_CUR_DIRECTION);
5565 }
5566 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B with the vector
 * returned by _mm_setzero_ps() as third operand, mask U and rounding
 * argument R. */
#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
5572 
/* Double getmant wrappers around __builtin_ia32_getmantsd_round_mask.  The
 * third builtin operand is built as ((D) << 2) | (C).  The fourth operand is
 * W for the mask forms and the vector returned by _mm_setzero_pd()
 * otherwise.  Forms without a mask argument pass (__mmask8)-1, and forms
 * without a rounding argument R pass _MM_FROUND_CUR_DIRECTION. */
#define _mm_getmant_round_sd(A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sd(A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(((D) << 2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
5617 
/* Float getmant wrappers around __builtin_ia32_getmantss_round_mask.  The
 * third builtin operand is built as ((D) << 2) | (C).  The fourth operand is
 * W for the mask forms and the vector returned by _mm_setzero_ps()
 * otherwise.  Forms without a mask argument pass (__mmask8)-1, and forms
 * without a rounding argument R pass _MM_FROUND_CUR_DIRECTION. */
#define _mm_getmant_round_ss(A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_ss(A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(((D) << 2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
5662 
5663 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
5664 _mm512_kmov (__mmask16 __A)
5665 {
5666   return  __A;
5667 }
5668 
/// Passes \a A and \a B (as __v2df) together with the int arguments \a P and
/// \a R to __builtin_ia32_vcomisd and yields the builtin's result as int.
#define _mm_comi_round_sd(A, B, P, R) \
  ((int)__builtin_ia32_vcomisd( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(P), \
      (int)(R)))

/// Single-precision counterpart: \a A and \a B are passed as __v4sf to
/// __builtin_ia32_vcomiss along with the int arguments \a P and \a R.
#define _mm_comi_round_ss(A, B, P, R) \
  ((int)__builtin_ia32_vcomiss( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(P), \
      (int)(R)))
5676 
#ifdef __x86_64__
/// Only defined when __x86_64__ is set. Passes \a A (as __v2df) and the
/// rounding argument \a R to __builtin_ia32_vcvtsd2si64; the result is cast
/// to long long.
#define _mm_cvt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64( \
      (__v2df)(__m128d)(A), \
      (int)(R)))
#endif
5681 
5682 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5683 _mm512_sll_epi32(__m512i __A, __m128i __B)
5684 {
5685   return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
5686 }
5687 
5688 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5689 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5690 {
5691   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5692                                           (__v16si)_mm512_sll_epi32(__A, __B),
5693                                           (__v16si)__W);
5694 }
5695 
5696 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5697 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5698 {
5699   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5700                                           (__v16si)_mm512_sll_epi32(__A, __B),
5701                                           (__v16si)_mm512_setzero_si512());
5702 }
5703 
5704 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5705 _mm512_sll_epi64(__m512i __A, __m128i __B)
5706 {
5707   return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
5708 }
5709 
5710 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5711 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5712 {
5713   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5714                                              (__v8di)_mm512_sll_epi64(__A, __B),
5715                                              (__v8di)__W);
5716 }
5717 
5718 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5719 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5720 {
5721   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5722                                            (__v8di)_mm512_sll_epi64(__A, __B),
5723                                            (__v8di)_mm512_setzero_si512());
5724 }
5725 
5726 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5727 _mm512_sllv_epi32(__m512i __X, __m512i __Y)
5728 {
5729   return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
5730 }
5731 
5732 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5733 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5734 {
5735   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5736                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
5737                                            (__v16si)__W);
5738 }
5739 
5740 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5741 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5742 {
5743   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5744                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
5745                                            (__v16si)_mm512_setzero_si512());
5746 }
5747 
5748 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5749 _mm512_sllv_epi64(__m512i __X, __m512i __Y)
5750 {
5751   return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
5752 }
5753 
5754 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5755 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5756 {
5757   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5758                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
5759                                             (__v8di)__W);
5760 }
5761 
5762 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5763 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5764 {
5765   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5766                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
5767                                             (__v8di)_mm512_setzero_si512());
5768 }
5769 
5770 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5771 _mm512_sra_epi32(__m512i __A, __m128i __B)
5772 {
5773   return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
5774 }
5775 
5776 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5777 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5778 {
5779   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5780                                           (__v16si)_mm512_sra_epi32(__A, __B),
5781                                           (__v16si)__W);
5782 }
5783 
5784 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5785 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5786 {
5787   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5788                                           (__v16si)_mm512_sra_epi32(__A, __B),
5789                                           (__v16si)_mm512_setzero_si512());
5790 }
5791 
5792 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5793 _mm512_sra_epi64(__m512i __A, __m128i __B)
5794 {
5795   return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
5796 }
5797 
5798 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5799 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5800 {
5801   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5802                                            (__v8di)_mm512_sra_epi64(__A, __B),
5803                                            (__v8di)__W);
5804 }
5805 
5806 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5807 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5808 {
5809   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5810                                            (__v8di)_mm512_sra_epi64(__A, __B),
5811                                            (__v8di)_mm512_setzero_si512());
5812 }
5813 
5814 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5815 _mm512_srav_epi32(__m512i __X, __m512i __Y)
5816 {
5817   return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
5818 }
5819 
5820 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5821 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5822 {
5823   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5824                                            (__v16si)_mm512_srav_epi32(__X, __Y),
5825                                            (__v16si)__W);
5826 }
5827 
5828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5829 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5830 {
5831   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5832                                            (__v16si)_mm512_srav_epi32(__X, __Y),
5833                                            (__v16si)_mm512_setzero_si512());
5834 }
5835 
5836 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5837 _mm512_srav_epi64(__m512i __X, __m512i __Y)
5838 {
5839   return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
5840 }
5841 
5842 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5843 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5844 {
5845   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5846                                             (__v8di)_mm512_srav_epi64(__X, __Y),
5847                                             (__v8di)__W);
5848 }
5849 
5850 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5851 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5852 {
5853   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5854                                             (__v8di)_mm512_srav_epi64(__X, __Y),
5855                                             (__v8di)_mm512_setzero_si512());
5856 }
5857 
5858 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5859 _mm512_srl_epi32(__m512i __A, __m128i __B)
5860 {
5861   return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
5862 }
5863 
5864 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5865 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5866 {
5867   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5868                                           (__v16si)_mm512_srl_epi32(__A, __B),
5869                                           (__v16si)__W);
5870 }
5871 
5872 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5873 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5874 {
5875   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5876                                           (__v16si)_mm512_srl_epi32(__A, __B),
5877                                           (__v16si)_mm512_setzero_si512());
5878 }
5879 
5880 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5881 _mm512_srl_epi64(__m512i __A, __m128i __B)
5882 {
5883   return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
5884 }
5885 
5886 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5887 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5888 {
5889   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5890                                            (__v8di)_mm512_srl_epi64(__A, __B),
5891                                            (__v8di)__W);
5892 }
5893 
5894 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5895 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5896 {
5897   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5898                                            (__v8di)_mm512_srl_epi64(__A, __B),
5899                                            (__v8di)_mm512_setzero_si512());
5900 }
5901 
5902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5903 _mm512_srlv_epi32(__m512i __X, __m512i __Y)
5904 {
5905   return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
5906 }
5907 
5908 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5909 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5910 {
5911   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5912                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
5913                                            (__v16si)__W);
5914 }
5915 
5916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5917 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5918 {
5919   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5920                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
5921                                            (__v16si)_mm512_setzero_si512());
5922 }
5923 
5924 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5925 _mm512_srlv_epi64 (__m512i __X, __m512i __Y)
5926 {
5927   return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
5928 }
5929 
5930 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5931 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5932 {
5933   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5934                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
5935                                             (__v8di)__W);
5936 }
5937 
5938 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5939 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5940 {
5941   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5942                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
5943                                             (__v8di)_mm512_setzero_si512());
5944 }
5945 
/// \enum _MM_TERNLOG_ENUM
///    Helper constants for describing a ternary logic operation over the
///    vectors \a A, \a B and \a C; the resulting value is what gets passed as
///    \a imm. Each constant is an 8-bit pattern, so ordinary bitwise
///    operators combine them, e.g. (_MM_TERNLOG_A & _MM_TERNLOG_B) is 0xC0.
typedef enum {
  _MM_TERNLOG_A = 0xF0, /* binary 1111 0000 */
  _MM_TERNLOG_B = 0xCC, /* binary 1100 1100 */
  _MM_TERNLOG_C = 0xAA  /* binary 1010 1010 */
} _MM_TERNLOG_ENUM;
5954 
/// Unmasked form: identical to the merge-masked form below with an all-ones
/// __mmask16.
#define _mm512_ternarylogic_epi32(A, B, C, imm) \
  _mm512_mask_ternarylogic_epi32((A), (__mmask16)-1, (B), (C), (imm))

/// Passes \a A, \a B and \a C (each as __v16si), the immediate \a imm (as
/// unsigned char) and the mask \a U to __builtin_ia32_pternlogd512_mask.
#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), \
      (__v16si)(__m512i)(B), \
      (__v16si)(__m512i)(C), \
      (unsigned char)(imm), \
      (__mmask16)(U)))

/// Same operands as the merge-masked form, but routed to
/// __builtin_ia32_pternlogd512_maskz.
#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_maskz( \
      (__v16si)(__m512i)(A), \
      (__v16si)(__m512i)(B), \
      (__v16si)(__m512i)(C), \
      (unsigned char)(imm), \
      (__mmask16)(U)))
5969 
/// Unmasked form: identical to the merge-masked form below with an all-ones
/// __mmask8.
#define _mm512_ternarylogic_epi64(A, B, C, imm) \
  _mm512_mask_ternarylogic_epi64((A), (__mmask8)-1, (B), (C), (imm))

/// Passes \a A, \a B and \a C (each as __v8di), the immediate \a imm (as
/// unsigned char) and the mask \a U to __builtin_ia32_pternlogq512_mask.
#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), \
      (__v8di)(__m512i)(B), \
      (__v8di)(__m512i)(C), \
      (unsigned char)(imm), \
      (__mmask8)(U)))

/// Same operands as the merge-masked form, but routed to
/// __builtin_ia32_pternlogq512_maskz.
#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_maskz( \
      (__v8di)(__m512i)(A), \
      (__v8di)(__m512i)(B), \
      (__v8di)(__m512i)(C), \
      (unsigned char)(imm), \
      (__mmask8)(U)))
5984 
#ifdef __x86_64__
/// Only defined when __x86_64__ is set. Passes \a A (as __v2df) and the
/// rounding argument \a R to __builtin_ia32_vcvtsd2si64; the result is cast
/// to long long.
#define _mm_cvt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64( \
      (__v2df)(__m128d)(A), \
      (int)(R)))
#endif

/// Passes \a A (as __v2df) and the rounding argument \a R to
/// __builtin_ia32_vcvtsd2si32; the result is cast to int.
#define _mm_cvt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32( \
      (__v2df)(__m128d)(A), \
      (int)(R)))

/// Alternate spelling of _mm_cvt_roundsd_si32.
#define _mm_cvt_roundsd_i32(A, R) \
  _mm_cvt_roundsd_si32((A), (R))
5995 
5996 #define _mm_cvt_roundsd_u32(A, R) \
5997   ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
5998 
5999 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6000 _mm_cvtsd_u32 (__m128d __A)
6001 {
6002   return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
6003              _MM_FROUND_CUR_DIRECTION);
6004 }
6005 
6006 #ifdef __x86_64__
6007 #define _mm_cvt_roundsd_u64(A, R) \
6008   ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
6009                                                    (int)(R)))
6010 
6011 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6012 _mm_cvtsd_u64 (__m128d __A)
6013 {
6014   return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
6015                  __A,
6016                  _MM_FROUND_CUR_DIRECTION);
6017 }
6018 #endif
6019 
/// Passes \a A (as __v4sf) and the rounding argument \a R to
/// __builtin_ia32_vcvtss2si32; the result is cast to int.
#define _mm_cvt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32( \
      (__v4sf)(__m128)(A), \
      (int)(R)))

/// Alternate spelling of _mm_cvt_roundss_si32.
#define _mm_cvt_roundss_i32(A, R) \
  _mm_cvt_roundss_si32((A), (R))

#ifdef __x86_64__
/// Passes \a A (as __v4sf) and the rounding argument \a R to
/// __builtin_ia32_vcvtss2si64; the result is cast to long long.
#define _mm_cvt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64( \
      (__v4sf)(__m128)(A), \
      (int)(R)))

/// Alternate spelling of _mm_cvt_roundss_si64.
#define _mm_cvt_roundss_i64(A, R) \
  _mm_cvt_roundss_si64((A), (R))
#endif
6033 
6034 #define _mm_cvt_roundss_u32(A, R) \
6035   ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
6036 
6037 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6038 _mm_cvtss_u32 (__m128 __A)
6039 {
6040   return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
6041              _MM_FROUND_CUR_DIRECTION);
6042 }
6043 
6044 #ifdef __x86_64__
6045 #define _mm_cvt_roundss_u64(A, R) \
6046   ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
6047                                                    (int)(R)))
6048 
6049 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6050 _mm_cvtss_u64 (__m128 __A)
6051 {
6052   return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
6053                  __A,
6054                  _MM_FROUND_CUR_DIRECTION);
6055 }
6056 #endif
6057 
6058 #define _mm_cvtt_roundsd_i32(A, R) \
6059   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
6060 
6061 #define _mm_cvtt_roundsd_si32(A, R) \
6062   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
6063 
6064 static __inline__ int __DEFAULT_FN_ATTRS128
6065 _mm_cvttsd_i32 (__m128d __A)
6066 {
6067   return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
6068               _MM_FROUND_CUR_DIRECTION);
6069 }
6070 
6071 #ifdef __x86_64__
6072 #define _mm_cvtt_roundsd_si64(A, R) \
6073   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
6074 
6075 #define _mm_cvtt_roundsd_i64(A, R) \
6076   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
6077 
6078 static __inline__ long long __DEFAULT_FN_ATTRS128
6079 _mm_cvttsd_i64 (__m128d __A)
6080 {
6081   return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
6082               _MM_FROUND_CUR_DIRECTION);
6083 }
6084 #endif
6085 
6086 #define _mm_cvtt_roundsd_u32(A, R) \
6087   ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
6088 
6089 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6090 _mm_cvttsd_u32 (__m128d __A)
6091 {
6092   return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
6093               _MM_FROUND_CUR_DIRECTION);
6094 }
6095 
6096 #ifdef __x86_64__
6097 #define _mm_cvtt_roundsd_u64(A, R) \
6098   ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
6099                                                     (int)(R)))
6100 
6101 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6102 _mm_cvttsd_u64 (__m128d __A)
6103 {
6104   return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
6105                   __A,
6106                   _MM_FROUND_CUR_DIRECTION);
6107 }
6108 #endif
6109 
6110 #define _mm_cvtt_roundss_i32(A, R) \
6111   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
6112 
6113 #define _mm_cvtt_roundss_si32(A, R) \
6114   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
6115 
6116 static __inline__ int __DEFAULT_FN_ATTRS128
6117 _mm_cvttss_i32 (__m128 __A)
6118 {
6119   return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
6120               _MM_FROUND_CUR_DIRECTION);
6121 }
6122 
6123 #ifdef __x86_64__
6124 #define _mm_cvtt_roundss_i64(A, R) \
6125   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
6126 
6127 #define _mm_cvtt_roundss_si64(A, R) \
6128   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
6129 
6130 static __inline__ long long __DEFAULT_FN_ATTRS128
6131 _mm_cvttss_i64 (__m128 __A)
6132 {
6133   return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
6134               _MM_FROUND_CUR_DIRECTION);
6135 }
6136 #endif
6137 
6138 #define _mm_cvtt_roundss_u32(A, R) \
6139   ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
6140 
6141 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6142 _mm_cvttss_u32 (__m128 __A)
6143 {
6144   return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
6145               _MM_FROUND_CUR_DIRECTION);
6146 }
6147 
6148 #ifdef __x86_64__
6149 #define _mm_cvtt_roundss_u64(A, R) \
6150   ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
6151                                                     (int)(R)))
6152 
6153 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6154 _mm_cvttss_u64 (__m128 __A)
6155 {
6156   return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
6157                   __A,
6158                   _MM_FROUND_CUR_DIRECTION);
6159 }
6160 #endif
6161 
/// Passes \a X (as __v8df) and the int immediate \a C to
/// __builtin_ia32_vpermilpd512.
#define _mm512_permute_pd(X, C) \
  ((__m512d)__builtin_ia32_vpermilpd512( \
      (__v8df)(__m512d)(X), \
      (int)(C)))

/// Merge-masked form: __builtin_ia32_selectpd_512 picks between the permute
/// result and \a W under mask \a U.
#define _mm512_mask_permute_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)), \
      (__v8df)(__m512d)(W)))

/// Zero-masked form: same selection against _mm512_setzero_pd().
#define _mm512_maskz_permute_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)), \
      (__v8df)_mm512_setzero_pd()))
6174 
/// Passes \a X (as __v16sf) and the int immediate \a C to
/// __builtin_ia32_vpermilps512.
#define _mm512_permute_ps(X, C) \
  ((__m512)__builtin_ia32_vpermilps512( \
      (__v16sf)(__m512)(X), \
      (int)(C)))

/// Merge-masked form: __builtin_ia32_selectps_512 picks between the permute
/// result and \a W under mask \a U.
#define _mm512_mask_permute_ps(W, U, X, C) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)), \
      (__v16sf)(__m512)(W)))

/// Zero-masked form: same selection against _mm512_setzero_ps().
#define _mm512_maskz_permute_ps(U, X, C) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)), \
      (__v16sf)_mm512_setzero_ps()))
6187 
6188 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6189 _mm512_permutevar_pd(__m512d __A, __m512i __C)
6190 {
6191   return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
6192 }
6193 
6194 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6195 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
6196 {
6197   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
6198                                          (__v8df)_mm512_permutevar_pd(__A, __C),
6199                                          (__v8df)__W);
6200 }
6201 
6202 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6203 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
6204 {
6205   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
6206                                          (__v8df)_mm512_permutevar_pd(__A, __C),
6207                                          (__v8df)_mm512_setzero_pd());
6208 }
6209 
6210 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6211 _mm512_permutevar_ps(__m512 __A, __m512i __C)
6212 {
6213   return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
6214 }
6215 
6216 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6217 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
6218 {
6219   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
6220                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
6221                                         (__v16sf)__W);
6222 }
6223 
6224 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6225 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
6226 {
6227   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
6228                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
6229                                         (__v16sf)_mm512_setzero_ps());
6230 }
6231 
6232 static __inline __m512d __DEFAULT_FN_ATTRS512
6233 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
6234 {
6235   return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
6236                                                  (__v8df)__B);
6237 }
6238 
6239 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6240 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
6241 {
6242   return (__m512d)__builtin_ia32_selectpd_512(__U,
6243                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6244                                   (__v8df)__A);
6245 }
6246 
6247 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6248 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
6249                              __m512d __B)
6250 {
6251   return (__m512d)__builtin_ia32_selectpd_512(__U,
6252                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6253                                   (__v8df)(__m512d)__I);
6254 }
6255 
6256 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6257 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
6258                              __m512d __B)
6259 {
6260   return (__m512d)__builtin_ia32_selectpd_512(__U,
6261                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6262                                   (__v8df)_mm512_setzero_pd());
6263 }
6264 
6265 static __inline __m512 __DEFAULT_FN_ATTRS512
6266 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
6267 {
6268   return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
6269                                                 (__v16sf) __B);
6270 }
6271 
6272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6273 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
6274 {
6275   return (__m512)__builtin_ia32_selectps_512(__U,
6276                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6277                                  (__v16sf)__A);
6278 }
6279 
6280 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6281 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
6282 {
6283   return (__m512)__builtin_ia32_selectps_512(__U,
6284                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6285                                  (__v16sf)(__m512)__I);
6286 }
6287 
6288 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6289 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
6290 {
6291   return (__m512)__builtin_ia32_selectps_512(__U,
6292                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6293                                  (__v16sf)_mm512_setzero_ps());
6294 }
6295 
6296 
/// Unmasked form: the merge-masked form below with
/// _mm256_undefined_si256() as pass-through and an all-ones __mmask8.
#define _mm512_cvtt_roundpd_epu32(A, R) \
  _mm512_mask_cvtt_roundpd_epu32(_mm256_undefined_si256(), (__mmask8)-1, \
                                 (A), (R))

/// Passes \a A (as __v8df), the pass-through \a W (as __v8si), the mask \a U
/// and the int argument \a R to __builtin_ia32_cvttpd2udq512_mask; the result
/// is a __m256i.
#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8si)(__m256i)(W), \
      (__mmask8)(U), \
      (int)(R)))

/// Zero-masked form: the merge-masked form with _mm256_setzero_si256() as
/// pass-through.
#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
  _mm512_mask_cvtt_roundpd_epu32(_mm256_setzero_si256(), (U), (A), (R))
6311 
6312 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6313 _mm512_cvttpd_epu32 (__m512d __A)
6314 {
6315   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6316                   (__v8si)
6317                   _mm256_undefined_si256 (),
6318                   (__mmask8) -1,
6319                   _MM_FROUND_CUR_DIRECTION);
6320 }
6321 
6322 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6323 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
6324 {
6325   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6326                   (__v8si) __W,
6327                   (__mmask8) __U,
6328                   _MM_FROUND_CUR_DIRECTION);
6329 }
6330 
6331 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6332 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
6333 {
6334   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6335                   (__v8si)
6336                   _mm256_setzero_si256 (),
6337                   (__mmask8) __U,
6338                   _MM_FROUND_CUR_DIRECTION);
6339 }
6340 
/// Scalar double-precision roundscale forms. Each one ends up in
/// __builtin_ia32_rndscalesd_round_mask; they differ only in the pass-through
/// vector, the mask and the final rounding argument.

/// Unmasked form with explicit \a R: zero pass-through, all-ones mask.
#define _mm_roundscale_round_sd(A, B, imm, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, \
      (int)(imm), \
      (int)(R)))

/// Unmasked form with _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_roundscale_sd(A, B, imm) \
  _mm_roundscale_round_sd((A), (B), (imm), _MM_FROUND_CUR_DIRECTION)

/// Merge-masked form with _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
  _mm_mask_roundscale_round_sd((W), (U), (A), (B), (imm), \
                               _MM_FROUND_CUR_DIRECTION)

/// Merge-masked form with explicit \a R: \a W is the pass-through vector and
/// \a U the mask.
#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))

/// Zero-masked form with _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_maskz_roundscale_sd(U, A, B, I) \
  _mm_maskz_roundscale_round_sd((U), (A), (B), (I), _MM_FROUND_CUR_DIRECTION)

/// Zero-masked form with explicit \a R: _mm_setzero_pd() is the pass-through
/// vector.
#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
6382 
/* Unmasked scalar-float roundscale with caller-supplied R: the mask is
 * all-ones and the pass-through operand is _mm_setzero_ps(). */
#define _mm_roundscale_round_ss(A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(I), (int)(R)))
6389 
/* Unmasked scalar-float roundscale: all-ones mask, zero pass-through, and
 * _MM_FROUND_CUR_DIRECTION as the final builtin argument. */
#define _mm_roundscale_ss(A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(I), _MM_FROUND_CUR_DIRECTION))
6396 
/* Masked scalar-float roundscale: W is the pass-through operand and U the
 * mask; the final builtin argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm_mask_roundscale_ss(W, U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
6403 
/* Masked scalar-float roundscale with caller-supplied R forwarded as the
 * final builtin argument. */
#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))
6410 
/* Zero-pass-through scalar-float roundscale under mask U; the final builtin
 * argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_roundscale_ss(U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
6417 
/* Zero-pass-through scalar-float roundscale under mask U, with
 * caller-supplied R as the final builtin argument. */
#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(I), (int)(R)))
6424 
/* Unmasked 512-bit double scalef with caller-supplied R: all-ones mask, and
 * the pass-through operand comes from _mm512_undefined_pd(). */
#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))
6430 
/* Masked 512-bit double scalef with caller-supplied R: W is the pass-through
 * operand and U the 8-bit mask. */
#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(W), \
      (__mmask8)(U), (int)(R)))
6436 
/* Zero-pass-through 512-bit double scalef under mask U with caller-supplied
 * R as the final builtin argument. */
#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
6442 
6443 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6444 _mm512_scalef_pd (__m512d __A, __m512d __B)
6445 {
6446   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6447                 (__v8df) __B,
6448                 (__v8df)
6449                 _mm512_undefined_pd (),
6450                 (__mmask8) -1,
6451                 _MM_FROUND_CUR_DIRECTION);
6452 }
6453 
6454 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6455 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
6456 {
6457   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6458                 (__v8df) __B,
6459                 (__v8df) __W,
6460                 (__mmask8) __U,
6461                 _MM_FROUND_CUR_DIRECTION);
6462 }
6463 
6464 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6465 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
6466 {
6467   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6468                 (__v8df) __B,
6469                 (__v8df)
6470                 _mm512_setzero_pd (),
6471                 (__mmask8) __U,
6472                 _MM_FROUND_CUR_DIRECTION);
6473 }
6474 
/* Unmasked 512-bit float scalef with caller-supplied R: all-ones 16-bit mask,
 * and the pass-through operand comes from _mm512_undefined_ps(). */
#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))
6480 
/* Masked 512-bit float scalef with caller-supplied R: W is the pass-through
 * operand and U the 16-bit mask. */
#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))
6486 
/* Zero-pass-through 512-bit float scalef under mask U with caller-supplied R
 * as the final builtin argument. */
#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
6492 
6493 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6494 _mm512_scalef_ps (__m512 __A, __m512 __B)
6495 {
6496   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6497                (__v16sf) __B,
6498                (__v16sf)
6499                _mm512_undefined_ps (),
6500                (__mmask16) -1,
6501                _MM_FROUND_CUR_DIRECTION);
6502 }
6503 
6504 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6505 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
6506 {
6507   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6508                (__v16sf) __B,
6509                (__v16sf) __W,
6510                (__mmask16) __U,
6511                _MM_FROUND_CUR_DIRECTION);
6512 }
6513 
6514 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6515 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
6516 {
6517   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6518                (__v16sf) __B,
6519                (__v16sf)
6520                _mm512_setzero_ps (),
6521                (__mmask16) __U,
6522                _MM_FROUND_CUR_DIRECTION);
6523 }
6524 
/* Unmasked scalar-double scalef with caller-supplied R: all-ones mask and a
 * zero vector as the pass-through operand. */
#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
6530 
6531 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6532 _mm_scalef_sd (__m128d __A, __m128d __B)
6533 {
6534   return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
6535               (__v2df)( __B), (__v2df) _mm_setzero_pd(),
6536               (__mmask8) -1,
6537               _MM_FROUND_CUR_DIRECTION);
6538 }
6539 
6540 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6541 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6542 {
6543  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6544                  (__v2df) __B,
6545                 (__v2df) __W,
6546                 (__mmask8) __U,
6547                 _MM_FROUND_CUR_DIRECTION);
6548 }
6549 
/* Masked scalar-double scalef with caller-supplied R: W is the pass-through
 * operand and U the mask. */
#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
6555 
6556 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6557 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
6558 {
6559  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6560                  (__v2df) __B,
6561                 (__v2df) _mm_setzero_pd (),
6562                 (__mmask8) __U,
6563                 _MM_FROUND_CUR_DIRECTION);
6564 }
6565 
/* Zero-pass-through scalar-double scalef under mask U with caller-supplied R
 * as the final builtin argument. */
#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
6571 
/* Unmasked scalar-float scalef with caller-supplied R: all-ones mask and a
 * zero vector as the pass-through operand. */
#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
6577 
6578 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6579 _mm_scalef_ss (__m128 __A, __m128 __B)
6580 {
6581   return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
6582              (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
6583              (__mmask8) -1,
6584              _MM_FROUND_CUR_DIRECTION);
6585 }
6586 
6587 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6588 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6589 {
6590  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6591                 (__v4sf) __B,
6592                 (__v4sf) __W,
6593                 (__mmask8) __U,
6594                 _MM_FROUND_CUR_DIRECTION);
6595 }
6596 
/* Masked scalar-float scalef with caller-supplied R: W is the pass-through
 * operand and U the mask. */
#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
6602 
6603 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6604 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
6605 {
6606  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6607                  (__v4sf) __B,
6608                 (__v4sf) _mm_setzero_ps (),
6609                 (__mmask8) __U,
6610                 _MM_FROUND_CUR_DIRECTION);
6611 }
6612 
/* Zero-pass-through scalar-float scalef under mask U with caller-supplied R
 * as the final builtin argument. */
#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
6619 
6620 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6621 _mm512_srai_epi32(__m512i __A, unsigned int __B)
6622 {
6623   return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
6624 }
6625 
6626 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6627 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
6628                        unsigned int __B)
6629 {
6630   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6631                                          (__v16si)_mm512_srai_epi32(__A, __B),
6632                                          (__v16si)__W);
6633 }
6634 
6635 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6636 _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
6637                         unsigned int __B) {
6638   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6639                                          (__v16si)_mm512_srai_epi32(__A, __B),
6640                                          (__v16si)_mm512_setzero_si512());
6641 }
6642 
6643 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6644 _mm512_srai_epi64(__m512i __A, unsigned int __B)
6645 {
6646   return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
6647 }
6648 
6649 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6650 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
6651 {
6652   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6653                                           (__v8di)_mm512_srai_epi64(__A, __B),
6654                                           (__v8di)__W);
6655 }
6656 
6657 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6658 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
6659 {
6660   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6661                                           (__v8di)_mm512_srai_epi64(__A, __B),
6662                                           (__v8di)_mm512_setzero_si512());
6663 }
6664 
/* Forwards A and B (as 16 x float) and the immediate I to
 * __builtin_ia32_shuf_f32x4. */
#define _mm512_shuffle_f32x4(A, B, I) \
  ((__m512)__builtin_ia32_shuf_f32x4( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(I)))
6668 
/* Masked _mm512_shuffle_f32x4: the shuffle result and W are passed to
 * __builtin_ia32_selectps_512 with the 16-bit mask U. */
#define _mm512_mask_shuffle_f32x4(W, U, A, B, I) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (I)), \
      (__v16sf)(__m512)(W)))
6673 
/* Zeroing _mm512_shuffle_f32x4: the shuffle result and a zero vector are
 * passed to __builtin_ia32_selectps_512 with the 16-bit mask U. */
#define _mm512_maskz_shuffle_f32x4(U, A, B, I) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (I)), \
      (__v16sf)_mm512_setzero_ps()))
6678 
/* Forwards A and B (as 8 x double) and the immediate I to
 * __builtin_ia32_shuf_f64x2. */
#define _mm512_shuffle_f64x2(A, B, I) \
  ((__m512d)__builtin_ia32_shuf_f64x2( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(I)))
6682 
/* Masked _mm512_shuffle_f64x2: the shuffle result and W are passed to
 * __builtin_ia32_selectpd_512 with the 8-bit mask U. */
#define _mm512_mask_shuffle_f64x2(W, U, A, B, I) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (I)), \
      (__v8df)(__m512d)(W)))
6687 
/* Zeroing _mm512_shuffle_f64x2: the shuffle result and a zero vector are
 * passed to __builtin_ia32_selectpd_512 with the 8-bit mask U. */
#define _mm512_maskz_shuffle_f64x2(U, A, B, I) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (I)), \
      (__v8df)_mm512_setzero_pd()))
6692 
/* Forwards A and B (as 16 x int) and the immediate I to
 * __builtin_ia32_shuf_i32x4. */
#define _mm512_shuffle_i32x4(A, B, I) \
  ((__m512i)__builtin_ia32_shuf_i32x4( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (int)(I)))
6696 
/* Masked _mm512_shuffle_i32x4: the shuffle result and W are passed to
 * __builtin_ia32_selectd_512 with the 16-bit mask U. */
#define _mm512_mask_shuffle_i32x4(W, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (I)), \
      (__v16si)(__m512i)(W)))
6701 
/* Zeroing _mm512_shuffle_i32x4: the shuffle result and a zero vector are
 * passed to __builtin_ia32_selectd_512 with the 16-bit mask U. */
#define _mm512_maskz_shuffle_i32x4(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (I)), \
      (__v16si)_mm512_setzero_si512()))
6706 
/* Forwards A and B (as 8 x long long) and the immediate I to
 * __builtin_ia32_shuf_i64x2. */
#define _mm512_shuffle_i64x2(A, B, I) \
  ((__m512i)__builtin_ia32_shuf_i64x2( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(I)))
6710 
/* Masked _mm512_shuffle_i64x2: the shuffle result and W are passed to
 * __builtin_ia32_selectq_512 with the 8-bit mask U. */
#define _mm512_mask_shuffle_i64x2(W, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (I)), \
      (__v8di)(__m512i)(W)))
6715 
/* Zeroing _mm512_shuffle_i64x2: the shuffle result and a zero vector are
 * passed to __builtin_ia32_selectq_512 with the 8-bit mask U. */
#define _mm512_maskz_shuffle_i64x2(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (I)), \
      (__v8di)_mm512_setzero_si512()))
6720 
/* Forwards A and B (as 8 x double) and the control M to
 * __builtin_ia32_shufpd512. */
#define _mm512_shuffle_pd(A, B, M) \
  ((__m512d)__builtin_ia32_shufpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(M)))
6724 
/* Masked _mm512_shuffle_pd: the shuffle result and W are passed to
 * __builtin_ia32_selectpd_512 with the 8-bit mask U. */
#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
      (__v8df)(__m512d)(W)))
6729 
/* Zeroing _mm512_shuffle_pd: the shuffle result and a zero vector are passed
 * to __builtin_ia32_selectpd_512 with the 8-bit mask U. */
#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
      (__v8df)_mm512_setzero_pd()))
6734 
/* Forwards A and B (as 16 x float) and the control M to
 * __builtin_ia32_shufps512. */
#define _mm512_shuffle_ps(A, B, M) \
  ((__m512)__builtin_ia32_shufps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(M)))
6738 
/* Masked _mm512_shuffle_ps: the shuffle result and W are passed to
 * __builtin_ia32_selectps_512 with the 16-bit mask U. */
#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
      (__v16sf)(__m512)(W)))
6743 
/* Zeroing _mm512_shuffle_ps: the shuffle result and a zero vector are passed
 * to __builtin_ia32_selectps_512 with the 16-bit mask U. */
#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
      (__v16sf)_mm512_setzero_ps()))
6748 
/* Unmasked scalar-double sqrt with caller-supplied R: all-ones mask and a
 * zero vector as the pass-through operand. */
#define _mm_sqrt_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
6754 
6755 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6756 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6757 {
6758  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6759                  (__v2df) __B,
6760                 (__v2df) __W,
6761                 (__mmask8) __U,
6762                 _MM_FROUND_CUR_DIRECTION);
6763 }
6764 
/* Masked scalar-double sqrt with caller-supplied R: W is the pass-through
 * operand and U the mask. */
#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
6770 
6771 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6772 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
6773 {
6774  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6775                  (__v2df) __B,
6776                 (__v2df) _mm_setzero_pd (),
6777                 (__mmask8) __U,
6778                 _MM_FROUND_CUR_DIRECTION);
6779 }
6780 
/* Zero-pass-through scalar-double sqrt under mask U with caller-supplied R
 * as the final builtin argument. */
#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
6786 
/* Unmasked scalar-float sqrt with caller-supplied R: all-ones mask and a
 * zero vector as the pass-through operand. */
#define _mm_sqrt_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
6792 
6793 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6794 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6795 {
6796  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6797                  (__v4sf) __B,
6798                 (__v4sf) __W,
6799                 (__mmask8) __U,
6800                 _MM_FROUND_CUR_DIRECTION);
6801 }
6802 
/* Masked scalar-float sqrt with caller-supplied R: W is the pass-through
 * operand and U the mask. */
#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
6808 
6809 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6810 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
6811 {
6812  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6813                  (__v4sf) __B,
6814                 (__v4sf) _mm_setzero_ps (),
6815                 (__mmask8) __U,
6816                 _MM_FROUND_CUR_DIRECTION);
6817 }
6818 
/* Zero-pass-through scalar-float sqrt under mask U with caller-supplied R as
 * the final builtin argument. */
#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
6824 
6825 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6826 _mm512_broadcast_f32x4(__m128 __A)
6827 {
6828   return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
6829                                          0, 1, 2, 3, 0, 1, 2, 3,
6830                                          0, 1, 2, 3, 0, 1, 2, 3);
6831 }
6832 
6833 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6834 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
6835 {
6836   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6837                                            (__v16sf)_mm512_broadcast_f32x4(__A),
6838                                            (__v16sf)__O);
6839 }
6840 
6841 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6842 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
6843 {
6844   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6845                                            (__v16sf)_mm512_broadcast_f32x4(__A),
6846                                            (__v16sf)_mm512_setzero_ps());
6847 }
6848 
6849 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6850 _mm512_broadcast_f64x4(__m256d __A)
6851 {
6852   return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
6853                                           0, 1, 2, 3, 0, 1, 2, 3);
6854 }
6855 
6856 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6857 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
6858 {
6859   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6860                                             (__v8df)_mm512_broadcast_f64x4(__A),
6861                                             (__v8df)__O);
6862 }
6863 
6864 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6865 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
6866 {
6867   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6868                                             (__v8df)_mm512_broadcast_f64x4(__A),
6869                                             (__v8df)_mm512_setzero_pd());
6870 }
6871 
6872 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6873 _mm512_broadcast_i32x4(__m128i __A)
6874 {
6875   return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
6876                                           0, 1, 2, 3, 0, 1, 2, 3,
6877                                           0, 1, 2, 3, 0, 1, 2, 3);
6878 }
6879 
6880 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6881 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
6882 {
6883   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6884                                            (__v16si)_mm512_broadcast_i32x4(__A),
6885                                            (__v16si)__O);
6886 }
6887 
6888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6889 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
6890 {
6891   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6892                                            (__v16si)_mm512_broadcast_i32x4(__A),
6893                                            (__v16si)_mm512_setzero_si512());
6894 }
6895 
6896 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6897 _mm512_broadcast_i64x4(__m256i __A)
6898 {
6899   return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
6900                                           0, 1, 2, 3, 0, 1, 2, 3);
6901 }
6902 
6903 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6904 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
6905 {
6906   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6907                                             (__v8di)_mm512_broadcast_i64x4(__A),
6908                                             (__v8di)__O);
6909 }
6910 
6911 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6912 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
6913 {
6914   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6915                                             (__v8di)_mm512_broadcast_i64x4(__A),
6916                                             (__v8di)_mm512_setzero_si512());
6917 }
6918 
6919 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6920 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
6921 {
6922   return (__m512d)__builtin_ia32_selectpd_512(__M,
6923                                               (__v8df) _mm512_broadcastsd_pd(__A),
6924                                               (__v8df) __O);
6925 }
6926 
6927 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6928 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
6929 {
6930   return (__m512d)__builtin_ia32_selectpd_512(__M,
6931                                               (__v8df) _mm512_broadcastsd_pd(__A),
6932                                               (__v8df) _mm512_setzero_pd());
6933 }
6934 
6935 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6936 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
6937 {
6938   return (__m512)__builtin_ia32_selectps_512(__M,
6939                                              (__v16sf) _mm512_broadcastss_ps(__A),
6940                                              (__v16sf) __O);
6941 }
6942 
6943 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6944 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
6945 {
6946   return (__m512)__builtin_ia32_selectps_512(__M,
6947                                              (__v16sf) _mm512_broadcastss_ps(__A),
6948                                              (__v16sf) _mm512_setzero_ps());
6949 }
6950 
6951 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6952 _mm512_cvtsepi32_epi8 (__m512i __A)
6953 {
6954   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6955                (__v16qi) _mm_undefined_si128 (),
6956                (__mmask16) -1);
6957 }
6958 
6959 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6960 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
6961 {
6962   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6963                (__v16qi) __O, __M);
6964 }
6965 
6966 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6967 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
6968 {
6969   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6970                (__v16qi) _mm_setzero_si128 (),
6971                __M);
6972 }
6973 
6974 static __inline__ void __DEFAULT_FN_ATTRS512
6975 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
6976 {
6977   __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
6978 }
6979 
6980 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6981 _mm512_cvtsepi32_epi16 (__m512i __A)
6982 {
6983   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6984                (__v16hi) _mm256_undefined_si256 (),
6985                (__mmask16) -1);
6986 }
6987 
6988 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6989 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
6990 {
6991   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6992                (__v16hi) __O, __M);
6993 }
6994 
6995 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6996 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
6997 {
6998   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6999                (__v16hi) _mm256_setzero_si256 (),
7000                __M);
7001 }
7002 
7003 static __inline__ void __DEFAULT_FN_ATTRS512
7004 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
7005 {
7006   __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
7007 }
7008 
7009 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7010 _mm512_cvtsepi64_epi8 (__m512i __A)
7011 {
7012   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7013                (__v16qi) _mm_undefined_si128 (),
7014                (__mmask8) -1);
7015 }
7016 
7017 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7018 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7019 {
7020   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7021                (__v16qi) __O, __M);
7022 }
7023 
7024 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7025 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
7026 {
7027   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7028                (__v16qi) _mm_setzero_si128 (),
7029                __M);
7030 }
7031 
7032 static __inline__ void __DEFAULT_FN_ATTRS512
7033 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7034 {
7035   __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7036 }
7037 
7038 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7039 _mm512_cvtsepi64_epi32 (__m512i __A)
7040 {
7041   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7042                (__v8si) _mm256_undefined_si256 (),
7043                (__mmask8) -1);
7044 }
7045 
7046 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7047 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7048 {
7049   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7050                (__v8si) __O, __M);
7051 }
7052 
7053 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7054 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
7055 {
7056   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7057                (__v8si) _mm256_setzero_si256 (),
7058                __M);
7059 }
7060 
7061 static __inline__ void __DEFAULT_FN_ATTRS512
7062 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
7063 {
7064   __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7065 }
7066 
7067 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7068 _mm512_cvtsepi64_epi16 (__m512i __A)
7069 {
7070   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7071                (__v8hi) _mm_undefined_si128 (),
7072                (__mmask8) -1);
7073 }
7074 
7075 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7076 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7077 {
7078   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7079                (__v8hi) __O, __M);
7080 }
7081 
7082 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7083 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
7084 {
7085   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7086                (__v8hi) _mm_setzero_si128 (),
7087                __M);
7088 }
7089 
7090 static __inline__ void __DEFAULT_FN_ATTRS512
7091 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
7092 {
7093   __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7094 }
7095 
7096 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7097 _mm512_cvtusepi32_epi8 (__m512i __A)
7098 {
7099   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7100                 (__v16qi) _mm_undefined_si128 (),
7101                 (__mmask16) -1);
7102 }
7103 
7104 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7105 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7106 {
7107   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7108                 (__v16qi) __O,
7109                 __M);
7110 }
7111 
7112 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7113 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
7114 {
7115   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7116                 (__v16qi) _mm_setzero_si128 (),
7117                 __M);
7118 }
7119 
7120 static __inline__ void __DEFAULT_FN_ATTRS512
7121 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7122 {
7123   __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7124 }
7125 
7126 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7127 _mm512_cvtusepi32_epi16 (__m512i __A)
7128 {
7129   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7130                 (__v16hi) _mm256_undefined_si256 (),
7131                 (__mmask16) -1);
7132 }
7133 
7134 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7135 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7136 {
7137   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7138                 (__v16hi) __O,
7139                 __M);
7140 }
7141 
7142 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7143 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
7144 {
7145   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7146                 (__v16hi) _mm256_setzero_si256 (),
7147                 __M);
7148 }
7149 
7150 static __inline__ void __DEFAULT_FN_ATTRS512
7151 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
7152 {
7153   __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
7154 }
7155 
7156 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7157 _mm512_cvtusepi64_epi8 (__m512i __A)
7158 {
7159   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7160                 (__v16qi) _mm_undefined_si128 (),
7161                 (__mmask8) -1);
7162 }
7163 
7164 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7165 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7166 {
7167   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7168                 (__v16qi) __O,
7169                 __M);
7170 }
7171 
7172 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7173 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
7174 {
7175   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7176                 (__v16qi) _mm_setzero_si128 (),
7177                 __M);
7178 }
7179 
7180 static __inline__ void __DEFAULT_FN_ATTRS512
7181 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7182 {
7183   __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7184 }
7185 
7186 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7187 _mm512_cvtusepi64_epi32 (__m512i __A)
7188 {
7189   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7190                 (__v8si) _mm256_undefined_si256 (),
7191                 (__mmask8) -1);
7192 }
7193 
7194 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7195 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7196 {
7197   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7198                 (__v8si) __O, __M);
7199 }
7200 
7201 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7202 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
7203 {
7204   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7205                 (__v8si) _mm256_setzero_si256 (),
7206                 __M);
7207 }
7208 
7209 static __inline__ void __DEFAULT_FN_ATTRS512
7210 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7211 {
7212   __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
7213 }
7214 
7215 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7216 _mm512_cvtusepi64_epi16 (__m512i __A)
7217 {
7218   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7219                 (__v8hi) _mm_undefined_si128 (),
7220                 (__mmask8) -1);
7221 }
7222 
7223 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7224 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7225 {
7226   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7227                 (__v8hi) __O, __M);
7228 }
7229 
7230 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7231 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
7232 {
7233   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7234                 (__v8hi) _mm_setzero_si128 (),
7235                 __M);
7236 }
7237 
7238 static __inline__ void __DEFAULT_FN_ATTRS512
7239 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7240 {
7241   __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
7242 }
7243 
7244 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7245 _mm512_cvtepi32_epi8 (__m512i __A)
7246 {
7247   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7248               (__v16qi) _mm_undefined_si128 (),
7249               (__mmask16) -1);
7250 }
7251 
7252 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7253 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7254 {
7255   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7256               (__v16qi) __O, __M);
7257 }
7258 
7259 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7260 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
7261 {
7262   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7263               (__v16qi) _mm_setzero_si128 (),
7264               __M);
7265 }
7266 
7267 static __inline__ void __DEFAULT_FN_ATTRS512
7268 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7269 {
7270   __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7271 }
7272 
7273 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7274 _mm512_cvtepi32_epi16 (__m512i __A)
7275 {
7276   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7277               (__v16hi) _mm256_undefined_si256 (),
7278               (__mmask16) -1);
7279 }
7280 
7281 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7282 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7283 {
7284   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7285               (__v16hi) __O, __M);
7286 }
7287 
7288 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7289 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
7290 {
7291   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7292               (__v16hi) _mm256_setzero_si256 (),
7293               __M);
7294 }
7295 
7296 static __inline__ void __DEFAULT_FN_ATTRS512
7297 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
7298 {
7299   __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
7300 }
7301 
7302 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7303 _mm512_cvtepi64_epi8 (__m512i __A)
7304 {
7305   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7306               (__v16qi) _mm_undefined_si128 (),
7307               (__mmask8) -1);
7308 }
7309 
7310 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7311 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7312 {
7313   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7314               (__v16qi) __O, __M);
7315 }
7316 
7317 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7318 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
7319 {
7320   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7321               (__v16qi) _mm_setzero_si128 (),
7322               __M);
7323 }
7324 
7325 static __inline__ void __DEFAULT_FN_ATTRS512
7326 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7327 {
7328   __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7329 }
7330 
7331 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7332 _mm512_cvtepi64_epi32 (__m512i __A)
7333 {
7334   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7335               (__v8si) _mm256_undefined_si256 (),
7336               (__mmask8) -1);
7337 }
7338 
7339 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7340 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7341 {
7342   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7343               (__v8si) __O, __M);
7344 }
7345 
7346 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7347 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
7348 {
7349   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7350               (__v8si) _mm256_setzero_si256 (),
7351               __M);
7352 }
7353 
7354 static __inline__ void __DEFAULT_FN_ATTRS512
7355 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7356 {
7357   __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7358 }
7359 
7360 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7361 _mm512_cvtepi64_epi16 (__m512i __A)
7362 {
7363   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7364               (__v8hi) _mm_undefined_si128 (),
7365               (__mmask8) -1);
7366 }
7367 
7368 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7369 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7370 {
7371   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7372               (__v8hi) __O, __M);
7373 }
7374 
7375 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7376 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
7377 {
7378   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7379               (__v8hi) _mm_setzero_si128 (),
7380               __M);
7381 }
7382 
7383 static __inline__ void __DEFAULT_FN_ATTRS512
7384 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7385 {
7386   __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7387 }
7388 
/* Extract the 128-bit group of four 32-bit integers chosen by imm from A.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), \
      (int)(imm), \
      (__v4si)_mm_undefined_si128(), \
      (__mmask8)-1))
7393 
/* Merge-masked 32x4 extract: W is the pass-through operand and U the
   write mask handed to the builtin. */
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), \
      (int)(imm), \
      (__v4si)(__m128i)(W), \
      (__mmask8)(U)))
7398 
/* Zero-masked 32x4 extract: a zero vector is the pass-through operand and
   U the write mask handed to the builtin. */
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), \
      (int)(imm), \
      (__v4si)_mm_setzero_si128(), \
      (__mmask8)(U)))
7403 
/* Extract the 256-bit group of four 64-bit integers chosen by imm from A.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), \
      (int)(imm), \
      (__v4di)_mm256_undefined_si256(), \
      (__mmask8)-1))
7408 
/* Merge-masked 64x4 extract: W is the pass-through operand and U the
   write mask handed to the builtin. */
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), \
      (int)(imm), \
      (__v4di)(__m256i)(W), \
      (__mmask8)(U)))
7413 
/* Zero-masked 64x4 extract: a zero vector is the pass-through operand and
   U the write mask handed to the builtin. */
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), \
      (int)(imm), \
      (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(U)))
7418 
/* Insert the four doubles of B into the 256-bit position of A chosen by imm. */
#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4( \
      (__v8df)(__m512d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(imm)))
7422 
/* Merge-masked insertf64x4: the unmasked insert result and W are combined
   by the per-lane select builtin under mask U. */
#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
      (__v8df)(__m512d)(W)))
7427 
/* Zero-masked insertf64x4: the unmasked insert result and a zero vector are
   combined by the per-lane select builtin under mask U. */
#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
      (__v8df)_mm512_setzero_pd()))
7432 
/* Insert the four 64-bit integers of B into the 256-bit position of A
   chosen by imm. */
#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4( \
      (__v8di)(__m512i)(A), \
      (__v4di)(__m256i)(B), \
      (int)(imm)))
7436 
/* Merge-masked inserti64x4: the unmasked insert result and W are combined
   by the per-lane select builtin under mask U. */
#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
      (__v8di)(__m512i)(W)))
7441 
/* Zero-masked inserti64x4: the unmasked insert result and a zero vector are
   combined by the per-lane select builtin under mask U. */
#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
      (__v8di)_mm512_setzero_si512()))
7446 
/* Insert the four floats of B into the 128-bit position of A chosen by imm. */
#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4( \
      (__v16sf)(__m512)(A), \
      (__v4sf)(__m128)(B), \
      (int)(imm)))
7450 
/* Merge-masked insertf32x4: the unmasked insert result and W are combined
   by the per-lane select builtin under mask U. */
#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
      (__v16sf)(__m512)(W)))
7455 
/* Zero-masked insertf32x4: the unmasked insert result and a zero vector are
   combined by the per-lane select builtin under mask U. */
#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
      (__v16sf)_mm512_setzero_ps()))
7460 
/* Insert the four 32-bit integers of B into the 128-bit position of A
   chosen by imm. */
#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4( \
      (__v16si)(__m512i)(A), \
      (__v4si)(__m128i)(B), \
      (int)(imm)))
7464 
/* Merge-masked inserti32x4: the unmasked insert result and W are combined
   by the per-lane select builtin under mask U. */
#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
      (__v16si)(__m512i)(W)))
7469 
/* Zero-masked inserti32x4: the unmasked insert result and a zero vector are
   combined by the per-lane select builtin under mask U. */
#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
      (__v16si)_mm512_setzero_si512()))
7474 
/* getmant on eight doubles with explicit rounding/SAE control R.
   B and C are packed into one immediate as (C << 2) | B.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))
7480 
/* Merge-masked getmant on doubles with explicit rounding/SAE control R:
   W is the pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)(__m512d)(W), \
      (__mmask8)(U), \
      (int)(R)))
7486 
/* Zero-masked getmant on doubles with explicit rounding/SAE control R:
   a zero vector is the pass-through operand, U the write mask, and the
   immediate is (C << 2) | B. */
#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
7492 
/* getmant on eight doubles using the current rounding direction.
   B and C are packed into one immediate as (C << 2) | B.
   Unmasked: the mask is all ones, so the pass-through operand is never
   selected and is left undefined, matching _mm512_getmant_ps and
   _mm512_getmant_round_pd. */
#define _mm512_getmant_pd(A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
7499 
/* Merge-masked getmant on doubles, current rounding direction: W is the
   pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)(__m512d)(W), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
7506 
/* Zero-masked getmant on doubles, current rounding direction: a zero vector
   is the pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), \
      (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
7513 
/* getmant on sixteen floats with explicit rounding/SAE control R.
   B and C are packed into one immediate as (C << 2) | B.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2) | (B)), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, \
      (int)(R)))
7519 
/* Merge-masked getmant on floats with explicit rounding/SAE control R:
   W is the pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2) | (B)), \
      (__v16sf)(__m512)(W), \
      (__mmask16)(U), \
      (int)(R)))
7525 
/* Zero-masked getmant on floats with explicit rounding/SAE control R:
   a zero vector is the pass-through operand, U the write mask, and the
   immediate is (C << 2) | B. */
#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2) | (B)), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
7531 
/* getmant on sixteen floats using the current rounding direction.
   B and C are packed into one immediate as (C << 2) | B.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2)|(B)), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))
7538 
/* Merge-masked getmant on floats, current rounding direction: W is the
   pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2)|(B)), \
      (__v16sf)(__m512)(W), \
      (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))
7545 
/* Zero-masked getmant on floats, current rounding direction: a zero vector
   is the pass-through operand, U the write mask, and the immediate is
   (C << 2) | B. */
#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), \
      (int)(((C)<<2)|(B)), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))
7552 
/* getexp on eight doubles with explicit rounding/SAE control R.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))
7557 
/* Merge-masked getexp on doubles with explicit rounding/SAE control R:
   W is the pass-through operand and U the write mask. */
#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(W), \
      (__mmask8)(U), \
      (int)(R)))
7562 
/* Zero-masked getexp on doubles with explicit rounding/SAE control R:
   a zero vector is the pass-through operand and U the write mask. */
#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
7567 
7568 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7569 _mm512_getexp_pd (__m512d __A)
7570 {
7571   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7572                 (__v8df) _mm512_undefined_pd (),
7573                 (__mmask8) -1,
7574                 _MM_FROUND_CUR_DIRECTION);
7575 }
7576 
7577 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7578 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
7579 {
7580   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7581                 (__v8df) __W,
7582                 (__mmask8) __U,
7583                 _MM_FROUND_CUR_DIRECTION);
7584 }
7585 
7586 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7587 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
7588 {
7589   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7590                 (__v8df) _mm512_setzero_pd (),
7591                 (__mmask8) __U,
7592                 _MM_FROUND_CUR_DIRECTION);
7593 }
7594 
/* getexp on sixteen floats with explicit rounding/SAE control R.
   Unmasked: all-ones mask with an undefined pass-through. */
#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, \
      (int)(R)))
7599 
/* Merge-masked getexp on floats with explicit rounding/SAE control R:
   W is the pass-through operand and U the write mask. */
#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(W), \
      (__mmask16)(U), \
      (int)(R)))
7604 
/* Zero-masked getexp on floats with explicit rounding/SAE control R:
   a zero vector is the pass-through operand and U the write mask. */
#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
7609 
7610 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7611 _mm512_getexp_ps (__m512 __A)
7612 {
7613   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7614                (__v16sf) _mm512_undefined_ps (),
7615                (__mmask16) -1,
7616                _MM_FROUND_CUR_DIRECTION);
7617 }
7618 
7619 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7620 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
7621 {
7622   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7623                (__v16sf) __W,
7624                (__mmask16) __U,
7625                _MM_FROUND_CUR_DIRECTION);
7626 }
7627 
7628 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7629 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
7630 {
7631   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7632                (__v16sf) _mm512_setzero_ps (),
7633                (__mmask16) __U,
7634                _MM_FROUND_CUR_DIRECTION);
7635 }
7636 
/* Gather eight floats from addr using the eight 64-bit indices in index and
   the given scale. Unmasked: all-ones mask, undefined pass-through. */
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf( \
      (__v8sf)_mm256_undefined_ps(), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7642 
/* Masked gather of floats with 64-bit indices: v1_old is the pass-through
   operand and mask selects which lanes are gathered. */
#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf( \
      (__v8sf)(__m256)(v1_old), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7648 
/* Gather eight 32-bit integers from addr using the eight 64-bit indices in
   index and the given scale. Unmasked: all-ones mask, undefined pass-through. */
#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si( \
      (__v8si)_mm256_undefined_si256(), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7654 
/* Masked gather of 32-bit integers with 64-bit indices: v1_old is the
   pass-through operand and mask selects which lanes are gathered. */
#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si( \
      (__v8si)(__m256i)(v1_old), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7660 
/* Gather eight doubles from addr using the eight 64-bit indices in index and
   the given scale. Unmasked: all-ones mask, undefined pass-through. */
#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df( \
      (__v8df)_mm512_undefined_pd(), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7666 
/* Masked gather of doubles with 64-bit indices: v1_old is the pass-through
   operand and mask selects which lanes are gathered. */
#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df( \
      (__v8df)(__m512d)(v1_old), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7672 
/* Gather eight 64-bit integers from addr using the eight 64-bit indices in
   index and the given scale. Unmasked: all-ones mask, undefined pass-through. */
#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di( \
      (__v8di)_mm512_undefined_epi32(), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7678 
/* Masked gather of 64-bit integers with 64-bit indices: v1_old is the
   pass-through operand and mask selects which lanes are gathered. */
#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di( \
      (__v8di)(__m512i)(v1_old), \
      (void const *)(addr), \
      (__v8di)(__m512i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7684 
/* Gather sixteen floats from addr using the sixteen 32-bit indices in index
   and the given scale. Unmasked: all-ones mask, undefined pass-through.
   index is an integer vector, so it is converted through __m512i (not the
   float type __m512) before being viewed as __v16si, as in the other
   gather macros. */
#define _mm512_i32gather_ps(index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf( \
      (__v16sf)_mm512_undefined_ps(), \
      (void const *)(addr), \
      (__v16si)(__m512i)(index), \
      (__mmask16)-1, \
      (int)(scale)))
7690 
/* Masked gather of floats with 32-bit indices: v1_old is the pass-through
   operand and mask selects which lanes are gathered.
   index is an integer vector, so it is converted through __m512i (not the
   float type __m512) before being viewed as __v16si, as in the other
   gather macros. */
#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf( \
      (__v16sf)(__m512)(v1_old), \
      (void const *)(addr), \
      (__v16si)(__m512i)(index), \
      (__mmask16)(mask), \
      (int)(scale)))
7696 
/* Gather sixteen 32-bit integers from addr using the sixteen 32-bit indices
   in index and the given scale. Unmasked: all-ones mask, undefined
   pass-through. */
#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si( \
      (__v16si)_mm512_undefined_epi32(), \
      (void const *)(addr), \
      (__v16si)(__m512i)(index), \
      (__mmask16)-1, \
      (int)(scale)))
7702 
/* Masked gather of 32-bit integers with 32-bit indices: v1_old is the
   pass-through operand and mask selects which lanes are gathered. */
#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si( \
      (__v16si)(__m512i)(v1_old), \
      (void const *)(addr), \
      (__v16si)(__m512i)(index), \
      (__mmask16)(mask), \
      (int)(scale)))
7708 
/* Gather eight doubles from addr using the eight 32-bit indices in index
   (a 256-bit vector) and the given scale. Unmasked: all-ones mask,
   undefined pass-through. */
#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df( \
      (__v8df)_mm512_undefined_pd(), \
      (void const *)(addr), \
      (__v8si)(__m256i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7714 
/* Masked gather of doubles with 32-bit indices: v1_old is the pass-through
   operand and mask selects which lanes are gathered. */
#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df( \
      (__v8df)(__m512d)(v1_old), \
      (void const *)(addr), \
      (__v8si)(__m256i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7720 
/* Gather eight 64-bit integers from addr using the eight 32-bit indices in
   index (a 256-bit vector) and the given scale. Unmasked: all-ones mask,
   undefined pass-through. */
#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di( \
      (__v8di)_mm512_undefined_epi32(), \
      (void const *)(addr), \
      (__v8si)(__m256i)(index), \
      (__mmask8)-1, \
      (int)(scale)))
7726 
/* Masked gather of 64-bit integers with 32-bit indices: v1_old is the
   pass-through operand and mask selects which lanes are gathered. */
#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di( \
      (__v8di)(__m512i)(v1_old), \
      (void const *)(addr), \
      (__v8si)(__m256i)(index), \
      (__mmask8)(mask), \
      (int)(scale)))
7732 
/* Scatter the eight floats of v1 to addr using the eight 64-bit indices in
   index and the given scale. Unmasked: all-ones mask. */
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf( \
      (void *)(addr), \
      (__mmask8)-1, \
      (__v8di)(__m512i)(index), \
      (__v8sf)(__m256)(v1), \
      (int)(scale))
7737 
/* Masked scatter of floats with 64-bit indices: mask selects which lanes of
   v1 are written. */
#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf( \
      (void *)(addr), \
      (__mmask8)(mask), \
      (__v8di)(__m512i)(index), \
      (__v8sf)(__m256)(v1), \
      (int)(scale))
7742 
/* Scatter the eight 32-bit integers of v1 to addr using the eight 64-bit
   indices in index and the given scale. Unmasked: all-ones mask. */
#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si( \
      (void *)(addr), \
      (__mmask8)-1, \
      (__v8di)(__m512i)(index), \
      (__v8si)(__m256i)(v1), \
      (int)(scale))
7747 
/* Masked scatter of 32-bit integers with 64-bit indices: mask selects which
   lanes of v1 are written. */
#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si( \
      (void *)(addr), \
      (__mmask8)(mask), \
      (__v8di)(__m512i)(index), \
      (__v8si)(__m256i)(v1), \
      (int)(scale))
7752 
/* Scatter the eight doubles of v1 to addr using the eight 64-bit indices in
   index and the given scale. Unmasked: all-ones mask. */
#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df( \
      (void *)(addr), \
      (__mmask8)-1, \
      (__v8di)(__m512i)(index), \
      (__v8df)(__m512d)(v1), \
      (int)(scale))
7757 
/* Masked scatter of doubles with 64-bit indices: mask selects which lanes
   of v1 are written. */
#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df( \
      (void *)(addr), \
      (__mmask8)(mask), \
      (__v8di)(__m512i)(index), \
      (__v8df)(__m512d)(v1), \
      (int)(scale))
7762 
/* Scatter the eight 64-bit integers of v1 to addr using the eight 64-bit
   indices in index and the given scale. Unmasked: all-ones mask. */
#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di( \
      (void *)(addr), \
      (__mmask8)-1, \
      (__v8di)(__m512i)(index), \
      (__v8di)(__m512i)(v1), \
      (int)(scale))
7767 
/* Masked scatter of 64-bit integers with 64-bit indices: mask selects which
   lanes of v1 are written. */
#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di( \
      (void *)(addr), \
      (__mmask8)(mask), \
      (__v8di)(__m512i)(index), \
      (__v8di)(__m512i)(v1), \
      (int)(scale))
7772 
/* Unmasked scatter through __builtin_ia32_scattersiv16sf: index is passed as
 * __v16si, v1 as __v16sf, and the mask operand is the all-ones __mmask16. */
#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf( \
      (void *)(addr), (__mmask16)-1, (__v16si)(__m512i)(index), \
      (__v16sf)(__m512)(v1), (int)(scale))
7777 
/* Masked scatter through __builtin_ia32_scattersiv16sf: index is passed as
 * __v16si, v1 as __v16sf, and the caller's mask is narrowed to __mmask16. */
#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf( \
      (void *)(addr), (__mmask16)(mask), (__v16si)(__m512i)(index), \
      (__v16sf)(__m512)(v1), (int)(scale))
7782 
/* Unmasked scatter through __builtin_ia32_scattersiv16si: both index and v1
 * are passed as __v16si, and the mask operand is the all-ones __mmask16. */
#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si( \
      (void *)(addr), (__mmask16)-1, (__v16si)(__m512i)(index), \
      (__v16si)(__m512i)(v1), (int)(scale))
7787 
/* Masked scatter through __builtin_ia32_scattersiv16si: both index and v1 are
 * passed as __v16si, and the caller's mask is narrowed to __mmask16. */
#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si( \
      (void *)(addr), (__mmask16)(mask), (__v16si)(__m512i)(index), \
      (__v16si)(__m512i)(v1), (int)(scale))
7792 
/* Unmasked scatter through __builtin_ia32_scattersiv8df: index is a 256-bit
 * vector passed as __v8si, v1 is passed as __v8df, and the mask operand is
 * the all-ones __mmask8. */
#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df( \
      (void *)(addr), (__mmask8)-1, (__v8si)(__m256i)(index), \
      (__v8df)(__m512d)(v1), (int)(scale))
7797 
/* Masked scatter through __builtin_ia32_scattersiv8df: index is a 256-bit
 * vector passed as __v8si, v1 is passed as __v8df, and the caller's mask is
 * narrowed to __mmask8. */
#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df( \
      (void *)(addr), (__mmask8)(mask), (__v8si)(__m256i)(index), \
      (__v8df)(__m512d)(v1), (int)(scale))
7802 
/* Unmasked scatter through __builtin_ia32_scattersiv8di: index is a 256-bit
 * vector passed as __v8si, v1 is passed as __v8di, and the mask operand is
 * the all-ones __mmask8. */
#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di( \
      (void *)(addr), (__mmask8)-1, (__v8si)(__m256i)(index), \
      (__v8di)(__m512i)(v1), (int)(scale))
7807 
/* Masked scatter through __builtin_ia32_scattersiv8di: index is a 256-bit
 * vector passed as __v8si, v1 is passed as __v8di, and the caller's mask is
 * narrowed to __mmask8. */
#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di( \
      (void *)(addr), (__mmask8)(mask), (__v8si)(__m256i)(index), \
      (__v8di)(__m512i)(v1), (int)(scale))
7812 
7813 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7814 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7815 {
7816   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7817                                        (__v4sf)__A,
7818                                        (__v4sf)__B,
7819                                        (__mmask8)__U,
7820                                        _MM_FROUND_CUR_DIRECTION);
7821 }
7822 
/* Unmasked scalar single-precision fmadd with rounding argument R: passes
 * A, B, C and an all-ones mask to __builtin_ia32_vfmaddss3_mask. */
#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7828 
/* "mask" form of scalar single-precision fmadd with rounding argument R:
 * passes W, A, B and mask U to __builtin_ia32_vfmaddss3_mask. */
#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7834 
7835 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7836 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7837 {
7838   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7839                                         (__v4sf)__B,
7840                                         (__v4sf)__C,
7841                                         (__mmask8)__U,
7842                                         _MM_FROUND_CUR_DIRECTION);
7843 }
7844 
/* "maskz" form of scalar single-precision fmadd with rounding argument R:
 * passes A, B, C and mask U to __builtin_ia32_vfmaddss3_maskz. */
#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7850 
7851 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7852 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7853 {
7854   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
7855                                         (__v4sf)__X,
7856                                         (__v4sf)__Y,
7857                                         (__mmask8)__U,
7858                                         _MM_FROUND_CUR_DIRECTION);
7859 }
7860 
/* "mask3" form of scalar single-precision fmadd with rounding argument R:
 * passes W, X, Y and mask U to __builtin_ia32_vfmaddss3_mask3. */
#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7866 
7867 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7868 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7869 {
7870   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7871                                        (__v4sf)__A,
7872                                        -(__v4sf)__B,
7873                                        (__mmask8)__U,
7874                                        _MM_FROUND_CUR_DIRECTION);
7875 }
7876 
/* Unmasked scalar single-precision fmsub with rounding argument R: the fmadd
 * builtin is called with C negated and an all-ones mask. */
#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7882 
/* "mask" form of scalar single-precision fmsub with rounding argument R: the
 * fmadd builtin is called with B negated and mask U. */
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7888 
7889 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7890 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7891 {
7892   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7893                                         (__v4sf)__B,
7894                                         -(__v4sf)__C,
7895                                         (__mmask8)__U,
7896                                         _MM_FROUND_CUR_DIRECTION);
7897 }
7898 
/* "maskz" form of scalar single-precision fmsub with rounding argument R:
 * the maskz fmadd builtin is called with C negated and mask U. */
#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7904 
7905 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7906 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7907 {
7908   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
7909                                         (__v4sf)__X,
7910                                         (__v4sf)__Y,
7911                                         (__mmask8)__U,
7912                                         _MM_FROUND_CUR_DIRECTION);
7913 }
7914 
/* "mask3" form of scalar single-precision fmsub with rounding argument R:
 * passes W, X, Y unchanged to __builtin_ia32_vfmsubss3_mask3 with mask U. */
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7920 
7921 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7922 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7923 {
7924   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7925                                        -(__v4sf)__A,
7926                                        (__v4sf)__B,
7927                                        (__mmask8)__U,
7928                                        _MM_FROUND_CUR_DIRECTION);
7929 }
7930 
/* Unmasked scalar single-precision fnmadd with rounding argument R: the
 * fmadd builtin is called with B negated and an all-ones mask. */
#define _mm_fnmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7936 
/* "mask" form of scalar single-precision fnmadd with rounding argument R:
 * the fmadd builtin is called with A negated and mask U. */
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7942 
7943 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7944 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7945 {
7946   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7947                                         -(__v4sf)__B,
7948                                         (__v4sf)__C,
7949                                         (__mmask8)__U,
7950                                         _MM_FROUND_CUR_DIRECTION);
7951 }
7952 
/* "maskz" form of scalar single-precision fnmadd with rounding argument R:
 * the maskz fmadd builtin is called with B negated and mask U. */
#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7958 
7959 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7960 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7961 {
7962   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
7963                                         -(__v4sf)__X,
7964                                         (__v4sf)__Y,
7965                                         (__mmask8)__U,
7966                                         _MM_FROUND_CUR_DIRECTION);
7967 }
7968 
/* "mask3" form of scalar single-precision fnmadd with rounding argument R:
 * the mask3 fmadd builtin is called with X negated and mask U. */
#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7974 
7975 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7976 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7977 {
7978   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7979                                        -(__v4sf)__A,
7980                                        -(__v4sf)__B,
7981                                        (__mmask8)__U,
7982                                        _MM_FROUND_CUR_DIRECTION);
7983 }
7984 
/* Unmasked scalar single-precision fnmsub with rounding argument R: the
 * fmadd builtin is called with B and C negated and an all-ones mask. */
#define _mm_fnmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7990 
/* "mask" form of scalar single-precision fnmsub with rounding argument R:
 * the fmadd builtin is called with A and B negated and mask U. */
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7996 
7997 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7998 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7999 {
8000   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
8001                                         -(__v4sf)__B,
8002                                         -(__v4sf)__C,
8003                                         (__mmask8)__U,
8004                                         _MM_FROUND_CUR_DIRECTION);
8005 }
8006 
/* "maskz" form of scalar single-precision fnmsub with rounding argument R:
 * the maskz fmadd builtin is called with B and C negated and mask U. */
#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
8012 
8013 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8014 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
8015 {
8016   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
8017                                         -(__v4sf)__X,
8018                                         (__v4sf)__Y,
8019                                         (__mmask8)__U,
8020                                         _MM_FROUND_CUR_DIRECTION);
8021 }
8022 
/* "mask3" form of scalar single-precision fnmsub with rounding argument R:
 * __builtin_ia32_vfmsubss3_mask3 is called with X negated and mask U. */
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
8028 
8029 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8030 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8031 {
8032   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8033                                        (__v2df)__A,
8034                                        (__v2df)__B,
8035                                        (__mmask8)__U,
8036                                        _MM_FROUND_CUR_DIRECTION);
8037 }
8038 
/* Unmasked scalar double-precision fmadd with rounding argument R: passes
 * A, B, C and an all-ones mask to __builtin_ia32_vfmaddsd3_mask. */
#define _mm_fmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8044 
/* "mask" form of scalar double-precision fmadd with rounding argument R:
 * passes W, A, B and mask U to __builtin_ia32_vfmaddsd3_mask. */
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8050 
8051 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8052 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8053 {
8054   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8055                                         (__v2df)__B,
8056                                         (__v2df)__C,
8057                                         (__mmask8)__U,
8058                                         _MM_FROUND_CUR_DIRECTION);
8059 }
8060 
/* "maskz" form of scalar double-precision fmadd with rounding argument R:
 * passes A, B, C and mask U to __builtin_ia32_vfmaddsd3_maskz. */
#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8066 
8067 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8068 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8069 {
8070   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
8071                                         (__v2df)__X,
8072                                         (__v2df)__Y,
8073                                         (__mmask8)__U,
8074                                         _MM_FROUND_CUR_DIRECTION);
8075 }
8076 
/* "mask3" form of scalar double-precision fmadd with rounding argument R:
 * passes W, X, Y and mask U to __builtin_ia32_vfmaddsd3_mask3. */
#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8082 
8083 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8084 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8085 {
8086   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8087                                        (__v2df)__A,
8088                                        -(__v2df)__B,
8089                                        (__mmask8)__U,
8090                                        _MM_FROUND_CUR_DIRECTION);
8091 }
8092 
/* Unmasked scalar double-precision fmsub with rounding argument R: the fmadd
 * builtin is called with C negated and an all-ones mask. */
#define _mm_fmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8098 
/* "mask" form of scalar double-precision fmsub with rounding argument R: the
 * fmadd builtin is called with B negated and mask U. */
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8104 
8105 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8106 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8107 {
8108   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8109                                         (__v2df)__B,
8110                                         -(__v2df)__C,
8111                                         (__mmask8)__U,
8112                                         _MM_FROUND_CUR_DIRECTION);
8113 }
8114 
/* "maskz" form of scalar double-precision fmsub with rounding argument R:
 * the maskz fmadd builtin is called with C negated and mask U. */
#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8120 
8121 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8122 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8123 {
8124   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
8125                                         (__v2df)__X,
8126                                         (__v2df)__Y,
8127                                         (__mmask8)__U,
8128                                         _MM_FROUND_CUR_DIRECTION);
8129 }
8130 
/* "mask3" form of scalar double-precision fmsub with rounding argument R:
 * passes W, X, Y unchanged to __builtin_ia32_vfmsubsd3_mask3 with mask U. */
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8136 
8137 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8138 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8139 {
8140   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8141                                        -(__v2df)__A,
8142                                        (__v2df)__B,
8143                                        (__mmask8)__U,
8144                                        _MM_FROUND_CUR_DIRECTION);
8145 }
8146 
/* Unmasked scalar double-precision fnmadd with rounding argument R: the
 * fmadd builtin is called with B negated and an all-ones mask. */
#define _mm_fnmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8152 
/* "mask" form of scalar double-precision fnmadd with rounding argument R:
 * the fmadd builtin is called with A negated and mask U. */
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8158 
8159 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8160 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8161 {
8162   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8163                                         -(__v2df)__B,
8164                                         (__v2df)__C,
8165                                         (__mmask8)__U,
8166                                         _MM_FROUND_CUR_DIRECTION);
8167 }
8168 
/* "maskz" form of scalar double-precision fnmadd with rounding argument R:
 * the maskz fmadd builtin is called with B negated and mask U. */
#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8174 
8175 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8176 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8177 {
8178   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
8179                                         -(__v2df)__X,
8180                                         (__v2df)__Y,
8181                                         (__mmask8)__U,
8182                                         _MM_FROUND_CUR_DIRECTION);
8183 }
8184 
/* "mask3" form of scalar double-precision fnmadd with rounding argument R:
 * the mask3 fmadd builtin is called with X negated and mask U. */
#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8190 
8191 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8192 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8193 {
8194   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8195                                        -(__v2df)__A,
8196                                        -(__v2df)__B,
8197                                        (__mmask8)__U,
8198                                        _MM_FROUND_CUR_DIRECTION);
8199 }
8200 
/* Unmasked scalar double-precision fnmsub with rounding argument R: the
 * fmadd builtin is called with B and C negated and an all-ones mask. */
#define _mm_fnmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8206 
/* "mask" form of scalar double-precision fnmsub with rounding argument R:
 * the fmadd builtin is called with A and B negated and mask U. */
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8212 
8213 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8214 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8215 {
8216   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8217                                         -(__v2df)__B,
8218                                         -(__v2df)__C,
8219                                         (__mmask8)__U,
8220                                         _MM_FROUND_CUR_DIRECTION);
8221 }
8222 
/* "maskz" form of scalar double-precision fnmsub with rounding argument R:
 * the maskz fmadd builtin is called with B and C negated and mask U. */
#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8229 
8230 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8231 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8232 {
8233   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
8234                                         -(__v2df)__X,
8235                                         (__v2df)__Y,
8236                                         (__mmask8)__U,
8237                                         _MM_FROUND_CUR_DIRECTION);
8238 }
8239 
/* "mask3" form of scalar double-precision fnmsub with rounding argument R:
 * __builtin_ia32_vfmsubsd3_mask3 is called with X negated and mask U. */
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8245 
/* Wraps __builtin_ia32_permdf512: X is passed as __v8df and the control C is
 * converted to int; the result is cast back to __m512d. */
#define _mm512_permutex_pd(X, C) \
  ((__m512d)__builtin_ia32_permdf512( \
      (__v8df)(__m512d)(X), \
      (int)(C)))
8248 
/* Masked variant of _mm512_permutex_pd: the permute result and W are combined
 * by __builtin_ia32_selectpd_512 under the 8-bit mask U. */
#define _mm512_mask_permutex_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_permutex_pd((X), (C)), \
      (__v8df)(__m512d)(W)))
8253 
/* Zero-masked variant of _mm512_permutex_pd: the permute result and an
 * all-zero vector are combined by __builtin_ia32_selectpd_512 under U. */
#define _mm512_maskz_permutex_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_permutex_pd((X), (C)), \
      (__v8df)_mm512_setzero_pd()))
8258 
/* Wraps __builtin_ia32_permdi512: X is passed as __v8di and the control C is
 * converted to int; the result is cast back to __m512i. */
#define _mm512_permutex_epi64(X, C) \
  ((__m512i)__builtin_ia32_permdi512( \
      (__v8di)(__m512i)(X), \
      (int)(C)))
8261 
/* Masked variant of _mm512_permutex_epi64: the permute result and W are
 * combined by __builtin_ia32_selectq_512 under the 8-bit mask U. */
#define _mm512_mask_permutex_epi64(W, U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_permutex_epi64((X), (C)), \
      (__v8di)(__m512i)(W)))
8266 
/* Zero-masked variant of _mm512_permutex_epi64: the permute result and an
 * all-zero vector are combined by __builtin_ia32_selectq_512 under U. */
#define _mm512_maskz_permutex_epi64(U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_permutex_epi64((X), (C)), \
      (__v8di)_mm512_setzero_si512()))
8271 
8272 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8273 _mm512_permutexvar_pd (__m512i __X, __m512d __Y)
8274 {
8275   return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
8276 }
8277 
8278 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8279 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
8280 {
8281   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8282                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
8283                                         (__v8df)__W);
8284 }
8285 
8286 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8287 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
8288 {
8289   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8290                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
8291                                         (__v8df)_mm512_setzero_pd());
8292 }
8293 
8294 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8295 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
8296 {
8297   return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
8298 }
8299 
8300 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8301 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
8302 {
8303   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
8304                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
8305                                      (__v8di)_mm512_setzero_si512());
8306 }
8307 
8308 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8309 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
8310              __m512i __Y)
8311 {
8312   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
8313                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
8314                                      (__v8di)__W);
8315 }
8316 
8317 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8318 _mm512_permutexvar_ps (__m512i __X, __m512 __Y)
8319 {
8320   return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
8321 }
8322 
8323 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8324 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
8325 {
8326   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8327                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
8328                                        (__v16sf)__W);
8329 }
8330 
8331 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8332 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
8333 {
8334   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8335                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
8336                                        (__v16sf)_mm512_setzero_ps());
8337 }
8338 
8339 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8340 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
8341 {
8342   return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
8343 }
8344 
/* Alternate spelling: _mm512_permutevar_epi32 expands to
 * _mm512_permutexvar_epi32, so both names share one implementation. */
#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
8346 
8347 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8348 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
8349 {
8350   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
8351                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
8352                                     (__v16si)_mm512_setzero_si512());
8353 }
8354 
8355 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8356 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
8357              __m512i __Y)
8358 {
8359   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
8360                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
8361                                     (__v16si)__W);
8362 }
8363 
/* Alternate spelling: _mm512_mask_permutevar_epi32 expands to
 * _mm512_mask_permutexvar_epi32, so both names share one implementation. */
#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
8365 
8366 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8367 _mm512_kand (__mmask16 __A, __mmask16 __B)
8368 {
8369   return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
8370 }
8371 
8372 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8373 _mm512_kandn (__mmask16 __A, __mmask16 __B)
8374 {
8375   return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
8376 }
8377 
8378 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8379 _mm512_kor (__mmask16 __A, __mmask16 __B)
8380 {
8381   return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
8382 }
8383 
8384 static __inline__ int __DEFAULT_FN_ATTRS
8385 _mm512_kortestc (__mmask16 __A, __mmask16 __B)
8386 {
8387   return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
8388 }
8389 
8390 static __inline__ int __DEFAULT_FN_ATTRS
8391 _mm512_kortestz (__mmask16 __A, __mmask16 __B)
8392 {
8393   return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
8394 }
8395 
8396 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8397 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
8398 {
8399   return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8400 }
8401 
8402 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8403 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
8404 {
8405   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8406 }
8407 
8408 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8409 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
8410   *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8411   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8412 }
8413 
8414 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8415 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
8416 {
8417   return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
8418 }
8419 
8420 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8421 _mm512_kxnor (__mmask16 __A, __mmask16 __B)
8422 {
8423   return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
8424 }
8425 
8426 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8427 _mm512_kxor (__mmask16 __A, __mmask16 __B)
8428 {
8429   return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
8430 }
8431 
/* _k*_mask16 spellings of the 16-bit mask operations. Each name expands to
 * the matching _mm512_k* function; _mm512_knot is not defined in this part of
 * the header and is expected to be provided elsewhere in it. */
#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor
8438 
/* 16-bit mask shifts by I, forwarded to the kshiftlihi / kshiftrihi
 * builtins.  A is cast to __mmask16 and I to unsigned int. */
#define _kshiftli_mask16(A, I)                       \
  ((__mmask16)__builtin_ia32_kshiftlihi(             \
      (__mmask16)(A),                                \
      (unsigned int)(I)))

#define _kshiftri_mask16(A, I)                       \
  ((__mmask16)__builtin_ia32_kshiftrihi(             \
      (__mmask16)(A),                                \
      (unsigned int)(I)))
8444 
8445 static __inline__ unsigned int __DEFAULT_FN_ATTRS
8446 _cvtmask16_u32(__mmask16 __A) {
8447   return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
8448 }
8449 
8450 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8451 _cvtu32_mask16(unsigned int __A) {
8452   return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
8453 }
8454 
8455 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8456 _load_mask16(__mmask16 *__A) {
8457   return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
8458 }
8459 
8460 static __inline__ void __DEFAULT_FN_ATTRS
8461 _store_mask16(__mmask16 *__A, __mmask16 __B) {
8462   *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
8463 }
8464 
8465 static __inline__ void __DEFAULT_FN_ATTRS512
8466 _mm512_stream_si512 (void * __P, __m512i __A)
8467 {
8468   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
8469   __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
8470 }
8471 
8472 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8473 _mm512_stream_load_si512 (void const *__P)
8474 {
8475   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
8476   return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
8477 }
8478 
8479 static __inline__ void __DEFAULT_FN_ATTRS512
8480 _mm512_stream_pd (void *__P, __m512d __A)
8481 {
8482   typedef __v8df __v8df_aligned __attribute__((aligned(64)));
8483   __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
8484 }
8485 
8486 static __inline__ void __DEFAULT_FN_ATTRS512
8487 _mm512_stream_ps (void *__P, __m512 __A)
8488 {
8489   typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
8490   __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
8491 }
8492 
8493 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8494 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
8495 {
8496   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
8497                   (__v8df) __W,
8498                   (__mmask8) __U);
8499 }
8500 
8501 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8502 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
8503 {
8504   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
8505                   (__v8df)
8506                   _mm512_setzero_pd (),
8507                   (__mmask8) __U);
8508 }
8509 
8510 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8511 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
8512 {
8513   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
8514                   (__v8di) __W,
8515                   (__mmask8) __U);
8516 }
8517 
8518 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8519 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
8520 {
8521   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
8522                   (__v8di)
8523                   _mm512_setzero_si512 (),
8524                   (__mmask8) __U);
8525 }
8526 
8527 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8528 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
8529 {
8530   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
8531                  (__v16sf) __W,
8532                  (__mmask16) __U);
8533 }
8534 
8535 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8536 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
8537 {
8538   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
8539                  (__v16sf)
8540                  _mm512_setzero_ps (),
8541                  (__mmask16) __U);
8542 }
8543 
8544 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8545 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
8546 {
8547   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
8548                   (__v16si) __W,
8549                   (__mmask16) __U);
8550 }
8551 
8552 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8553 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
8554 {
8555   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
8556                   (__v16si)
8557                   _mm512_setzero_si512 (),
8558                   (__mmask16) __U);
8559 }
8560 
/* Scalar-float compares producing an 8-bit mask via
 * __builtin_ia32_cmpss_mask.  P is the comparison predicate, R the rounding
 * argument, M the incoming mask (all-ones for the unmasked forms). */
#define _mm_cmp_round_ss_mask(X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X),            \
                                       (__v4sf)(__m128)(Y),            \
                                       (int)(P),                       \
                                       (__mmask8)-1,                   \
                                       (int)(R)))

#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R)                      \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X),            \
                                       (__v4sf)(__m128)(Y),            \
                                       (int)(P),                       \
                                       (__mmask8)(M),                  \
                                       (int)(R)))

/* The non-"round" forms are the "round" forms with
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm_cmp_ss_mask(X, Y, P) \
  _mm_cmp_round_ss_mask((X), (Y), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
  _mm_mask_cmp_round_ss_mask((M), (X), (Y), (P), _MM_FROUND_CUR_DIRECTION)
8582 
/* Scalar-double compares producing an 8-bit mask via
 * __builtin_ia32_cmpsd_mask.  P is the comparison predicate, R the rounding
 * argument, M the incoming mask (all-ones for the unmasked forms). */
#define _mm_cmp_round_sd_mask(X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X),           \
                                       (__v2df)(__m128d)(Y),           \
                                       (int)(P),                       \
                                       (__mmask8)-1,                   \
                                       (int)(R)))

#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R)                      \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X),           \
                                       (__v2df)(__m128d)(Y),           \
                                       (int)(P),                       \
                                       (__mmask8)(M),                  \
                                       (int)(R)))

/* The non-"round" forms are the "round" forms with
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm_cmp_sd_mask(X, Y, P) \
  _mm_cmp_round_sd_mask((X), (Y), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
  _mm_mask_cmp_round_sd_mask((M), (X), (Y), (P), _MM_FROUND_CUR_DIRECTION)
8604 
8605 /* Bit Test */
8606 
8607 static __inline __mmask16 __DEFAULT_FN_ATTRS512
8608 _mm512_test_epi32_mask (__m512i __A, __m512i __B)
8609 {
8610   return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
8611                                    _mm512_setzero_si512());
8612 }
8613 
8614 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8615 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
8616 {
8617   return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
8618                                         _mm512_setzero_si512());
8619 }
8620 
8621 static __inline __mmask8 __DEFAULT_FN_ATTRS512
8622 _mm512_test_epi64_mask (__m512i __A, __m512i __B)
8623 {
8624   return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
8625                                    _mm512_setzero_si512());
8626 }
8627 
8628 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8629 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
8630 {
8631   return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
8632                                         _mm512_setzero_si512());
8633 }
8634 
8635 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8636 _mm512_testn_epi32_mask (__m512i __A, __m512i __B)
8637 {
8638   return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
8639                                   _mm512_setzero_si512());
8640 }
8641 
8642 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8643 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
8644 {
8645   return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
8646                                        _mm512_setzero_si512());
8647 }
8648 
8649 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8650 _mm512_testn_epi64_mask (__m512i __A, __m512i __B)
8651 {
8652   return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
8653                                   _mm512_setzero_si512());
8654 }
8655 
8656 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8657 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
8658 {
8659   return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
8660                                        _mm512_setzero_si512());
8661 }
8662 
8663 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8664 _mm512_movehdup_ps (__m512 __A)
8665 {
8666   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
8667                          1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
8668 }
8669 
8670 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8671 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
8672 {
8673   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8674                                              (__v16sf)_mm512_movehdup_ps(__A),
8675                                              (__v16sf)__W);
8676 }
8677 
8678 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8679 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
8680 {
8681   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8682                                              (__v16sf)_mm512_movehdup_ps(__A),
8683                                              (__v16sf)_mm512_setzero_ps());
8684 }
8685 
8686 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8687 _mm512_moveldup_ps (__m512 __A)
8688 {
8689   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
8690                          0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
8691 }
8692 
8693 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8694 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
8695 {
8696   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8697                                              (__v16sf)_mm512_moveldup_ps(__A),
8698                                              (__v16sf)__W);
8699 }
8700 
8701 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8702 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
8703 {
8704   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8705                                              (__v16sf)_mm512_moveldup_ps(__A),
8706                                              (__v16sf)_mm512_setzero_ps());
8707 }
8708 
8709 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8710 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
8711 {
8712   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
8713 }
8714 
8715 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8716 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
8717 {
8718   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
8719                                      _mm_setzero_ps());
8720 }
8721 
8722 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8723 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8724 {
8725   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
8726 }
8727 
8728 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8729 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
8730 {
8731   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
8732                                      _mm_setzero_pd());
8733 }
8734 
8735 static __inline__ void __DEFAULT_FN_ATTRS128
8736 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
8737 {
8738   __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
8739 }
8740 
8741 static __inline__ void __DEFAULT_FN_ATTRS128
8742 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
8743 {
8744   __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
8745 }
8746 
8747 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8748 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
8749 {
8750   __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
8751                                                 (__v4sf)_mm_setzero_ps(),
8752                                                 0, 4, 4, 4);
8753 
8754   return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
8755 }
8756 
8757 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8758 _mm_maskz_load_ss (__mmask8 __U, const float* __A)
8759 {
8760   return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
8761                                                 (__v4sf) _mm_setzero_ps(),
8762                                                 __U & 1);
8763 }
8764 
8765 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8766 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
8767 {
8768   __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
8769                                                  (__v2df)_mm_setzero_pd(),
8770                                                  0, 2);
8771 
8772   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
8773 }
8774 
8775 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8776 _mm_maskz_load_sd (__mmask8 __U, const double* __A)
8777 {
8778   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
8779                                                   (__v2df) _mm_setzero_pd(),
8780                                                   __U & 1);
8781 }
8782 
/* 32-bit lane shuffle with control I via __builtin_ia32_pshufd512. */
#define _mm512_shuffle_epi32(A, I)                                     \
  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A),            \
                                     (int)(I)))

/* Masked form: blends the shuffle result with W under mask U. */
#define _mm512_mask_shuffle_epi32(W, U, A, I)                          \
  ((__m512i)__builtin_ia32_selectd_512(                                \
      (__mmask16)(U),                                                  \
      (__v16si)_mm512_shuffle_epi32((A), (I)),                         \
      (__v16si)(__m512i)(W)))

/* Zero-masked form: blends the shuffle result with an all-zero vector. */
#define _mm512_maskz_shuffle_epi32(U, A, I)                            \
  ((__m512i)__builtin_ia32_selectd_512(                                \
      (__mmask16)(U),                                                  \
      (__v16si)_mm512_shuffle_epi32((A), (I)),                         \
      (__v16si)_mm512_setzero_si512()))
8795 
8796 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8797 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
8798 {
8799   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8800                 (__v8df) __W,
8801                 (__mmask8) __U);
8802 }
8803 
8804 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8805 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
8806 {
8807   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8808                 (__v8df) _mm512_setzero_pd (),
8809                 (__mmask8) __U);
8810 }
8811 
8812 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8813 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
8814 {
8815   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8816                 (__v8di) __W,
8817                 (__mmask8) __U);
8818 }
8819 
8820 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8821 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
8822 {
8823   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8824                 (__v8di) _mm512_setzero_si512 (),
8825                 (__mmask8) __U);
8826 }
8827 
8828 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8829 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
8830 {
8831   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8832               (__v8df) __W,
8833               (__mmask8) __U);
8834 }
8835 
8836 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8837 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
8838 {
8839   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8840               (__v8df) _mm512_setzero_pd(),
8841               (__mmask8) __U);
8842 }
8843 
8844 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8845 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
8846 {
8847   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8848               (__v8di) __W,
8849               (__mmask8) __U);
8850 }
8851 
8852 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8853 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
8854 {
8855   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8856               (__v8di) _mm512_setzero_si512(),
8857               (__mmask8) __U);
8858 }
8859 
8860 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8861 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
8862 {
8863   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8864                    (__v16sf) __W,
8865                    (__mmask16) __U);
8866 }
8867 
8868 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8869 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
8870 {
8871   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8872                    (__v16sf) _mm512_setzero_ps(),
8873                    (__mmask16) __U);
8874 }
8875 
8876 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8877 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
8878 {
8879   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8880               (__v16si) __W,
8881               (__mmask16) __U);
8882 }
8883 
8884 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8885 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
8886 {
8887   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8888               (__v16si) _mm512_setzero_si512(),
8889               (__mmask16) __U);
8890 }
8891 
8892 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8893 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
8894 {
8895   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8896                (__v16sf) __W,
8897                (__mmask16) __U);
8898 }
8899 
8900 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8901 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
8902 {
8903   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8904                (__v16sf) _mm512_setzero_ps(),
8905                (__mmask16) __U);
8906 }
8907 
8908 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8909 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
8910 {
8911   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8912                 (__v16si) __W,
8913                 (__mmask16) __U);
8914 }
8915 
8916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8917 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
8918 {
8919   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8920                 (__v16si) _mm512_setzero_si512(),
8921                 (__mmask16) __U);
8922 }
8923 
/* Eight floats -> eight doubles via __builtin_ia32_cvtps2pd512_mask with
 * rounding argument R.  The three forms differ only in the pass-through
 * operand (undefined / W / zero) and the mask (all-ones / U / U). */
#define _mm512_cvt_roundps_pd(A, R)                                    \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask(                           \
      (__v8sf)(__m256)(A),                                             \
      (__v8df)_mm512_undefined_pd(),                                   \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_pd(W, U, A, R)                         \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask(                           \
      (__v8sf)(__m256)(A),                                             \
      (__v8df)(__m512d)(W),                                            \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_pd(U, A, R)                           \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask(                           \
      (__v8sf)(__m256)(A),                                             \
      (__v8df)_mm512_setzero_pd(),                                     \
      (__mmask8)(U), (int)(R)))
8938 
8939 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8940 _mm512_cvtps_pd (__m256 __A)
8941 {
8942   return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
8943 }
8944 
8945 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8946 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
8947 {
8948   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8949                                               (__v8df)_mm512_cvtps_pd(__A),
8950                                               (__v8df)__W);
8951 }
8952 
8953 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8954 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
8955 {
8956   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8957                                               (__v8df)_mm512_cvtps_pd(__A),
8958                                               (__v8df)_mm512_setzero_pd());
8959 }
8960 
8961 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8962 _mm512_cvtpslo_pd (__m512 __A)
8963 {
8964   return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
8965 }
8966 
8967 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8968 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
8969 {
8970   return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
8971 }
8972 
8973 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8974 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
8975 {
8976   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
8977               (__v8df) __A,
8978               (__v8df) __W);
8979 }
8980 
8981 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8982 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
8983 {
8984   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
8985               (__v8df) __A,
8986               (__v8df) _mm512_setzero_pd ());
8987 }
8988 
8989 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8990 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
8991 {
8992   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
8993              (__v16sf) __A,
8994              (__v16sf) __W);
8995 }
8996 
8997 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8998 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
8999 {
9000   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
9001              (__v16sf) __A,
9002              (__v16sf) _mm512_setzero_ps ());
9003 }
9004 
9005 static __inline__ void __DEFAULT_FN_ATTRS512
9006 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
9007 {
9008   __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
9009             (__mmask8) __U);
9010 }
9011 
9012 static __inline__ void __DEFAULT_FN_ATTRS512
9013 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
9014 {
9015   __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
9016             (__mmask8) __U);
9017 }
9018 
9019 static __inline__ void __DEFAULT_FN_ATTRS512
9020 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
9021 {
9022   __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
9023             (__mmask16) __U);
9024 }
9025 
9026 static __inline__ void __DEFAULT_FN_ATTRS512
9027 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
9028 {
9029   __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
9030             (__mmask16) __U);
9031 }
9032 
/* Scalar double -> float conversion via __builtin_ia32_cvtsd2ss_round_mask
 * with rounding argument R.  The three forms differ only in the
 * pass-through operand (undefined / W / zero) and the mask
 * (all-ones / U / U). */
#define _mm_cvt_roundsd_ss(A, B, R)                                    \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask(                         \
      (__v4sf)(__m128)(A),                                             \
      (__v2df)(__m128d)(B),                                            \
      (__v4sf)_mm_undefined_ps(),                                      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R)                         \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask(                         \
      (__v4sf)(__m128)(A),                                             \
      (__v2df)(__m128d)(B),                                            \
      (__v4sf)(__m128)(W),                                             \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_ss(U, A, B, R)                           \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask(                         \
      (__v4sf)(__m128)(A),                                             \
      (__v2df)(__m128d)(B),                                            \
      (__v4sf)_mm_setzero_ps(),                                        \
      (__mmask8)(U), (int)(R)))
9050 
9051 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9052 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
9053 {
9054   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
9055                                              (__v2df)__B,
9056                                              (__v4sf)__W,
9057                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9058 }
9059 
9060 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9061 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
9062 {
9063   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
9064                                              (__v2df)__B,
9065                                              (__v4sf)_mm_setzero_ps(),
9066                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9067 }
9068 
/* "i32"/"i64" spellings; each is a plain alias for the matching
 * "si32"/"si64" name.  The 64-bit aliases exist only on x86-64. */
#define _mm_cvtss_i32  _mm_cvtss_si32
#define _mm_cvtsd_i32  _mm_cvtsd_si32
#define _mm_cvti32_sd  _mm_cvtsi32_sd
#define _mm_cvti32_ss  _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64  _mm_cvtss_si64
#define _mm_cvtsd_i64  _mm_cvtsd_si64
#define _mm_cvti64_sd  _mm_cvtsi64_sd
#define _mm_cvti64_ss  _mm_cvtsi64_ss
#endif
9079 
#ifdef __x86_64__
/* 64-bit integer -> scalar double via __builtin_ia32_cvtsi2sd64 with
 * rounding argument R. */
#define _mm_cvt_roundi64_sd(A, B, R)                                   \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A),            \
                                      (long long)(B),                  \
                                      (int)(R)))

/* Second spelling of the same operation. */
#define _mm_cvt_roundsi64_sd(A, B, R) \
  _mm_cvt_roundi64_sd((A), (B), (R))
#endif
9089 
/* 32-bit integer -> scalar float via __builtin_ia32_cvtsi2ss32 with
 * rounding argument R. */
#define _mm_cvt_roundsi32_ss(A, B, R)                                  \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A),              \
                                     (int)(B),                         \
                                     (int)(R)))

/* Second spelling of the same operation. */
#define _mm_cvt_roundi32_ss(A, B, R) \
  _mm_cvt_roundsi32_ss((A), (B), (R))
9095 
#ifdef __x86_64__
/* 64-bit integer -> scalar float via __builtin_ia32_cvtsi2ss64 with
 * rounding argument R. */
#define _mm_cvt_roundsi64_ss(A, B, R)                                  \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A),              \
                                     (long long)(B),                   \
                                     (int)(R)))

/* Second spelling of the same operation. */
#define _mm_cvt_roundi64_ss(A, B, R) \
  _mm_cvt_roundsi64_ss((A), (B), (R))
#endif
9105 
/* Scalar float -> double conversion via __builtin_ia32_cvtss2sd_round_mask
 * with rounding argument R.  The three forms differ only in the
 * pass-through operand (undefined / W / zero) and the mask
 * (all-ones / U / U). */
#define _mm_cvt_roundss_sd(A, B, R)                                    \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask(                        \
      (__v2df)(__m128d)(A),                                            \
      (__v4sf)(__m128)(B),                                             \
      (__v2df)_mm_undefined_pd(),                                      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundss_sd(W, U, A, B, R)                         \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask(                        \
      (__v2df)(__m128d)(A),                                            \
      (__v4sf)(__m128)(B),                                             \
      (__v2df)(__m128d)(W),                                            \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sd(U, A, B, R)                           \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask(                        \
      (__v2df)(__m128d)(A),                                            \
      (__v4sf)(__m128)(B),                                             \
      (__v2df)_mm_setzero_pd(),                                        \
      (__mmask8)(U), (int)(R)))
9123 
9124 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9125 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
9126 {
9127   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
9128                                             (__v4sf)__B,
9129                                             (__v2df)__W,
9130                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9131 }
9132 
9133 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9134 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
9135 {
9136   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
9137                                             (__v4sf)__B,
9138                                             (__v2df)_mm_setzero_pd(),
9139                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9140 }
9141 
9142 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9143 _mm_cvtu32_sd (__m128d __A, unsigned __B)
9144 {
9145   __A[0] = __B;
9146   return __A;
9147 }
9148 
9149 #ifdef __x86_64__
9150 #define _mm_cvt_roundu64_sd(A, B, R) \
9151   ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
9152                                        (unsigned long long)(B), (int)(R)))
9153 
9154 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9155 _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
9156 {
9157   __A[0] = __B;
9158   return __A;
9159 }
9160 #endif
9161 
/* Unsigned 32-bit integer -> scalar float via
 * __builtin_ia32_cvtusi2ss32 with rounding argument R. */
#define _mm_cvt_roundu32_ss(A, B, R)                                   \
  ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A),             \
                                      (unsigned int)(B),               \
                                      (int)(R)))
9165 
9166 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9167 _mm_cvtu32_ss (__m128 __A, unsigned __B)
9168 {
9169   __A[0] = __B;
9170   return __A;
9171 }
9172 
9173 #ifdef __x86_64__
9174 #define _mm_cvt_roundu64_ss(A, B, R) \
9175   ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
9176                                       (unsigned long long)(B), (int)(R)))
9177 
9178 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9179 _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
9180 {
9181   __A[0] = __B;
9182   return __A;
9183 }
9184 #endif
9185 
9186 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9187 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
9188 {
9189   return (__m512i) __builtin_ia32_selectd_512(__M,
9190                                               (__v16si) _mm512_set1_epi32(__A),
9191                                               (__v16si) __O);
9192 }
9193 
9194 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9195 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
9196 {
9197   return (__m512i) __builtin_ia32_selectq_512(__M,
9198                                               (__v8di) _mm512_set1_epi64(__A),
9199                                               (__v8di) __O);
9200 }
9201 
9202 static  __inline __m512i __DEFAULT_FN_ATTRS512
9203 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
9204     char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
9205     char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
9206     char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
9207     char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
9208     char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
9209     char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
9210     char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
9211     char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
9212     char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
9213     char __e4, char __e3, char __e2, char __e1, char __e0) {
9214 
9215   return __extension__ (__m512i)(__v64qi)
9216     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
9217      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
9218      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
9219      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
9220      __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
9221      __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
9222      __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
9223      __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
9224 }
9225 
9226 static  __inline __m512i __DEFAULT_FN_ATTRS512
9227 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
9228     short __e27, short __e26, short __e25, short __e24, short __e23,
9229     short __e22, short __e21, short __e20, short __e19, short __e18,
9230     short __e17, short __e16, short __e15, short __e14, short __e13,
9231     short __e12, short __e11, short __e10, short __e9, short __e8,
9232     short __e7, short __e6, short __e5, short __e4, short __e3,
9233     short __e2, short __e1, short __e0) {
9234   return __extension__ (__m512i)(__v32hi)
9235     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
9236      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
9237      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
9238      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
9239 }
9240 
9241 static __inline __m512i __DEFAULT_FN_ATTRS512
9242 _mm512_set_epi32 (int __A, int __B, int __C, int __D,
9243      int __E, int __F, int __G, int __H,
9244      int __I, int __J, int __K, int __L,
9245      int __M, int __N, int __O, int __P)
9246 {
9247   return __extension__ (__m512i)(__v16si)
9248   { __P, __O, __N, __M, __L, __K, __J, __I,
9249     __H, __G, __F, __E, __D, __C, __B, __A };
9250 }
9251 
/* Reverse-order variant of _mm512_set_epi32: e0 becomes element 0 and e15
 * becomes the highest element. */
#define _mm512_setr_epi32(e0, e1, e2, e3, e4, e5, e6, e7,                      \
                          e8, e9, e10, e11, e12, e13, e14, e15)                \
  _mm512_set_epi32((e15), (e14), (e13), (e12), (e11), (e10), (e9), (e8),       \
                   (e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9256 
9257 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9258 _mm512_set_epi64 (long long __A, long long __B, long long __C,
9259      long long __D, long long __E, long long __F,
9260      long long __G, long long __H)
9261 {
9262   return __extension__ (__m512i) (__v8di)
9263   { __H, __G, __F, __E, __D, __C, __B, __A };
9264 }
9265 
/* Reverse-order variant of _mm512_set_epi64: e0 becomes element 0 and e7
 * becomes the highest element. */
#define _mm512_setr_epi64(e0, e1, e2, e3, e4, e5, e6, e7)                      \
  _mm512_set_epi64((e7), (e6), (e5), (e4),                                     \
                   (e3), (e2), (e1), (e0))
9268 
9269 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9270 _mm512_set_pd (double __A, double __B, double __C, double __D,
9271         double __E, double __F, double __G, double __H)
9272 {
9273   return __extension__ (__m512d)
9274   { __H, __G, __F, __E, __D, __C, __B, __A };
9275 }
9276 
/* Reverse-order variant of _mm512_set_pd: e0 becomes element 0 and e7 becomes
 * the highest element. */
#define _mm512_setr_pd(e0, e1, e2, e3, e4, e5, e6, e7)                         \
  _mm512_set_pd((e7), (e6), (e5), (e4),                                        \
                (e3), (e2), (e1), (e0))
9279 
9280 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9281 _mm512_set_ps (float __A, float __B, float __C, float __D,
9282         float __E, float __F, float __G, float __H,
9283         float __I, float __J, float __K, float __L,
9284         float __M, float __N, float __O, float __P)
9285 {
9286   return __extension__ (__m512)
9287   { __P, __O, __N, __M, __L, __K, __J, __I,
9288     __H, __G, __F, __E, __D, __C, __B, __A };
9289 }
9290 
/* Reverse-order variant of _mm512_set_ps: e0 becomes element 0 and e15
 * becomes the highest element. */
#define _mm512_setr_ps(e0, e1, e2, e3, e4, e5, e6, e7,                         \
                       e8, e9, e10, e11, e12, e13, e14, e15)                   \
  _mm512_set_ps((e15), (e14), (e13), (e12), (e11), (e10), (e9), (e8),          \
                (e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9294 
9295 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9296 _mm512_abs_ps(__m512 __A)
9297 {
9298   return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
9299 }
9300 
9301 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9302 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
9303 {
9304   return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
9305 }
9306 
9307 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9308 _mm512_abs_pd(__m512d __A)
9309 {
9310   return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
9311 }
9312 
9313 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9314 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
9315 {
9316   return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
9317 }
9318 
/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
 * outputs. This class of vector operation forms the basis of many scientific
 * computations. In vector-reduction arithmetic, the evaluation order is
 * independent of the order of the input elements of V.
 *
 * For floating-point intrinsics:
 * 1. When using fadd/fmul intrinsics, the order of operations within the
 *    vector is unspecified (associative math).
 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
 *    produce unspecified results.
 *
 * The reduction uses a bisection method: at each step, the vector from the
 * previous step is split in half and the operation is performed on its two
 * halves. This takes log2(n) steps, where n is the number of elements in the
 * vector.
 */
9334 
9335 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
9336   return __builtin_reduce_add((__v8di)__W);
9337 }
9338 
9339 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
9340   return __builtin_reduce_mul((__v8di)__W);
9341 }
9342 
9343 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
9344   return __builtin_reduce_and((__v8di)__W);
9345 }
9346 
9347 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
9348   return __builtin_reduce_or((__v8di)__W);
9349 }
9350 
9351 static __inline__ long long __DEFAULT_FN_ATTRS512
9352 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
9353   __W = _mm512_maskz_mov_epi64(__M, __W);
9354   return __builtin_reduce_add((__v8di)__W);
9355 }
9356 
9357 static __inline__ long long __DEFAULT_FN_ATTRS512
9358 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
9359   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
9360   return __builtin_reduce_mul((__v8di)__W);
9361 }
9362 
9363 static __inline__ long long __DEFAULT_FN_ATTRS512
9364 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
9365   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
9366   return __builtin_reduce_and((__v8di)__W);
9367 }
9368 
9369 static __inline__ long long __DEFAULT_FN_ATTRS512
9370 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
9371   __W = _mm512_maskz_mov_epi64(__M, __W);
9372   return __builtin_reduce_or((__v8di)__W);
9373 }
9374 
9375 // -0.0 is used to ignore the start value since it is the neutral value of
9376 // floating point addition. For more information, please refer to
9377 // https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
9378 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
9379   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
9380 }
9381 
9382 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
9383   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
9384 }
9385 
9386 static __inline__ double __DEFAULT_FN_ATTRS512
9387 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
9388   __W = _mm512_maskz_mov_pd(__M, __W);
9389   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
9390 }
9391 
9392 static __inline__ double __DEFAULT_FN_ATTRS512
9393 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
9394   __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
9395   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
9396 }
9397 
9398 static __inline__ int __DEFAULT_FN_ATTRS512
9399 _mm512_reduce_add_epi32(__m512i __W) {
9400   return __builtin_reduce_add((__v16si)__W);
9401 }
9402 
9403 static __inline__ int __DEFAULT_FN_ATTRS512
9404 _mm512_reduce_mul_epi32(__m512i __W) {
9405   return __builtin_reduce_mul((__v16si)__W);
9406 }
9407 
9408 static __inline__ int __DEFAULT_FN_ATTRS512
9409 _mm512_reduce_and_epi32(__m512i __W) {
9410   return __builtin_reduce_and((__v16si)__W);
9411 }
9412 
9413 static __inline__ int __DEFAULT_FN_ATTRS512
9414 _mm512_reduce_or_epi32(__m512i __W) {
9415   return __builtin_reduce_or((__v16si)__W);
9416 }
9417 
9418 static __inline__ int __DEFAULT_FN_ATTRS512
9419 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
9420   __W = _mm512_maskz_mov_epi32(__M, __W);
9421   return __builtin_reduce_add((__v16si)__W);
9422 }
9423 
9424 static __inline__ int __DEFAULT_FN_ATTRS512
9425 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
9426   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
9427   return __builtin_reduce_mul((__v16si)__W);
9428 }
9429 
9430 static __inline__ int __DEFAULT_FN_ATTRS512
9431 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
9432   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
9433   return __builtin_reduce_and((__v16si)__W);
9434 }
9435 
9436 static __inline__ int __DEFAULT_FN_ATTRS512
9437 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
9438   __W = _mm512_maskz_mov_epi32(__M, __W);
9439   return __builtin_reduce_or((__v16si)__W);
9440 }
9441 
9442 static __inline__ float __DEFAULT_FN_ATTRS512
9443 _mm512_reduce_add_ps(__m512 __W) {
9444   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9445 }
9446 
9447 static __inline__ float __DEFAULT_FN_ATTRS512
9448 _mm512_reduce_mul_ps(__m512 __W) {
9449   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9450 }
9451 
9452 static __inline__ float __DEFAULT_FN_ATTRS512
9453 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
9454   __W = _mm512_maskz_mov_ps(__M, __W);
9455   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9456 }
9457 
9458 static __inline__ float __DEFAULT_FN_ATTRS512
9459 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
9460   __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
9461   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9462 }
9463 
9464 static __inline__ long long __DEFAULT_FN_ATTRS512
9465 _mm512_reduce_max_epi64(__m512i __V) {
9466   return __builtin_reduce_max((__v8di)__V);
9467 }
9468 
9469 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9470 _mm512_reduce_max_epu64(__m512i __V) {
9471   return __builtin_reduce_max((__v8du)__V);
9472 }
9473 
9474 static __inline__ long long __DEFAULT_FN_ATTRS512
9475 _mm512_reduce_min_epi64(__m512i __V) {
9476   return __builtin_reduce_min((__v8di)__V);
9477 }
9478 
9479 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9480 _mm512_reduce_min_epu64(__m512i __V) {
9481   return __builtin_reduce_min((__v8du)__V);
9482 }
9483 
9484 static __inline__ long long __DEFAULT_FN_ATTRS512
9485 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
9486   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
9487   return __builtin_reduce_max((__v8di)__V);
9488 }
9489 
9490 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9491 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
9492   __V = _mm512_maskz_mov_epi64(__M, __V);
9493   return __builtin_reduce_max((__v8du)__V);
9494 }
9495 
9496 static __inline__ long long __DEFAULT_FN_ATTRS512
9497 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
9498   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
9499   return __builtin_reduce_min((__v8di)__V);
9500 }
9501 
9502 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9503 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
9504   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
9505   return __builtin_reduce_min((__v8du)__V);
9506 }
9507 static __inline__ int __DEFAULT_FN_ATTRS512
9508 _mm512_reduce_max_epi32(__m512i __V) {
9509   return __builtin_reduce_max((__v16si)__V);
9510 }
9511 
9512 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9513 _mm512_reduce_max_epu32(__m512i __V) {
9514   return __builtin_reduce_max((__v16su)__V);
9515 }
9516 
9517 static __inline__ int __DEFAULT_FN_ATTRS512
9518 _mm512_reduce_min_epi32(__m512i __V) {
9519   return __builtin_reduce_min((__v16si)__V);
9520 }
9521 
9522 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9523 _mm512_reduce_min_epu32(__m512i __V) {
9524   return __builtin_reduce_min((__v16su)__V);
9525 }
9526 
9527 static __inline__ int __DEFAULT_FN_ATTRS512
9528 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
9529   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
9530   return __builtin_reduce_max((__v16si)__V);
9531 }
9532 
9533 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9534 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
9535   __V = _mm512_maskz_mov_epi32(__M, __V);
9536   return __builtin_reduce_max((__v16su)__V);
9537 }
9538 
9539 static __inline__ int __DEFAULT_FN_ATTRS512
9540 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
9541   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
9542   return __builtin_reduce_min((__v16si)__V);
9543 }
9544 
9545 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9546 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
9547   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
9548   return __builtin_reduce_min((__v16su)__V);
9549 }
9550 
9551 static __inline__ double __DEFAULT_FN_ATTRS512
9552 _mm512_reduce_max_pd(__m512d __V) {
9553   return __builtin_ia32_reduce_fmax_pd512(__V);
9554 }
9555 
9556 static __inline__ double __DEFAULT_FN_ATTRS512
9557 _mm512_reduce_min_pd(__m512d __V) {
9558   return __builtin_ia32_reduce_fmin_pd512(__V);
9559 }
9560 
9561 static __inline__ double __DEFAULT_FN_ATTRS512
9562 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
9563   __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
9564   return __builtin_ia32_reduce_fmax_pd512(__V);
9565 }
9566 
9567 static __inline__ double __DEFAULT_FN_ATTRS512
9568 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
9569   __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
9570   return __builtin_ia32_reduce_fmin_pd512(__V);
9571 }
9572 
9573 static __inline__ float __DEFAULT_FN_ATTRS512
9574 _mm512_reduce_max_ps(__m512 __V) {
9575   return __builtin_ia32_reduce_fmax_ps512(__V);
9576 }
9577 
9578 static __inline__ float __DEFAULT_FN_ATTRS512
9579 _mm512_reduce_min_ps(__m512 __V) {
9580   return __builtin_ia32_reduce_fmin_ps512(__V);
9581 }
9582 
9583 static __inline__ float __DEFAULT_FN_ATTRS512
9584 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
9585   __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
9586   return __builtin_ia32_reduce_fmax_ps512(__V);
9587 }
9588 
9589 static __inline__ float __DEFAULT_FN_ATTRS512
9590 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
9591   __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
9592   return __builtin_ia32_reduce_fmin_ps512(__V);
9593 }
9594 
9595 /// Moves the least significant 32 bits of a vector of [16 x i32] to a
9596 ///    32-bit signed integer value.
9597 ///
9598 /// \headerfile <x86intrin.h>
9599 ///
9600 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
9601 ///
9602 /// \param __A
9603 ///    A vector of [16 x i32]. The least significant 32 bits are moved to the
9604 ///    destination.
9605 /// \returns A 32-bit signed integer containing the moved value.
9606 static __inline__ int __DEFAULT_FN_ATTRS512
9607 _mm512_cvtsi512_si32(__m512i __A) {
9608   __v16si __b = (__v16si)__A;
9609   return __b[0];
9610 }
9611 
/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex),                          \
                      (base_addr),                                             \
                      (scale))
9629 
/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale and stores them in dst using
/// writemask \a mask (elements are copied from \a src when the corresponding
/// mask bit is not set).
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
  _mm512_mask_i32gather_pd((src),                                              \
                           (mask),                                             \
                           _mm512_castsi512_si256(vindex),                     \
                           (base_addr),                                        \
                           (scale))
9654 
/// Loads 8 64-bit integer elements from memory starting at location
/// \a base_addr at packed 32-bit integer indices stored in the lower half of
/// \a vindex scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex),                       \
                         (base_addr),                                          \
                         (scale))
9672 
/// Loads 8 64-bit integer elements from memory starting at location
/// \a base_addr at packed 32-bit integer indices stored in the lower half of
/// \a vindex scaled by \a scale and stores them in dst using writemask \a mask
/// (elements are copied from \a src when the corresponding mask bit is not
/// set).
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
  _mm512_mask_i32gather_epi64((src),                                           \
                              (mask),                                          \
                              _mm512_castsi512_si256(vindex),                  \
                              (base_addr),                                     \
                              (scale))
9696 
/// Stores 8 packed double-precision (64-bit) floating-point elements from
/// \a v1 to memory locations starting at location \a base_addr at packed
/// 32-bit integer indices stored in the lower half of \a vindex scaled by
/// \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
  _mm512_i32scatter_pd((base_addr),                                            \
                       _mm512_castsi512_si256(vindex),                         \
                       (v1),                                                   \
                       (scale))
9713 
/// Stores 8 packed double-precision (64-bit) floating-point elements from
/// \a v1 to memory locations starting at location \a base_addr at packed
/// 32-bit integer indices stored in the lower half of \a vindex scaled by
/// \a scale. Only those elements whose corresponding mask bit is set in
/// writemask \a mask are written to memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
  _mm512_mask_i32scatter_pd((base_addr),                                       \
                            (mask),                                            \
                            _mm512_castsi512_si256(vindex),                    \
                            (v1),                                              \
                            (scale))
9735 
/// Stores 8 packed 64-bit integer elements from \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in the lower half of \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
  _mm512_i32scatter_epi64((base_addr),                                         \
                          _mm512_castsi512_si256(vindex),                      \
                          (v1),                                                \
                          (scale))
9753 
/// Stores 8 packed 64-bit integer elements from \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in the lower half of \a vindex scaled by \a scale using writemask \a mask
/// (elements whose corresponding mask bit is not set are not written to
/// memory).
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
  _mm512_mask_i32scatter_epi64((base_addr),                                    \
                               (mask),                                         \
                               _mm512_castsi512_si256(vindex),                 \
                               (v1),                                           \
                               (scale))
9774 
/* Remove the function-attribute helper macros used throughout this header so
 * they are no longer defined after this point. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS
9778 
9779 #endif /* __AVX512FINTRIN_H */
9780