xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fintrin.h (revision 0c428864495af9dc7d2af4d0a5ae21732af9c739)
1 /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifndef __AVX512FINTRIN_H
14 #define __AVX512FINTRIN_H
15 
/* Internal 64-byte vector views, named by lane count and element type. */
typedef double    __v8df  __attribute__((__vector_size__(64)));
typedef float     __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di  __attribute__((__vector_size__(64)));
typedef int       __v16si __attribute__((__vector_size__(64)));
typedef short     __v32hi __attribute__((__vector_size__(64)));
typedef char      __v64qi __attribute__((__vector_size__(64)));

/* Unsigned lane views. */
typedef unsigned long long __v8du  __attribute__((__vector_size__(64)));
typedef unsigned int       __v16su __attribute__((__vector_size__(64)));
typedef unsigned short     __v32hu __attribute__((__vector_size__(64)));
typedef unsigned char      __v64qu __attribute__((__vector_size__(64)));

/* An explicitly signed char view is needed as well; it is not meant to
 * appear in the public interface. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

/* Public 512-bit vector types, aligned to 64 bytes. */
typedef float     __m512  __attribute__((__vector_size__(64), __aligned__(64)));
typedef double    __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

/* The same vector types declared with an alignment of 1. */
typedef float     __m512_u  __attribute__((__vector_size__(64), __aligned__(1)));
typedef double    __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Mask types: an unsigned char and an unsigned short respectively. */
typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;
43 
/* Rounding mode macros.  The replacement text is kept as plain hex literals
 * so that an identical definition in another header remains a compatible
 * redefinition. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
50 
/* Integer comparison predicates.  Values are spelled out explicitly; they
 * match the implicit numbering 0..6 of the enumerators. */
typedef enum {
  _MM_CMPINT_EQ = 0,     /* Equal */
  _MM_CMPINT_LT = 1,     /* Less than */
  _MM_CMPINT_LE = 2,     /* Less than or Equal */
  _MM_CMPINT_UNUSED = 3,
  _MM_CMPINT_NE = 4,     /* Not Equal */
  _MM_CMPINT_NLT = 5,    /* Not Less than */
  _MM_CMPINT_NLE = 6     /* Not Less than or Equal */
} _MM_CMPINT_ENUM;

/* Alternative spellings for the two "not less" predicates. */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
63 
/* Four-letter permute selectors.  Each letter is a 2-bit field with A = 0,
 * B = 1, C = 2 and D = 3; the leftmost letter occupies bits 7:6 and the
 * rightmost letter bits 1:0.  Entries are grouped by their first two
 * letters, i.e. by the high nibble of the value. */
typedef enum
{
  /* AA__ : 0x00 - 0x0F */
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01,
  _MM_PERM_AAAC = 0x02, _MM_PERM_AAAD = 0x03,
  _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07,
  _MM_PERM_AACA = 0x08, _MM_PERM_AACB = 0x09,
  _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D,
  _MM_PERM_AADC = 0x0E, _MM_PERM_AADD = 0x0F,
  /* AB__ : 0x10 - 0x1F */
  _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13,
  _MM_PERM_ABBA = 0x14, _MM_PERM_ABBB = 0x15,
  _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19,
  _MM_PERM_ABCC = 0x1A, _MM_PERM_ABCD = 0x1B,
  _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F,
  /* AC__ : 0x20 - 0x2F */
  _MM_PERM_ACAA = 0x20, _MM_PERM_ACAB = 0x21,
  _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25,
  _MM_PERM_ACBC = 0x26, _MM_PERM_ACBD = 0x27,
  _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B,
  _MM_PERM_ACDA = 0x2C, _MM_PERM_ACDB = 0x2D,
  _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  /* AD__ : 0x30 - 0x3F */
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31,
  _MM_PERM_ADAC = 0x32, _MM_PERM_ADAD = 0x33,
  _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37,
  _MM_PERM_ADCA = 0x38, _MM_PERM_ADCB = 0x39,
  _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D,
  _MM_PERM_ADDC = 0x3E, _MM_PERM_ADDD = 0x3F,
  /* BA__ : 0x40 - 0x4F */
  _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43,
  _MM_PERM_BABA = 0x44, _MM_PERM_BABB = 0x45,
  _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49,
  _MM_PERM_BACC = 0x4A, _MM_PERM_BACD = 0x4B,
  _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F,
  /* BB__ : 0x50 - 0x5F */
  _MM_PERM_BBAA = 0x50, _MM_PERM_BBAB = 0x51,
  _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55,
  _MM_PERM_BBBC = 0x56, _MM_PERM_BBBD = 0x57,
  _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B,
  _MM_PERM_BBDA = 0x5C, _MM_PERM_BBDB = 0x5D,
  _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  /* BC__ : 0x60 - 0x6F */
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61,
  _MM_PERM_BCAC = 0x62, _MM_PERM_BCAD = 0x63,
  _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67,
  _MM_PERM_BCCA = 0x68, _MM_PERM_BCCB = 0x69,
  _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D,
  _MM_PERM_BCDC = 0x6E, _MM_PERM_BCDD = 0x6F,
  /* BD__ : 0x70 - 0x7F */
  _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73,
  _MM_PERM_BDBA = 0x74, _MM_PERM_BDBB = 0x75,
  _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79,
  _MM_PERM_BDCC = 0x7A, _MM_PERM_BDCD = 0x7B,
  _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F,
  /* CA__ : 0x80 - 0x8F */
  _MM_PERM_CAAA = 0x80, _MM_PERM_CAAB = 0x81,
  _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85,
  _MM_PERM_CABC = 0x86, _MM_PERM_CABD = 0x87,
  _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B,
  _MM_PERM_CADA = 0x8C, _MM_PERM_CADB = 0x8D,
  _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  /* CB__ : 0x90 - 0x9F */
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91,
  _MM_PERM_CBAC = 0x92, _MM_PERM_CBAD = 0x93,
  _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97,
  _MM_PERM_CBCA = 0x98, _MM_PERM_CBCB = 0x99,
  _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D,
  _MM_PERM_CBDC = 0x9E, _MM_PERM_CBDD = 0x9F,
  /* CC__ : 0xA0 - 0xAF */
  _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3,
  _MM_PERM_CCBA = 0xA4, _MM_PERM_CCBB = 0xA5,
  _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9,
  _MM_PERM_CCCC = 0xAA, _MM_PERM_CCCD = 0xAB,
  _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF,
  /* CD__ : 0xB0 - 0xBF */
  _MM_PERM_CDAA = 0xB0, _MM_PERM_CDAB = 0xB1,
  _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5,
  _MM_PERM_CDBC = 0xB6, _MM_PERM_CDBD = 0xB7,
  _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB,
  _MM_PERM_CDDA = 0xBC, _MM_PERM_CDDB = 0xBD,
  _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  /* DA__ : 0xC0 - 0xCF */
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1,
  _MM_PERM_DAAC = 0xC2, _MM_PERM_DAAD = 0xC3,
  _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7,
  _MM_PERM_DACA = 0xC8, _MM_PERM_DACB = 0xC9,
  _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD,
  _MM_PERM_DADC = 0xCE, _MM_PERM_DADD = 0xCF,
  /* DB__ : 0xD0 - 0xDF */
  _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3,
  _MM_PERM_DBBA = 0xD4, _MM_PERM_DBBB = 0xD5,
  _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9,
  _MM_PERM_DBCC = 0xDA, _MM_PERM_DBCD = 0xDB,
  _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF,
  /* DC__ : 0xE0 - 0xEF */
  _MM_PERM_DCAA = 0xE0, _MM_PERM_DCAB = 0xE1,
  _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5,
  _MM_PERM_DCBC = 0xE6, _MM_PERM_DCBD = 0xE7,
  _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB,
  _MM_PERM_DCDA = 0xEC, _MM_PERM_DCDB = 0xED,
  _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  /* DD__ : 0xF0 - 0xFF */
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1,
  _MM_PERM_DDAC = 0xF2, _MM_PERM_DDAD = 0xF3,
  _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7,
  _MM_PERM_DDCA = 0xF8, _MM_PERM_DDCB = 0xF9,
  _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD,
  _MM_PERM_DDDC = 0xFE, _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;
153 
/* Mantissa normalization intervals; values are the implicit 0..3. */
typedef enum
{
  _MM_MANT_NORM_1_2 = 0,     /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2 = 1,    /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1 = 2,    /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5 = 3  /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;
161 
/* Mantissa sign handling; values are the implicit 0..2. */
typedef enum
{
  _MM_MANT_SIGN_src = 0,   /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero = 1,  /* sign = 0             */
  _MM_MANT_SIGN_nan = 2    /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
168 
/* Default attributes for the functions in this file.  All three variants
 * request always-inline, no debug info and the "avx512f" target; the 512
 * and 128 variants additionally state a minimum vector width. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512f"),        \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512f"),        \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
173 
174 /* Create vectors with repeated elements */
175 
176 static  __inline __m512i __DEFAULT_FN_ATTRS512
177 _mm512_setzero_si512(void)
178 {
179   return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
180 }
181 
182 #define _mm512_setzero_epi32 _mm512_setzero_si512
183 
184 static __inline__ __m512d __DEFAULT_FN_ATTRS512
185 _mm512_undefined_pd(void)
186 {
187   return (__m512d)__builtin_ia32_undef512();
188 }
189 
190 static __inline__ __m512 __DEFAULT_FN_ATTRS512
191 _mm512_undefined(void)
192 {
193   return (__m512)__builtin_ia32_undef512();
194 }
195 
196 static __inline__ __m512 __DEFAULT_FN_ATTRS512
197 _mm512_undefined_ps(void)
198 {
199   return (__m512)__builtin_ia32_undef512();
200 }
201 
202 static __inline__ __m512i __DEFAULT_FN_ATTRS512
203 _mm512_undefined_epi32(void)
204 {
205   return (__m512i)__builtin_ia32_undef512();
206 }
207 
208 static __inline__ __m512i __DEFAULT_FN_ATTRS512
209 _mm512_broadcastd_epi32 (__m128i __A)
210 {
211   return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
212                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
213 }
214 
215 static __inline__ __m512i __DEFAULT_FN_ATTRS512
216 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
217 {
218   return (__m512i)__builtin_ia32_selectd_512(__M,
219                                              (__v16si) _mm512_broadcastd_epi32(__A),
220                                              (__v16si) __O);
221 }
222 
223 static __inline__ __m512i __DEFAULT_FN_ATTRS512
224 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
225 {
226   return (__m512i)__builtin_ia32_selectd_512(__M,
227                                              (__v16si) _mm512_broadcastd_epi32(__A),
228                                              (__v16si) _mm512_setzero_si512());
229 }
230 
231 static __inline__ __m512i __DEFAULT_FN_ATTRS512
232 _mm512_broadcastq_epi64 (__m128i __A)
233 {
234   return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
235                                           0, 0, 0, 0, 0, 0, 0, 0);
236 }
237 
238 static __inline__ __m512i __DEFAULT_FN_ATTRS512
239 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
240 {
241   return (__m512i)__builtin_ia32_selectq_512(__M,
242                                              (__v8di) _mm512_broadcastq_epi64(__A),
243                                              (__v8di) __O);
244 
245 }
246 
247 static __inline__ __m512i __DEFAULT_FN_ATTRS512
248 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
249 {
250   return (__m512i)__builtin_ia32_selectq_512(__M,
251                                              (__v8di) _mm512_broadcastq_epi64(__A),
252                                              (__v8di) _mm512_setzero_si512());
253 }
254 
255 
256 static __inline __m512 __DEFAULT_FN_ATTRS512
257 _mm512_setzero_ps(void)
258 {
259   return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
260                                  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
261 }
262 
263 #define _mm512_setzero _mm512_setzero_ps
264 
265 static  __inline __m512d __DEFAULT_FN_ATTRS512
266 _mm512_setzero_pd(void)
267 {
268   return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
269 }
270 
271 static __inline __m512 __DEFAULT_FN_ATTRS512
272 _mm512_set1_ps(float __w)
273 {
274   return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
275                                  __w, __w, __w, __w, __w, __w, __w, __w  };
276 }
277 
278 static __inline __m512d __DEFAULT_FN_ATTRS512
279 _mm512_set1_pd(double __w)
280 {
281   return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
282 }
283 
284 static __inline __m512i __DEFAULT_FN_ATTRS512
285 _mm512_set1_epi8(char __w)
286 {
287   return __extension__ (__m512i)(__v64qi){
288     __w, __w, __w, __w, __w, __w, __w, __w,
289     __w, __w, __w, __w, __w, __w, __w, __w,
290     __w, __w, __w, __w, __w, __w, __w, __w,
291     __w, __w, __w, __w, __w, __w, __w, __w,
292     __w, __w, __w, __w, __w, __w, __w, __w,
293     __w, __w, __w, __w, __w, __w, __w, __w,
294     __w, __w, __w, __w, __w, __w, __w, __w,
295     __w, __w, __w, __w, __w, __w, __w, __w  };
296 }
297 
298 static __inline __m512i __DEFAULT_FN_ATTRS512
299 _mm512_set1_epi16(short __w)
300 {
301   return __extension__ (__m512i)(__v32hi){
302     __w, __w, __w, __w, __w, __w, __w, __w,
303     __w, __w, __w, __w, __w, __w, __w, __w,
304     __w, __w, __w, __w, __w, __w, __w, __w,
305     __w, __w, __w, __w, __w, __w, __w, __w };
306 }
307 
308 static __inline __m512i __DEFAULT_FN_ATTRS512
309 _mm512_set1_epi32(int __s)
310 {
311   return __extension__ (__m512i)(__v16si){
312     __s, __s, __s, __s, __s, __s, __s, __s,
313     __s, __s, __s, __s, __s, __s, __s, __s };
314 }
315 
316 static __inline __m512i __DEFAULT_FN_ATTRS512
317 _mm512_maskz_set1_epi32(__mmask16 __M, int __A)
318 {
319   return (__m512i)__builtin_ia32_selectd_512(__M,
320                                              (__v16si)_mm512_set1_epi32(__A),
321                                              (__v16si)_mm512_setzero_si512());
322 }
323 
324 static __inline __m512i __DEFAULT_FN_ATTRS512
325 _mm512_set1_epi64(long long __d)
326 {
327   return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
328 }
329 
330 static __inline __m512i __DEFAULT_FN_ATTRS512
331 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
332 {
333   return (__m512i)__builtin_ia32_selectq_512(__M,
334                                              (__v8di)_mm512_set1_epi64(__A),
335                                              (__v8di)_mm512_setzero_si512());
336 }
337 
338 static __inline__ __m512 __DEFAULT_FN_ATTRS512
339 _mm512_broadcastss_ps(__m128 __A)
340 {
341   return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
342                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
343 }
344 
345 static __inline __m512i __DEFAULT_FN_ATTRS512
346 _mm512_set4_epi32 (int __A, int __B, int __C, int __D)
347 {
348   return __extension__ (__m512i)(__v16si)
349    { __D, __C, __B, __A, __D, __C, __B, __A,
350      __D, __C, __B, __A, __D, __C, __B, __A };
351 }
352 
353 static __inline __m512i __DEFAULT_FN_ATTRS512
354 _mm512_set4_epi64 (long long __A, long long __B, long long __C,
355        long long __D)
356 {
357   return __extension__ (__m512i) (__v8di)
358    { __D, __C, __B, __A, __D, __C, __B, __A };
359 }
360 
361 static __inline __m512d __DEFAULT_FN_ATTRS512
362 _mm512_set4_pd (double __A, double __B, double __C, double __D)
363 {
364   return __extension__ (__m512d)
365    { __D, __C, __B, __A, __D, __C, __B, __A };
366 }
367 
368 static __inline __m512 __DEFAULT_FN_ATTRS512
369 _mm512_set4_ps (float __A, float __B, float __C, float __D)
370 {
371   return __extension__ (__m512)
372    { __D, __C, __B, __A, __D, __C, __B, __A,
373      __D, __C, __B, __A, __D, __C, __B, __A };
374 }
375 
/* "Reversed" set4 variants: each forwards to the matching _mm512_set4_*
 * with its four arguments in the opposite order. */

/* 32-bit integer lanes. */
#define _mm512_setr4_epi32(e0,e1,e2,e3) \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

/* 64-bit integer lanes. */
#define _mm512_setr4_epi64(e0,e1,e2,e3) \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

/* Double-precision lanes. */
#define _mm512_setr4_pd(e0,e1,e2,e3) \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

/* Single-precision lanes. */
#define _mm512_setr4_ps(e0,e1,e2,e3) \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
387 
388 static __inline__ __m512d __DEFAULT_FN_ATTRS512
389 _mm512_broadcastsd_pd(__m128d __A)
390 {
391   return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
392                                           0, 0, 0, 0, 0, 0, 0, 0);
393 }
394 
395 /* Cast between vector types */
396 
397 static __inline __m512d __DEFAULT_FN_ATTRS512
398 _mm512_castpd256_pd512(__m256d __a)
399 {
400   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
401 }
402 
403 static __inline __m512 __DEFAULT_FN_ATTRS512
404 _mm512_castps256_ps512(__m256 __a)
405 {
406   return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
407                                           -1, -1, -1, -1, -1, -1, -1, -1);
408 }
409 
410 static __inline __m128d __DEFAULT_FN_ATTRS512
411 _mm512_castpd512_pd128(__m512d __a)
412 {
413   return __builtin_shufflevector(__a, __a, 0, 1);
414 }
415 
416 static __inline __m256d __DEFAULT_FN_ATTRS512
417 _mm512_castpd512_pd256 (__m512d __A)
418 {
419   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
420 }
421 
422 static __inline __m128 __DEFAULT_FN_ATTRS512
423 _mm512_castps512_ps128(__m512 __a)
424 {
425   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
426 }
427 
428 static __inline __m256 __DEFAULT_FN_ATTRS512
429 _mm512_castps512_ps256 (__m512 __A)
430 {
431   return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
432 }
433 
434 static __inline __m512 __DEFAULT_FN_ATTRS512
435 _mm512_castpd_ps (__m512d __A)
436 {
437   return (__m512) (__A);
438 }
439 
440 static __inline __m512i __DEFAULT_FN_ATTRS512
441 _mm512_castpd_si512 (__m512d __A)
442 {
443   return (__m512i) (__A);
444 }
445 
446 static __inline__ __m512d __DEFAULT_FN_ATTRS512
447 _mm512_castpd128_pd512 (__m128d __A)
448 {
449   return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
450 }
451 
452 static __inline __m512d __DEFAULT_FN_ATTRS512
453 _mm512_castps_pd (__m512 __A)
454 {
455   return (__m512d) (__A);
456 }
457 
458 static __inline __m512i __DEFAULT_FN_ATTRS512
459 _mm512_castps_si512 (__m512 __A)
460 {
461   return (__m512i) (__A);
462 }
463 
464 static __inline__ __m512 __DEFAULT_FN_ATTRS512
465 _mm512_castps128_ps512 (__m128 __A)
466 {
467     return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
468 }
469 
470 static __inline__ __m512i __DEFAULT_FN_ATTRS512
471 _mm512_castsi128_si512 (__m128i __A)
472 {
473    return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
474 }
475 
476 static __inline__ __m512i __DEFAULT_FN_ATTRS512
477 _mm512_castsi256_si512 (__m256i __A)
478 {
479    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
480 }
481 
482 static __inline __m512 __DEFAULT_FN_ATTRS512
483 _mm512_castsi512_ps (__m512i __A)
484 {
485   return (__m512) (__A);
486 }
487 
488 static __inline __m512d __DEFAULT_FN_ATTRS512
489 _mm512_castsi512_pd (__m512i __A)
490 {
491   return (__m512d) (__A);
492 }
493 
494 static __inline __m128i __DEFAULT_FN_ATTRS512
495 _mm512_castsi512_si128 (__m512i __A)
496 {
497   return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
498 }
499 
500 static __inline __m256i __DEFAULT_FN_ATTRS512
501 _mm512_castsi512_si256 (__m512i __A)
502 {
503   return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
504 }
505 
506 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
507 _mm512_int2mask(int __a)
508 {
509   return (__mmask16)__a;
510 }
511 
512 static __inline__ int __DEFAULT_FN_ATTRS
513 _mm512_mask2int(__mmask16 __a)
514 {
515   return (int)__a;
516 }
517 
518 /// Constructs a 512-bit floating-point vector of [8 x double] from a
519 ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
520 ///    contain the value of the source vector. The upper 384 bits are set
521 ///    to zero.
522 ///
523 /// \headerfile <x86intrin.h>
524 ///
525 /// This intrinsic has no corresponding instruction.
526 ///
527 /// \param __a
528 ///    A 128-bit vector of [2 x double].
529 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
530 ///    contain the value of the parameter. The upper 384 bits are set to zero.
531 static __inline __m512d __DEFAULT_FN_ATTRS512
532 _mm512_zextpd128_pd512(__m128d __a)
533 {
534   return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
535 }
536 
537 /// Constructs a 512-bit floating-point vector of [8 x double] from a
538 ///    256-bit floating-point vector of [4 x double]. The lower 256 bits
539 ///    contain the value of the source vector. The upper 256 bits are set
540 ///    to zero.
541 ///
542 /// \headerfile <x86intrin.h>
543 ///
544 /// This intrinsic has no corresponding instruction.
545 ///
546 /// \param __a
547 ///    A 256-bit vector of [4 x double].
548 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
549 ///    contain the value of the parameter. The upper 256 bits are set to zero.
550 static __inline __m512d __DEFAULT_FN_ATTRS512
551 _mm512_zextpd256_pd512(__m256d __a)
552 {
553   return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
554 }
555 
556 /// Constructs a 512-bit floating-point vector of [16 x float] from a
557 ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
558 ///    the value of the source vector. The upper 384 bits are set to zero.
559 ///
560 /// \headerfile <x86intrin.h>
561 ///
562 /// This intrinsic has no corresponding instruction.
563 ///
564 /// \param __a
565 ///    A 128-bit vector of [4 x float].
566 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
567 ///    contain the value of the parameter. The upper 384 bits are set to zero.
568 static __inline __m512 __DEFAULT_FN_ATTRS512
569 _mm512_zextps128_ps512(__m128 __a)
570 {
571   return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
572 }
573 
574 /// Constructs a 512-bit floating-point vector of [16 x float] from a
575 ///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
576 ///    the value of the source vector. The upper 256 bits are set to zero.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic has no corresponding instruction.
581 ///
582 /// \param __a
583 ///    A 256-bit vector of [8 x float].
584 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
585 ///    contain the value of the parameter. The upper 256 bits are set to zero.
586 static __inline __m512 __DEFAULT_FN_ATTRS512
587 _mm512_zextps256_ps512(__m256 __a)
588 {
589   return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
590 }
591 
592 /// Constructs a 512-bit integer vector from a 128-bit integer vector.
593 ///    The lower 128 bits contain the value of the source vector. The upper
594 ///    384 bits are set to zero.
595 ///
596 /// \headerfile <x86intrin.h>
597 ///
598 /// This intrinsic has no corresponding instruction.
599 ///
600 /// \param __a
601 ///    A 128-bit integer vector.
602 /// \returns A 512-bit integer vector. The lower 128 bits contain the value of
603 ///    the parameter. The upper 384 bits are set to zero.
604 static __inline __m512i __DEFAULT_FN_ATTRS512
605 _mm512_zextsi128_si512(__m128i __a)
606 {
607   return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
608 }
609 
610 /// Constructs a 512-bit integer vector from a 256-bit integer vector.
611 ///    The lower 256 bits contain the value of the source vector. The upper
612 ///    256 bits are set to zero.
613 ///
614 /// \headerfile <x86intrin.h>
615 ///
616 /// This intrinsic has no corresponding instruction.
617 ///
618 /// \param __a
619 ///    A 256-bit integer vector.
620 /// \returns A 512-bit integer vector. The lower 256 bits contain the value of
621 ///    the parameter. The upper 256 bits are set to zero.
622 static __inline __m512i __DEFAULT_FN_ATTRS512
623 _mm512_zextsi256_si512(__m256i __a)
624 {
625   return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
626 }
627 
628 /* Bitwise operators */
629 static __inline__ __m512i __DEFAULT_FN_ATTRS512
630 _mm512_and_epi32(__m512i __a, __m512i __b)
631 {
632   return (__m512i)((__v16su)__a & (__v16su)__b);
633 }
634 
635 static __inline__ __m512i __DEFAULT_FN_ATTRS512
636 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
637 {
638   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
639                 (__v16si) _mm512_and_epi32(__a, __b),
640                 (__v16si) __src);
641 }
642 
643 static __inline__ __m512i __DEFAULT_FN_ATTRS512
644 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
645 {
646   return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
647                                          __k, __a, __b);
648 }
649 
650 static __inline__ __m512i __DEFAULT_FN_ATTRS512
651 _mm512_and_epi64(__m512i __a, __m512i __b)
652 {
653   return (__m512i)((__v8du)__a & (__v8du)__b);
654 }
655 
656 static __inline__ __m512i __DEFAULT_FN_ATTRS512
657 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
658 {
659     return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
660                 (__v8di) _mm512_and_epi64(__a, __b),
661                 (__v8di) __src);
662 }
663 
664 static __inline__ __m512i __DEFAULT_FN_ATTRS512
665 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
666 {
667   return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
668                                          __k, __a, __b);
669 }
670 
671 static __inline__ __m512i __DEFAULT_FN_ATTRS512
672 _mm512_andnot_si512 (__m512i __A, __m512i __B)
673 {
674   return (__m512i)(~(__v8du)__A & (__v8du)__B);
675 }
676 
677 static __inline__ __m512i __DEFAULT_FN_ATTRS512
678 _mm512_andnot_epi32 (__m512i __A, __m512i __B)
679 {
680   return (__m512i)(~(__v16su)__A & (__v16su)__B);
681 }
682 
683 static __inline__ __m512i __DEFAULT_FN_ATTRS512
684 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
685 {
686   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
687                                          (__v16si)_mm512_andnot_epi32(__A, __B),
688                                          (__v16si)__W);
689 }
690 
691 static __inline__ __m512i __DEFAULT_FN_ATTRS512
692 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
693 {
694   return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
695                                            __U, __A, __B);
696 }
697 
698 static __inline__ __m512i __DEFAULT_FN_ATTRS512
699 _mm512_andnot_epi64(__m512i __A, __m512i __B)
700 {
701   return (__m512i)(~(__v8du)__A & (__v8du)__B);
702 }
703 
704 static __inline__ __m512i __DEFAULT_FN_ATTRS512
705 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
706 {
707   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
708                                           (__v8di)_mm512_andnot_epi64(__A, __B),
709                                           (__v8di)__W);
710 }
711 
712 static __inline__ __m512i __DEFAULT_FN_ATTRS512
713 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
714 {
715   return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
716                                            __U, __A, __B);
717 }
718 
719 static __inline__ __m512i __DEFAULT_FN_ATTRS512
720 _mm512_or_epi32(__m512i __a, __m512i __b)
721 {
722   return (__m512i)((__v16su)__a | (__v16su)__b);
723 }
724 
725 static __inline__ __m512i __DEFAULT_FN_ATTRS512
726 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
727 {
728   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
729                                              (__v16si)_mm512_or_epi32(__a, __b),
730                                              (__v16si)__src);
731 }
732 
733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
734 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
735 {
736   return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
737 }
738 
739 static __inline__ __m512i __DEFAULT_FN_ATTRS512
740 _mm512_or_epi64(__m512i __a, __m512i __b)
741 {
742   return (__m512i)((__v8du)__a | (__v8du)__b);
743 }
744 
745 static __inline__ __m512i __DEFAULT_FN_ATTRS512
746 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
747 {
748   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
749                                              (__v8di)_mm512_or_epi64(__a, __b),
750                                              (__v8di)__src);
751 }
752 
753 static __inline__ __m512i __DEFAULT_FN_ATTRS512
754 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
755 {
756   return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
757 }
758 
759 static __inline__ __m512i __DEFAULT_FN_ATTRS512
760 _mm512_xor_epi32(__m512i __a, __m512i __b)
761 {
762   return (__m512i)((__v16su)__a ^ (__v16su)__b);
763 }
764 
765 static __inline__ __m512i __DEFAULT_FN_ATTRS512
766 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
767 {
768   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
769                                             (__v16si)_mm512_xor_epi32(__a, __b),
770                                             (__v16si)__src);
771 }
772 
773 static __inline__ __m512i __DEFAULT_FN_ATTRS512
774 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
775 {
776   return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
777 }
778 
779 static __inline__ __m512i __DEFAULT_FN_ATTRS512
780 _mm512_xor_epi64(__m512i __a, __m512i __b)
781 {
782   return (__m512i)((__v8du)__a ^ (__v8du)__b);
783 }
784 
785 static __inline__ __m512i __DEFAULT_FN_ATTRS512
786 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
787 {
788   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
789                                              (__v8di)_mm512_xor_epi64(__a, __b),
790                                              (__v8di)__src);
791 }
792 
793 static __inline__ __m512i __DEFAULT_FN_ATTRS512
794 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
795 {
796   return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
797 }
798 
799 static __inline__ __m512i __DEFAULT_FN_ATTRS512
800 _mm512_and_si512(__m512i __a, __m512i __b)
801 {
802   return (__m512i)((__v8du)__a & (__v8du)__b);
803 }
804 
805 static __inline__ __m512i __DEFAULT_FN_ATTRS512
806 _mm512_or_si512(__m512i __a, __m512i __b)
807 {
808   return (__m512i)((__v8du)__a | (__v8du)__b);
809 }
810 
811 static __inline__ __m512i __DEFAULT_FN_ATTRS512
812 _mm512_xor_si512(__m512i __a, __m512i __b)
813 {
814   return (__m512i)((__v8du)__a ^ (__v8du)__b);
815 }
816 
817 /* Arithmetic */
818 
819 static __inline __m512d __DEFAULT_FN_ATTRS512
820 _mm512_add_pd(__m512d __a, __m512d __b)
821 {
822   return (__m512d)((__v8df)__a + (__v8df)__b);
823 }
824 
825 static __inline __m512 __DEFAULT_FN_ATTRS512
826 _mm512_add_ps(__m512 __a, __m512 __b)
827 {
828   return (__m512)((__v16sf)__a + (__v16sf)__b);
829 }
830 
831 static __inline __m512d __DEFAULT_FN_ATTRS512
832 _mm512_mul_pd(__m512d __a, __m512d __b)
833 {
834   return (__m512d)((__v8df)__a * (__v8df)__b);
835 }
836 
837 static __inline __m512 __DEFAULT_FN_ATTRS512
838 _mm512_mul_ps(__m512 __a, __m512 __b)
839 {
840   return (__m512)((__v16sf)__a * (__v16sf)__b);
841 }
842 
843 static __inline __m512d __DEFAULT_FN_ATTRS512
844 _mm512_sub_pd(__m512d __a, __m512d __b)
845 {
846   return (__m512d)((__v8df)__a - (__v8df)__b);
847 }
848 
849 static __inline __m512 __DEFAULT_FN_ATTRS512
850 _mm512_sub_ps(__m512 __a, __m512 __b)
851 {
852   return (__m512)((__v16sf)__a - (__v16sf)__b);
853 }
854 
855 static __inline__ __m512i __DEFAULT_FN_ATTRS512
856 _mm512_add_epi64 (__m512i __A, __m512i __B)
857 {
858   return (__m512i) ((__v8du) __A + (__v8du) __B);
859 }
860 
861 static __inline__ __m512i __DEFAULT_FN_ATTRS512
862 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
863 {
864   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
865                                              (__v8di)_mm512_add_epi64(__A, __B),
866                                              (__v8di)__W);
867 }
868 
869 static __inline__ __m512i __DEFAULT_FN_ATTRS512
870 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
871 {
872   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
873                                              (__v8di)_mm512_add_epi64(__A, __B),
874                                              (__v8di)_mm512_setzero_si512());
875 }
876 
877 static __inline__ __m512i __DEFAULT_FN_ATTRS512
878 _mm512_sub_epi64 (__m512i __A, __m512i __B)
879 {
880   return (__m512i) ((__v8du) __A - (__v8du) __B);
881 }
882 
883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
884 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
885 {
886   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
887                                              (__v8di)_mm512_sub_epi64(__A, __B),
888                                              (__v8di)__W);
889 }
890 
891 static __inline__ __m512i __DEFAULT_FN_ATTRS512
892 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
893 {
894   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
895                                              (__v8di)_mm512_sub_epi64(__A, __B),
896                                              (__v8di)_mm512_setzero_si512());
897 }
898 
899 static __inline__ __m512i __DEFAULT_FN_ATTRS512
900 _mm512_add_epi32 (__m512i __A, __m512i __B)
901 {
902   return (__m512i) ((__v16su) __A + (__v16su) __B);
903 }
904 
905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
906 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
907 {
908   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
909                                              (__v16si)_mm512_add_epi32(__A, __B),
910                                              (__v16si)__W);
911 }
912 
913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
914 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
915 {
916   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
917                                              (__v16si)_mm512_add_epi32(__A, __B),
918                                              (__v16si)_mm512_setzero_si512());
919 }
920 
921 static __inline__ __m512i __DEFAULT_FN_ATTRS512
922 _mm512_sub_epi32 (__m512i __A, __m512i __B)
923 {
924   return (__m512i) ((__v16su) __A - (__v16su) __B);
925 }
926 
927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
928 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
929 {
930   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
931                                              (__v16si)_mm512_sub_epi32(__A, __B),
932                                              (__v16si)__W);
933 }
934 
935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
936 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
937 {
938   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
939                                              (__v16si)_mm512_sub_epi32(__A, __B),
940                                              (__v16si)_mm512_setzero_si512());
941 }
942 
/* Forwards A and B (as __v8df) and R (as int) to the maxpd512 builtin and
 * yields the result as __m512d. */
#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
946 
/* Masked form: passes U, the result of _mm512_max_round_pd(A, B, R), and W
 * (in that order) to the selectpd_512 builtin. */
#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_max_round_pd((A), (B), (R)), \
      (__v8df)(W)))
951 
/* Zero-masked form: passes U, the result of _mm512_max_round_pd(A, B, R),
 * and an all-zero vector to the selectpd_512 builtin. */
#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_max_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
956 
957 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
958 _mm512_max_pd(__m512d __A, __m512d __B)
959 {
960   return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
961                                            _MM_FROUND_CUR_DIRECTION);
962 }
963 
964 static __inline__ __m512d __DEFAULT_FN_ATTRS512
965 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
966 {
967   return (__m512d)__builtin_ia32_selectpd_512(__U,
968                                               (__v8df)_mm512_max_pd(__A, __B),
969                                               (__v8df)__W);
970 }
971 
972 static __inline__ __m512d __DEFAULT_FN_ATTRS512
973 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
974 {
975   return (__m512d)__builtin_ia32_selectpd_512(__U,
976                                               (__v8df)_mm512_max_pd(__A, __B),
977                                               (__v8df)_mm512_setzero_pd());
978 }
979 
/* Forwards A and B (as __v16sf) and R (as int) to the maxps512 builtin and
 * yields the result as __m512. */
#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
983 
/* Masked form: passes U, the result of _mm512_max_round_ps(A, B, R), and W
 * (in that order) to the selectps_512 builtin. */
#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
      (__v16sf)(W)))
988 
/* Zero-masked form: passes U, the result of _mm512_max_round_ps(A, B, R),
 * and an all-zero vector to the selectps_512 builtin. */
#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
993 
994 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
995 _mm512_max_ps(__m512 __A, __m512 __B)
996 {
997   return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
998                                           _MM_FROUND_CUR_DIRECTION);
999 }
1000 
1001 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1002 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1003 {
1004   return (__m512)__builtin_ia32_selectps_512(__U,
1005                                              (__v16sf)_mm512_max_ps(__A, __B),
1006                                              (__v16sf)__W);
1007 }
1008 
1009 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1010 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
1011 {
1012   return (__m512)__builtin_ia32_selectps_512(__U,
1013                                              (__v16sf)_mm512_max_ps(__A, __B),
1014                                              (__v16sf)_mm512_setzero_ps());
1015 }
1016 
1017 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1018 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1019   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1020                 (__v4sf) __B,
1021                 (__v4sf) __W,
1022                 (__mmask8) __U,
1023                 _MM_FROUND_CUR_DIRECTION);
1024 }
1025 
1026 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1027 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1028   return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1029                 (__v4sf) __B,
1030                 (__v4sf)  _mm_setzero_ps (),
1031                 (__mmask8) __U,
1032                 _MM_FROUND_CUR_DIRECTION);
1033 }
1034 
/* Unmasked form: calls the maxss_round_mask builtin on A and B with an
 * all-zero third operand, an all-ones mask ((__mmask8)-1), and R as int. */
#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
1040 
/* Masked form: calls the maxss_round_mask builtin on A and B with W as the
 * third operand, mask U, and R as int. */
#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
1046 
/* Zero-masked form: calls the maxss_round_mask builtin on A and B with an
 * all-zero third operand, mask U, and R as int. */
#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
1052 
1053 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1054 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1055   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1056                 (__v2df) __B,
1057                 (__v2df) __W,
1058                 (__mmask8) __U,
1059                 _MM_FROUND_CUR_DIRECTION);
1060 }
1061 
1062 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1063 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1064   return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1065                 (__v2df) __B,
1066                 (__v2df)  _mm_setzero_pd (),
1067                 (__mmask8) __U,
1068                 _MM_FROUND_CUR_DIRECTION);
1069 }
1070 
/* Unmasked form: calls the maxsd_round_mask builtin on A and B with an
 * all-zero third operand, an all-ones mask ((__mmask8)-1), and R as int. */
#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))
1076 
/* Masked form: calls the maxsd_round_mask builtin on A and B with W as the
 * third operand, mask U, and R as int. */
#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
1082 
/* Zero-masked form: calls the maxsd_round_mask builtin on A and B with an
 * all-zero third operand, mask U, and R as int. */
#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
1088 
1089 static __inline __m512i
1090 __DEFAULT_FN_ATTRS512
1091 _mm512_max_epi32(__m512i __A, __m512i __B)
1092 {
1093   return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
1094 }
1095 
1096 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1097 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1098 {
1099   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1100                                             (__v16si)_mm512_max_epi32(__A, __B),
1101                                             (__v16si)__W);
1102 }
1103 
1104 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1105 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
1106 {
1107   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1108                                             (__v16si)_mm512_max_epi32(__A, __B),
1109                                             (__v16si)_mm512_setzero_si512());
1110 }
1111 
1112 static __inline __m512i __DEFAULT_FN_ATTRS512
1113 _mm512_max_epu32(__m512i __A, __m512i __B)
1114 {
1115   return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
1116 }
1117 
1118 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1119 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1120 {
1121   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1122                                             (__v16si)_mm512_max_epu32(__A, __B),
1123                                             (__v16si)__W);
1124 }
1125 
1126 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1127 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
1128 {
1129   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1130                                             (__v16si)_mm512_max_epu32(__A, __B),
1131                                             (__v16si)_mm512_setzero_si512());
1132 }
1133 
1134 static __inline __m512i __DEFAULT_FN_ATTRS512
1135 _mm512_max_epi64(__m512i __A, __m512i __B)
1136 {
1137   return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
1138 }
1139 
1140 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1141 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1142 {
1143   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1144                                              (__v8di)_mm512_max_epi64(__A, __B),
1145                                              (__v8di)__W);
1146 }
1147 
1148 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1149 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
1150 {
1151   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1152                                              (__v8di)_mm512_max_epi64(__A, __B),
1153                                              (__v8di)_mm512_setzero_si512());
1154 }
1155 
1156 static __inline __m512i __DEFAULT_FN_ATTRS512
1157 _mm512_max_epu64(__m512i __A, __m512i __B)
1158 {
1159   return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
1160 }
1161 
1162 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1163 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1164 {
1165   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1166                                              (__v8di)_mm512_max_epu64(__A, __B),
1167                                              (__v8di)__W);
1168 }
1169 
1170 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1171 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
1172 {
1173   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1174                                              (__v8di)_mm512_max_epu64(__A, __B),
1175                                              (__v8di)_mm512_setzero_si512());
1176 }
1177 
/* Forwards A and B (as __v8df) and R (as int) to the minpd512 builtin and
 * yields the result as __m512d. */
#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
1181 
/* Masked form: passes U, the result of _mm512_min_round_pd(A, B, R), and W
 * (in that order) to the selectpd_512 builtin. */
#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_min_round_pd((A), (B), (R)), \
      (__v8df)(W)))
1186 
/* Zero-masked form: passes U, the result of _mm512_min_round_pd(A, B, R),
 * and an all-zero vector to the selectpd_512 builtin. */
#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_min_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
1191 
1192 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1193 _mm512_min_pd(__m512d __A, __m512d __B)
1194 {
1195   return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
1196                                            _MM_FROUND_CUR_DIRECTION);
1197 }
1198 
1199 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1200 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
1201 {
1202   return (__m512d)__builtin_ia32_selectpd_512(__U,
1203                                               (__v8df)_mm512_min_pd(__A, __B),
1204                                               (__v8df)__W);
1205 }
1206 
1207 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1208 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
1209 {
1210   return (__m512d)__builtin_ia32_selectpd_512(__U,
1211                                               (__v8df)_mm512_min_pd(__A, __B),
1212                                               (__v8df)_mm512_setzero_pd());
1213 }
1214 
/* Forwards A and B (as __v16sf) and R (as int) to the minps512 builtin and
 * yields the result as __m512. */
#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
1218 
/* Masked form: passes U, the result of _mm512_min_round_ps(A, B, R), and W
 * (in that order) to the selectps_512 builtin. */
#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
      (__v16sf)(W)))
1223 
/* Zero-masked form: passes U, the result of _mm512_min_round_ps(A, B, R),
 * and an all-zero vector to the selectps_512 builtin. */
#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
1228 
1229 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1230 _mm512_min_ps(__m512 __A, __m512 __B)
1231 {
1232   return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
1233                                           _MM_FROUND_CUR_DIRECTION);
1234 }
1235 
1236 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1237 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1238 {
1239   return (__m512)__builtin_ia32_selectps_512(__U,
1240                                              (__v16sf)_mm512_min_ps(__A, __B),
1241                                              (__v16sf)__W);
1242 }
1243 
1244 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1245 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
1246 {
1247   return (__m512)__builtin_ia32_selectps_512(__U,
1248                                              (__v16sf)_mm512_min_ps(__A, __B),
1249                                              (__v16sf)_mm512_setzero_ps());
1250 }
1251 
1252 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1253 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1254   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1255                 (__v4sf) __B,
1256                 (__v4sf) __W,
1257                 (__mmask8) __U,
1258                 _MM_FROUND_CUR_DIRECTION);
1259 }
1260 
1261 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1262 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1263   return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1264                 (__v4sf) __B,
1265                 (__v4sf)  _mm_setzero_ps (),
1266                 (__mmask8) __U,
1267                 _MM_FROUND_CUR_DIRECTION);
1268 }
1269 
/* Unmasked form: calls the minss_round_mask builtin on A and B with an
 * all-zero third operand, an all-ones mask ((__mmask8)-1), and R as int. */
#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
1275 
/* Masked form: calls the minss_round_mask builtin on A and B with W as the
 * third operand, mask U, and R as int. */
#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
1281 
/* Zero-masked form: calls the minss_round_mask builtin on A and B with an
 * all-zero third operand, mask U, and R as int. */
#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
1287 
1288 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1289 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1290   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1291                 (__v2df) __B,
1292                 (__v2df) __W,
1293                 (__mmask8) __U,
1294                 _MM_FROUND_CUR_DIRECTION);
1295 }
1296 
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1298 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1299   return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1300                 (__v2df) __B,
1301                 (__v2df)  _mm_setzero_pd (),
1302                 (__mmask8) __U,
1303                 _MM_FROUND_CUR_DIRECTION);
1304 }
1305 
/* Unmasked form: calls the minsd_round_mask builtin on A and B with an
 * all-zero third operand, an all-ones mask ((__mmask8)-1), and R as int. */
#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))
1311 
/* Masked form: calls the minsd_round_mask builtin on A and B with W as the
 * third operand, mask U, and R as int. */
#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
1317 
/* Zero-masked form: calls the minsd_round_mask builtin on A and B with an
 * all-zero third operand, mask U, and R as int. */
#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
1323 
1324 static __inline __m512i
1325 __DEFAULT_FN_ATTRS512
1326 _mm512_min_epi32(__m512i __A, __m512i __B)
1327 {
1328   return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
1329 }
1330 
1331 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1332 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1333 {
1334   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1335                                             (__v16si)_mm512_min_epi32(__A, __B),
1336                                             (__v16si)__W);
1337 }
1338 
1339 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1340 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
1341 {
1342   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1343                                             (__v16si)_mm512_min_epi32(__A, __B),
1344                                             (__v16si)_mm512_setzero_si512());
1345 }
1346 
1347 static __inline __m512i __DEFAULT_FN_ATTRS512
1348 _mm512_min_epu32(__m512i __A, __m512i __B)
1349 {
1350   return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
1351 }
1352 
1353 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1354 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1355 {
1356   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1357                                             (__v16si)_mm512_min_epu32(__A, __B),
1358                                             (__v16si)__W);
1359 }
1360 
1361 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1362 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
1363 {
1364   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1365                                             (__v16si)_mm512_min_epu32(__A, __B),
1366                                             (__v16si)_mm512_setzero_si512());
1367 }
1368 
1369 static __inline __m512i __DEFAULT_FN_ATTRS512
1370 _mm512_min_epi64(__m512i __A, __m512i __B)
1371 {
1372   return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
1373 }
1374 
1375 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1376 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1377 {
1378   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1379                                              (__v8di)_mm512_min_epi64(__A, __B),
1380                                              (__v8di)__W);
1381 }
1382 
1383 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1384 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
1385 {
1386   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1387                                              (__v8di)_mm512_min_epi64(__A, __B),
1388                                              (__v8di)_mm512_setzero_si512());
1389 }
1390 
1391 static __inline __m512i __DEFAULT_FN_ATTRS512
1392 _mm512_min_epu64(__m512i __A, __m512i __B)
1393 {
1394   return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
1395 }
1396 
1397 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1398 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1399 {
1400   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1401                                              (__v8di)_mm512_min_epu64(__A, __B),
1402                                              (__v8di)__W);
1403 }
1404 
1405 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1406 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
1407 {
1408   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1409                                              (__v8di)_mm512_min_epu64(__A, __B),
1410                                              (__v8di)_mm512_setzero_si512());
1411 }
1412 
1413 static __inline __m512i __DEFAULT_FN_ATTRS512
1414 _mm512_mul_epi32(__m512i __X, __m512i __Y)
1415 {
1416   return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
1417 }
1418 
1419 static __inline __m512i __DEFAULT_FN_ATTRS512
1420 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
1421 {
1422   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1423                                              (__v8di)_mm512_mul_epi32(__X, __Y),
1424                                              (__v8di)__W);
1425 }
1426 
1427 static __inline __m512i __DEFAULT_FN_ATTRS512
1428 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
1429 {
1430   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1431                                              (__v8di)_mm512_mul_epi32(__X, __Y),
1432                                              (__v8di)_mm512_setzero_si512 ());
1433 }
1434 
1435 static __inline __m512i __DEFAULT_FN_ATTRS512
1436 _mm512_mul_epu32(__m512i __X, __m512i __Y)
1437 {
1438   return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
1439 }
1440 
1441 static __inline __m512i __DEFAULT_FN_ATTRS512
1442 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
1443 {
1444   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1445                                              (__v8di)_mm512_mul_epu32(__X, __Y),
1446                                              (__v8di)__W);
1447 }
1448 
1449 static __inline __m512i __DEFAULT_FN_ATTRS512
1450 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
1451 {
1452   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1453                                              (__v8di)_mm512_mul_epu32(__X, __Y),
1454                                              (__v8di)_mm512_setzero_si512 ());
1455 }
1456 
1457 static __inline __m512i __DEFAULT_FN_ATTRS512
1458 _mm512_mullo_epi32 (__m512i __A, __m512i __B)
1459 {
1460   return (__m512i) ((__v16su) __A * (__v16su) __B);
1461 }
1462 
1463 static __inline __m512i __DEFAULT_FN_ATTRS512
1464 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
1465 {
1466   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1467                                              (__v16si)_mm512_mullo_epi32(__A, __B),
1468                                              (__v16si)_mm512_setzero_si512());
1469 }
1470 
1471 static __inline __m512i __DEFAULT_FN_ATTRS512
1472 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1473 {
1474   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1475                                              (__v16si)_mm512_mullo_epi32(__A, __B),
1476                                              (__v16si)__W);
1477 }
1478 
1479 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1480 _mm512_mullox_epi64 (__m512i __A, __m512i __B) {
1481   return (__m512i) ((__v8du) __A * (__v8du) __B);
1482 }
1483 
1484 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1485 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
1486   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1487                                              (__v8di)_mm512_mullox_epi64(__A, __B),
1488                                              (__v8di)__W);
1489 }
1490 
/* Forwards A (as __v8df) and R (as int) to the sqrtpd512 builtin and
 * yields the result as __m512d. */
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512( \
      (__v8df)(__m512d)(A), (int)(R)))
1493 
/* Masked form: passes U, the result of _mm512_sqrt_round_pd(A, R), and W
 * (in that order) to the selectpd_512 builtin. */
#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sqrt_round_pd((A), (R)), \
      (__v8df)(__m512d)(W)))
1498 
/* Zero-masked form: passes U, the result of _mm512_sqrt_round_pd(A, R),
 * and an all-zero vector to the selectpd_512 builtin. */
#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_sqrt_round_pd((A), (R)), \
      (__v8df)_mm512_setzero_pd()))
1503 
1504 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1505 _mm512_sqrt_pd(__m512d __A)
1506 {
1507   return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
1508                                            _MM_FROUND_CUR_DIRECTION);
1509 }
1510 
1511 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1512 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
1513 {
1514   return (__m512d)__builtin_ia32_selectpd_512(__U,
1515                                               (__v8df)_mm512_sqrt_pd(__A),
1516                                               (__v8df)__W);
1517 }
1518 
1519 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1520 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
1521 {
1522   return (__m512d)__builtin_ia32_selectpd_512(__U,
1523                                               (__v8df)_mm512_sqrt_pd(__A),
1524                                               (__v8df)_mm512_setzero_pd());
1525 }
1526 
/* Forwards A (as __v16sf) and R (as int) to the sqrtps512 builtin and
 * yields the result as __m512. */
#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512( \
      (__v16sf)(__m512)(A), (int)(R)))
1529 
/* Masked form: passes U, the result of _mm512_sqrt_round_ps(A, R), and W
 * (in that order) to the selectps_512 builtin. */
#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
      (__v16sf)(__m512)(W)))
1534 
/* Zero-masked form: passes U, the result of _mm512_sqrt_round_ps(A, R),
 * and an all-zero vector to the selectps_512 builtin. */
#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
      (__v16sf)_mm512_setzero_ps()))
1539 
1540 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1541 _mm512_sqrt_ps(__m512 __A)
1542 {
1543   return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
1544                                           _MM_FROUND_CUR_DIRECTION);
1545 }
1546 
1547 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1548 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
1549 {
1550   return (__m512)__builtin_ia32_selectps_512(__U,
1551                                              (__v16sf)_mm512_sqrt_ps(__A),
1552                                              (__v16sf)__W);
1553 }
1554 
1555 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1556 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
1557 {
1558   return (__m512)__builtin_ia32_selectps_512(__U,
1559                                              (__v16sf)_mm512_sqrt_ps(__A),
1560                                              (__v16sf)_mm512_setzero_ps());
1561 }
1562 
1563 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1564 _mm512_rsqrt14_pd(__m512d __A)
1565 {
1566   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1567                  (__v8df)
1568                  _mm512_setzero_pd (),
1569                  (__mmask8) -1);}
1570 
1571 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1572 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
1573 {
1574   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1575                   (__v8df) __W,
1576                   (__mmask8) __U);
1577 }
1578 
1579 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1580 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
1581 {
1582   return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1583                   (__v8df)
1584                   _mm512_setzero_pd (),
1585                   (__mmask8) __U);
1586 }
1587 
1588 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1589 _mm512_rsqrt14_ps(__m512 __A)
1590 {
1591   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1592                 (__v16sf)
1593                 _mm512_setzero_ps (),
1594                 (__mmask16) -1);
1595 }
1596 
1597 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1598 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
1599 {
1600   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1601                  (__v16sf) __W,
1602                  (__mmask16) __U);
1603 }
1604 
1605 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1606 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
1607 {
1608   return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
1609                  (__v16sf)
1610                  _mm512_setzero_ps (),
1611                  (__mmask16) __U);
1612 }
1613 
1614 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
1615 _mm_rsqrt14_ss(__m128 __A, __m128 __B)
1616 {
1617   return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1618              (__v4sf) __B,
1619              (__v4sf)
1620              _mm_setzero_ps (),
1621              (__mmask8) -1);
1622 }
1623 
1624 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1625 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
1626 {
1627  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1628           (__v4sf) __B,
1629           (__v4sf) __W,
1630           (__mmask8) __U);
1631 }
1632 
1633 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1634 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
1635 {
1636  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
1637           (__v4sf) __B,
1638           (__v4sf) _mm_setzero_ps (),
1639           (__mmask8) __U);
1640 }
1641 
1642 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
1643 _mm_rsqrt14_sd(__m128d __A, __m128d __B)
1644 {
1645   return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
1646               (__v2df) __B,
1647               (__v2df)
1648               _mm_setzero_pd (),
1649               (__mmask8) -1);
1650 }
1651 
1652 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1653 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
1654 {
1655  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
1656           (__v2df) __B,
1657           (__v2df) __W,
1658           (__mmask8) __U);
1659 }
1660 
1661 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1662 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
1663 {
1664  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
1665           (__v2df) __B,
1666           (__v2df) _mm_setzero_pd (),
1667           (__mmask8) __U);
1668 }
1669 
1670 static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1671 _mm512_rcp14_pd(__m512d __A)
1672 {
1673   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1674                (__v8df)
1675                _mm512_setzero_pd (),
1676                (__mmask8) -1);
1677 }
1678 
1679 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1680 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
1681 {
1682   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1683                 (__v8df) __W,
1684                 (__mmask8) __U);
1685 }
1686 
1687 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1688 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
1689 {
1690   return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
1691                 (__v8df)
1692                 _mm512_setzero_pd (),
1693                 (__mmask8) __U);
1694 }
1695 
1696 static  __inline__ __m512 __DEFAULT_FN_ATTRS512
1697 _mm512_rcp14_ps(__m512 __A)
1698 {
1699   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1700               (__v16sf)
1701               _mm512_setzero_ps (),
1702               (__mmask16) -1);
1703 }
1704 
1705 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1706 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
1707 {
1708   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1709                    (__v16sf) __W,
1710                    (__mmask16) __U);
1711 }
1712 
1713 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1714 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
1715 {
1716   return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
1717                    (__v16sf)
1718                    _mm512_setzero_ps (),
1719                    (__mmask16) __U);
1720 }
1721 
1722 static  __inline__ __m128 __DEFAULT_FN_ATTRS128
1723 _mm_rcp14_ss(__m128 __A, __m128 __B)
1724 {
1725   return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1726                  (__v4sf) __B,
1727                  (__v4sf)
1728                  _mm_setzero_ps (),
1729                  (__mmask8) -1);
1730 }
1731 
1732 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1733 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
1734 {
1735  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1736           (__v4sf) __B,
1737           (__v4sf) __W,
1738           (__mmask8) __U);
1739 }
1740 
1741 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1742 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
1743 {
1744  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
1745           (__v4sf) __B,
1746           (__v4sf) _mm_setzero_ps (),
1747           (__mmask8) __U);
1748 }
1749 
1750 static  __inline__ __m128d __DEFAULT_FN_ATTRS128
1751 _mm_rcp14_sd(__m128d __A, __m128d __B)
1752 {
1753   return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
1754             (__v2df) __B,
1755             (__v2df)
1756             _mm_setzero_pd (),
1757             (__mmask8) -1);
1758 }
1759 
1760 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1761 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
1762 {
1763  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
1764           (__v2df) __B,
1765           (__v2df) __W,
1766           (__mmask8) __U);
1767 }
1768 
1769 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1770 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
1771 {
1772  return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
1773           (__v2df) __B,
1774           (__v2df) _mm_setzero_pd (),
1775           (__mmask8) __U);
1776 }
1777 
1778 static __inline __m512 __DEFAULT_FN_ATTRS512
1779 _mm512_floor_ps(__m512 __A)
1780 {
1781   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1782                                                   _MM_FROUND_FLOOR,
1783                                                   (__v16sf) __A, (unsigned short)-1,
1784                                                   _MM_FROUND_CUR_DIRECTION);
1785 }
1786 
1787 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1788 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
1789 {
1790   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1791                    _MM_FROUND_FLOOR,
1792                    (__v16sf) __W, __U,
1793                    _MM_FROUND_CUR_DIRECTION);
1794 }
1795 
1796 static __inline __m512d __DEFAULT_FN_ATTRS512
1797 _mm512_floor_pd(__m512d __A)
1798 {
1799   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1800                                                    _MM_FROUND_FLOOR,
1801                                                    (__v8df) __A, (unsigned char)-1,
1802                                                    _MM_FROUND_CUR_DIRECTION);
1803 }
1804 
1805 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1806 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
1807 {
1808   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1809                 _MM_FROUND_FLOOR,
1810                 (__v8df) __W, __U,
1811                 _MM_FROUND_CUR_DIRECTION);
1812 }
1813 
1814 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1815 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
1816 {
1817   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1818                    _MM_FROUND_CEIL,
1819                    (__v16sf) __W, __U,
1820                    _MM_FROUND_CUR_DIRECTION);
1821 }
1822 
1823 static __inline __m512 __DEFAULT_FN_ATTRS512
1824 _mm512_ceil_ps(__m512 __A)
1825 {
1826   return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1827                                                   _MM_FROUND_CEIL,
1828                                                   (__v16sf) __A, (unsigned short)-1,
1829                                                   _MM_FROUND_CUR_DIRECTION);
1830 }
1831 
1832 static __inline __m512d __DEFAULT_FN_ATTRS512
1833 _mm512_ceil_pd(__m512d __A)
1834 {
1835   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1836                                                    _MM_FROUND_CEIL,
1837                                                    (__v8df) __A, (unsigned char)-1,
1838                                                    _MM_FROUND_CUR_DIRECTION);
1839 }
1840 
1841 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1842 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
1843 {
1844   return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1845                 _MM_FROUND_CEIL,
1846                 (__v8df) __W, __U,
1847                 _MM_FROUND_CUR_DIRECTION);
1848 }
1849 
1850 static __inline __m512i __DEFAULT_FN_ATTRS512
1851 _mm512_abs_epi64(__m512i __A)
1852 {
1853   return (__m512i)__builtin_elementwise_abs((__v8di)__A);
1854 }
1855 
1856 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1857 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
1858 {
1859   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1860                                              (__v8di)_mm512_abs_epi64(__A),
1861                                              (__v8di)__W);
1862 }
1863 
1864 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1865 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
1866 {
1867   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
1868                                              (__v8di)_mm512_abs_epi64(__A),
1869                                              (__v8di)_mm512_setzero_si512());
1870 }
1871 
1872 static __inline __m512i __DEFAULT_FN_ATTRS512
1873 _mm512_abs_epi32(__m512i __A)
1874 {
1875   return (__m512i)__builtin_elementwise_abs((__v16si) __A);
1876 }
1877 
1878 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1879 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
1880 {
1881   return (__m512i)__builtin_ia32_selectd_512(__U,
1882                                              (__v16si)_mm512_abs_epi32(__A),
1883                                              (__v16si)__W);
1884 }
1885 
1886 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1887 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
1888 {
1889   return (__m512i)__builtin_ia32_selectd_512(__U,
1890                                              (__v16si)_mm512_abs_epi32(__A),
1891                                              (__v16si)_mm512_setzero_si512());
1892 }
1893 
1894 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1895 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1896   __A = _mm_add_ss(__A, __B);
1897   return __builtin_ia32_selectss_128(__U, __A, __W);
1898 }
1899 
1900 static __inline__ __m128 __DEFAULT_FN_ATTRS128
1901 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1902   __A = _mm_add_ss(__A, __B);
1903   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
1904 }
1905 
/* Expands to __builtin_ia32_addss_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
1911 
/* Expands to __builtin_ia32_addss_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
1917 
/* Expands to __builtin_ia32_addss_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
1923 
1924 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1925 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1926   __A = _mm_add_sd(__A, __B);
1927   return __builtin_ia32_selectsd_128(__U, __A, __W);
1928 }
1929 
1930 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1931 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1932   __A = _mm_add_sd(__A, __B);
1933   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
1934 }
/* Expands to __builtin_ia32_addsd_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
1940 
/* Expands to __builtin_ia32_addsd_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
1946 
/* Expands to __builtin_ia32_addsd_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
1952 
1953 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1954 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
1955   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
1956                                               (__v8df)_mm512_add_pd(__A, __B),
1957                                               (__v8df)__W);
1958 }
1959 
1960 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1961 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
1962   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
1963                                               (__v8df)_mm512_add_pd(__A, __B),
1964                                               (__v8df)_mm512_setzero_pd());
1965 }
1966 
1967 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1968 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
1969   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
1970                                              (__v16sf)_mm512_add_ps(__A, __B),
1971                                              (__v16sf)__W);
1972 }
1973 
1974 static __inline__ __m512 __DEFAULT_FN_ATTRS512
1975 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
1976   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
1977                                              (__v16sf)_mm512_add_ps(__A, __B),
1978                                              (__v16sf)_mm512_setzero_ps());
1979 }
1980 
/* Expands to __builtin_ia32_addpd512 on A and B with rounding argument R. */
#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
1984 
/* Evaluates _mm512_add_round_pd(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectpd_512. */
#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_add_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))
1989 
/* Evaluates _mm512_add_round_pd(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectpd_512. */
#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_add_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
1994 
/* Expands to __builtin_ia32_addps512 on A and B with rounding argument R. */
#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
1998 
/* Evaluates _mm512_add_round_ps(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectps_512. */
#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))
2003 
/* Evaluates _mm512_add_round_ps(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectps_512. */
#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2008 
2009 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2010 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2011   __A = _mm_sub_ss(__A, __B);
2012   return __builtin_ia32_selectss_128(__U, __A, __W);
2013 }
2014 
2015 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2016 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2017   __A = _mm_sub_ss(__A, __B);
2018   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2019 }
/* Expands to __builtin_ia32_subss_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
2025 
/* Expands to __builtin_ia32_subss_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
2031 
/* Expands to __builtin_ia32_subss_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
2037 
2038 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2039 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2040   __A = _mm_sub_sd(__A, __B);
2041   return __builtin_ia32_selectsd_128(__U, __A, __W);
2042 }
2043 
2044 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2045 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2046   __A = _mm_sub_sd(__A, __B);
2047   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2048 }
2049 
/* Expands to __builtin_ia32_subsd_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
2055 
/* Expands to __builtin_ia32_subsd_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
2061 
/* Expands to __builtin_ia32_subsd_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
2067 
2068 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2069 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2070   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2071                                               (__v8df)_mm512_sub_pd(__A, __B),
2072                                               (__v8df)__W);
2073 }
2074 
2075 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2076 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2077   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2078                                               (__v8df)_mm512_sub_pd(__A, __B),
2079                                               (__v8df)_mm512_setzero_pd());
2080 }
2081 
2082 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2083 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2084   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2085                                              (__v16sf)_mm512_sub_ps(__A, __B),
2086                                              (__v16sf)__W);
2087 }
2088 
2089 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2090 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2091   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2092                                              (__v16sf)_mm512_sub_ps(__A, __B),
2093                                              (__v16sf)_mm512_setzero_ps());
2094 }
2095 
/* Expands to __builtin_ia32_subpd512 on A and B with rounding argument R. */
#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
2099 
/* Evaluates _mm512_sub_round_pd(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectpd_512. */
#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))
2104 
/* Evaluates _mm512_sub_round_pd(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectpd_512. */
#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2109 
/* Expands to __builtin_ia32_subps512 on A and B with rounding argument R. */
#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
2113 
/* Evaluates _mm512_sub_round_ps(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectps_512. */
#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))
2118 
/* Evaluates _mm512_sub_round_ps(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectps_512. */
#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2123 
2124 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2125 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2126   __A = _mm_mul_ss(__A, __B);
2127   return __builtin_ia32_selectss_128(__U, __A, __W);
2128 }
2129 
2130 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2131 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2132   __A = _mm_mul_ss(__A, __B);
2133   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2134 }
/* Expands to __builtin_ia32_mulss_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
2140 
/* Expands to __builtin_ia32_mulss_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
2146 
/* Expands to __builtin_ia32_mulss_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
2152 
2153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2154 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2155   __A = _mm_mul_sd(__A, __B);
2156   return __builtin_ia32_selectsd_128(__U, __A, __W);
2157 }
2158 
2159 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2160 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2161   __A = _mm_mul_sd(__A, __B);
2162   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2163 }
2164 
/* Expands to __builtin_ia32_mulsd_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
2170 
/* Expands to __builtin_ia32_mulsd_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
2176 
/* Expands to __builtin_ia32_mulsd_round_mask on A and B with an all-zero
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
2182 
2183 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2184 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2185   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2186                                               (__v8df)_mm512_mul_pd(__A, __B),
2187                                               (__v8df)__W);
2188 }
2189 
2190 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2191 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2192   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2193                                               (__v8df)_mm512_mul_pd(__A, __B),
2194                                               (__v8df)_mm512_setzero_pd());
2195 }
2196 
2197 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2198 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2199   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2200                                              (__v16sf)_mm512_mul_ps(__A, __B),
2201                                              (__v16sf)__W);
2202 }
2203 
2204 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2205 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2206   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2207                                              (__v16sf)_mm512_mul_ps(__A, __B),
2208                                              (__v16sf)_mm512_setzero_ps());
2209 }
2210 
/* Expands to __builtin_ia32_mulpd512 on A and B with rounding argument R. */
#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))
2214 
/* Evaluates _mm512_mul_round_pd(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectpd_512. */
#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))
2219 
/* Evaluates _mm512_mul_round_pd(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectpd_512. */
#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2224 
/* Expands to __builtin_ia32_mulps512 on A and B with rounding argument R. */
#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))
2228 
/* Evaluates _mm512_mul_round_ps(A, B, R) and hands the result, the mask U
 * and W (as the alternative operand) to __builtin_ia32_selectps_512. */
#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))
2233 
/* Evaluates _mm512_mul_round_ps(A, B, R) and hands the result, the mask U
 * and an all-zero vector (as the alternative operand) to
 * __builtin_ia32_selectps_512. */
#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2238 
2239 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2240 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2241   __A = _mm_div_ss(__A, __B);
2242   return __builtin_ia32_selectss_128(__U, __A, __W);
2243 }
2244 
2245 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2246 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2247   __A = _mm_div_ss(__A, __B);
2248   return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2249 }
2250 
/* Expands to __builtin_ia32_divss_round_mask on A and B with an all-zero
 * pass-through vector, an all-ones mask and rounding argument R. */
#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
2256 
/* Expands to __builtin_ia32_divss_round_mask on A and B with W as the
 * pass-through vector, U as the mask and rounding argument R. */
#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
2262 
2263 #define _mm_maskz_div_round_ss(U, A, B, R) \
2264   ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
2265                                            (__v4sf)(__m128)(B), \
2266                                            (__v4sf)_mm_setzero_ps(), \
2267                                            (__mmask8)(U), (int)(R)))
2268 
2269 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2270 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
2271   __A = _mm_div_sd(__A, __B);
2272   return __builtin_ia32_selectsd_128(__U, __A, __W);
2273 }
2274 
2275 static __inline__ __m128d __DEFAULT_FN_ATTRS128
2276 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
2277   __A = _mm_div_sd(__A, __B);
2278   return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
2279 }
2280 
/* Scalar double-precision divide with an explicit rounding argument.
 * Expands to __builtin_ia32_divsd_round_mask with A and B as the first two
 * operands, _mm_setzero_pd() as the third, an all-ones 8-bit mask, and R
 * converted to int. */
#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

/* Masked form: the third builtin operand is W and the mask is U. */
#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))

/* Zero-masked form: the third builtin operand is _mm_setzero_pd() and the
 * mask is U. */
#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
2298 
2299 static __inline __m512d __DEFAULT_FN_ATTRS512
2300 _mm512_div_pd(__m512d __a, __m512d __b)
2301 {
2302   return (__m512d)((__v8df)__a/(__v8df)__b);
2303 }
2304 
2305 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2306 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
2307   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2308                                               (__v8df)_mm512_div_pd(__A, __B),
2309                                               (__v8df)__W);
2310 }
2311 
2312 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2313 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
2314   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
2315                                               (__v8df)_mm512_div_pd(__A, __B),
2316                                               (__v8df)_mm512_setzero_pd());
2317 }
2318 
2319 static __inline __m512 __DEFAULT_FN_ATTRS512
2320 _mm512_div_ps(__m512 __a, __m512 __b)
2321 {
2322   return (__m512)((__v16sf)__a/(__v16sf)__b);
2323 }
2324 
2325 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2326 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
2327   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2328                                              (__v16sf)_mm512_div_ps(__A, __B),
2329                                              (__v16sf)__W);
2330 }
2331 
2332 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2333 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
2334   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
2335                                              (__v16sf)_mm512_div_ps(__A, __B),
2336                                              (__v16sf)_mm512_setzero_ps());
2337 }
2338 
/* Divides the eight double lanes of A by those of B through
 * __builtin_ia32_divpd512, forwarding R (converted to int) as the builtin's
 * last argument. */
#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(R)))

/* Masked form: the result of _mm512_div_round_pd(A, B, R) and W are handed
 * to __builtin_ia32_selectpd_512 with U as the 8-bit mask. */
#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_div_round_pd((A), (B), (R)), \
      (__v8df)(__m512d)(W)))

/* Zero-masked form: like the masked form, but the last select operand is
 * _mm512_setzero_pd(). */
#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_div_round_pd((A), (B), (R)), \
      (__v8df)_mm512_setzero_pd()))
2352 
/* Divides the sixteen float lanes of A by those of B through
 * __builtin_ia32_divps512, forwarding R (converted to int) as the builtin's
 * last argument. */
#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(R)))

/* Masked form: the result of _mm512_div_round_ps(A, B, R) and W are handed
 * to __builtin_ia32_selectps_512 with U as the 16-bit mask. */
#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
      (__v16sf)(__m512)(W)))

/* Zero-masked form: like the masked form, but the last select operand is
 * _mm512_setzero_ps(). */
#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
      (__v16sf)_mm512_setzero_ps()))
2366 
/* All six macros below expand to __builtin_ia32_rndscaleps_mask, whose
 * arguments are, in order: the input vector, an int immediate, a second
 * vector, a 16-bit mask, and a rounding argument. They differ only in what is
 * supplied for the last three. */

/* Input A, immediate imm; second vector is _mm512_undefined_ps(), mask is all
 * ones, rounding argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm512_roundscale_ps(A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is W, mask is U, rounding argument is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_roundscale_ps(W, U, A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)(__m512)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is _mm512_setzero_ps(), mask is U,
 * rounding argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_roundscale_ps(U, A, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is W, mask is U, rounding argument is
 * R converted to int. */
#define _mm512_mask_roundscale_round_ps(W, U, A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)(__m512)(W), (__mmask16)(U), \
      (int)(R)))

/* Input A, immediate imm; second vector is _mm512_setzero_ps(), mask is U,
 * rounding argument is R converted to int. */
#define _mm512_maskz_roundscale_round_ps(U, A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))

/* Input A, immediate imm; second vector is _mm512_undefined_ps(), mask is all
 * ones, rounding argument is R converted to int. */
#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask( \
      (__v16sf)(__m512)(A), (int)(imm), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, (int)(R)))
2398 
/* All six macros below expand to __builtin_ia32_rndscalepd_mask, whose
 * arguments are, in order: the input vector, an int immediate, a second
 * vector, an 8-bit mask, and a rounding argument. They differ only in what is
 * supplied for the last three. */

/* Input A, immediate imm; second vector is _mm512_undefined_pd(), mask is all
 * ones, rounding argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm512_roundscale_pd(A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is W, mask is U, rounding argument is
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_roundscale_pd(W, U, A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)(__m512d)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is _mm512_setzero_pd(), mask is U,
 * rounding argument is _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_roundscale_pd(U, A, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

/* Input A, immediate imm; second vector is W, mask is U, rounding argument is
 * R converted to int. */
#define _mm512_mask_roundscale_round_pd(W, U, A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)(__m512d)(W), (__mmask8)(U), \
      (int)(R)))

/* Input A, immediate imm; second vector is _mm512_setzero_pd(), mask is U,
 * rounding argument is R converted to int. */
#define _mm512_maskz_roundscale_round_pd(U, A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

/* Input A, immediate imm; second vector is _mm512_undefined_pd(), mask is all
 * ones, rounding argument is R converted to int. */
#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask( \
      (__v8df)(__m512d)(A), (int)(imm), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, (int)(R)))
2430 
/* Double-precision fmadd with an explicit rounding argument. Each macro
 * passes A, B and C unmodified (as __v8df) to one of the vfmaddpd512 builtins,
 * followed by an 8-bit mask and R converted to int. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2457 
2458 
/* Double-precision fmsub with an explicit rounding argument, built on the
 * vfmaddpd512 builtins by negating the C operand. R is converted to int and
 * passed last. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2478 
2479 
/* Double-precision fnmadd with an explicit rounding argument, built on the
 * vfmaddpd512 builtins by negating the A operand. R is converted to int and
 * passed last. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2499 
2500 
/* Double-precision fnmsub with an explicit rounding argument, built on the
 * vfmaddpd512 builtins by negating both the A and C operands. R is converted
 * to int and passed last. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz( \
      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2513 
2514 
2515 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2516 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
2517 {
2518   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2519                                                     (__v8df) __B,
2520                                                     (__v8df) __C,
2521                                                     (__mmask8) -1,
2522                                                     _MM_FROUND_CUR_DIRECTION);
2523 }
2524 
2525 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2526 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2527 {
2528   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2529                                                     (__v8df) __B,
2530                                                     (__v8df) __C,
2531                                                     (__mmask8) __U,
2532                                                     _MM_FROUND_CUR_DIRECTION);
2533 }
2534 
2535 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2536 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2537 {
2538   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
2539                                                      (__v8df) __B,
2540                                                      (__v8df) __C,
2541                                                      (__mmask8) __U,
2542                                                      _MM_FROUND_CUR_DIRECTION);
2543 }
2544 
2545 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2546 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2547 {
2548   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
2549                                                      (__v8df) __B,
2550                                                      (__v8df) __C,
2551                                                      (__mmask8) __U,
2552                                                      _MM_FROUND_CUR_DIRECTION);
2553 }
2554 
2555 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2556 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
2557 {
2558   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2559                                                     (__v8df) __B,
2560                                                     -(__v8df) __C,
2561                                                     (__mmask8) -1,
2562                                                     _MM_FROUND_CUR_DIRECTION);
2563 }
2564 
2565 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2566 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2567 {
2568   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2569                                                     (__v8df) __B,
2570                                                     -(__v8df) __C,
2571                                                     (__mmask8) __U,
2572                                                     _MM_FROUND_CUR_DIRECTION);
2573 }
2574 
2575 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2576 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2577 {
2578   return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
2579                                                      (__v8df) __B,
2580                                                      -(__v8df) __C,
2581                                                      (__mmask8) __U,
2582                                                      _MM_FROUND_CUR_DIRECTION);
2583 }
2584 
2585 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2586 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
2587 {
2588   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2589                                                     -(__v8df) __B,
2590                                                     (__v8df) __C,
2591                                                     (__mmask8) -1,
2592                                                     _MM_FROUND_CUR_DIRECTION);
2593 }
2594 
2595 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2596 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2597 {
2598   return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
2599                                                      (__v8df) __B,
2600                                                      (__v8df) __C,
2601                                                      (__mmask8) __U,
2602                                                      _MM_FROUND_CUR_DIRECTION);
2603 }
2604 
2605 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2606 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2607 {
2608   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
2609                                                      (__v8df) __B,
2610                                                      (__v8df) __C,
2611                                                      (__mmask8) __U,
2612                                                      _MM_FROUND_CUR_DIRECTION);
2613 }
2614 
2615 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2616 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
2617 {
2618   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
2619                                                     -(__v8df) __B,
2620                                                     -(__v8df) __C,
2621                                                     (__mmask8) -1,
2622                                                     _MM_FROUND_CUR_DIRECTION);
2623 }
2624 
2625 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2626 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2627 {
2628   return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
2629                                                      (__v8df) __B,
2630                                                      -(__v8df) __C,
2631                                                      (__mmask8) __U,
2632                                                      _MM_FROUND_CUR_DIRECTION);
2633 }
2634 
/* Single-precision fmadd with an explicit rounding argument. Each macro
 * passes A, B and C unmodified (as __v16sf) to one of the vfmaddps512
 * builtins, followed by a 16-bit mask and R converted to int. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2661 
2662 
/* Single-precision fmsub with an explicit rounding argument, built on the
 * vfmaddps512 builtins by negating the C operand. R is converted to int and
 * passed last. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2682 
2683 
/* Single-precision fnmadd with an explicit rounding argument, built on the
 * vfmaddps512 builtins by negating one multiplicand. R is converted to int
 * and passed last. */

/* Unmasked: _mask builtin with an all-ones mask; the B operand is negated. */
#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

/* _mask3 builtin with caller mask U; the A operand is negated. */
#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))

/* _maskz builtin with caller mask U; the A operand is negated. */
#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2703 
2704 
/* Single-precision fnmsub with an explicit rounding argument, built on the
 * vfmaddps512 builtins by negating one multiplicand and the C operand. R is
 * converted to int and passed last. */

/* Unmasked: _mask builtin with an all-ones mask; B and C are negated. */
#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask( \
      (__v16sf)(__m512)(A), -(__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)-1, (int)(R)))

/* _maskz builtin with caller mask U; A and C are negated. */
#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz( \
      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
      (__mmask16)(U), (int)(R)))
2717 
2718 
2719 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2720 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
2721 {
2722   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2723                                                    (__v16sf) __B,
2724                                                    (__v16sf) __C,
2725                                                    (__mmask16) -1,
2726                                                    _MM_FROUND_CUR_DIRECTION);
2727 }
2728 
2729 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2730 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
2731 {
2732   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2733                                                    (__v16sf) __B,
2734                                                    (__v16sf) __C,
2735                                                    (__mmask16) __U,
2736                                                    _MM_FROUND_CUR_DIRECTION);
2737 }
2738 
2739 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2740 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
2741 {
2742   return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
2743                                                     (__v16sf) __B,
2744                                                     (__v16sf) __C,
2745                                                     (__mmask16) __U,
2746                                                     _MM_FROUND_CUR_DIRECTION);
2747 }
2748 
2749 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2750 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2751 {
2752   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
2753                                                     (__v16sf) __B,
2754                                                     (__v16sf) __C,
2755                                                     (__mmask16) __U,
2756                                                     _MM_FROUND_CUR_DIRECTION);
2757 }
2758 
2759 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2760 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
2761 {
2762   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2763                                                    (__v16sf) __B,
2764                                                    -(__v16sf) __C,
2765                                                    (__mmask16) -1,
2766                                                    _MM_FROUND_CUR_DIRECTION);
2767 }
2768 
2769 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2770 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
2771 {
2772   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2773                                                    (__v16sf) __B,
2774                                                    -(__v16sf) __C,
2775                                                    (__mmask16) __U,
2776                                                    _MM_FROUND_CUR_DIRECTION);
2777 }
2778 
2779 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2780 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2781 {
2782   return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
2783                                                     (__v16sf) __B,
2784                                                     -(__v16sf) __C,
2785                                                     (__mmask16) __U,
2786                                                     _MM_FROUND_CUR_DIRECTION);
2787 }
2788 
2789 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2790 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
2791 {
2792   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2793                                                    -(__v16sf) __B,
2794                                                    (__v16sf) __C,
2795                                                    (__mmask16) -1,
2796                                                    _MM_FROUND_CUR_DIRECTION);
2797 }
2798 
2799 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2800 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
2801 {
2802   return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
2803                                                     (__v16sf) __B,
2804                                                     (__v16sf) __C,
2805                                                     (__mmask16) __U,
2806                                                     _MM_FROUND_CUR_DIRECTION);
2807 }
2808 
2809 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2810 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2811 {
2812   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
2813                                                     (__v16sf) __B,
2814                                                     (__v16sf) __C,
2815                                                     (__mmask16) __U,
2816                                                     _MM_FROUND_CUR_DIRECTION);
2817 }
2818 
2819 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2820 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
2821 {
2822   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
2823                                                    -(__v16sf) __B,
2824                                                    -(__v16sf) __C,
2825                                                    (__mmask16) -1,
2826                                                    _MM_FROUND_CUR_DIRECTION);
2827 }
2828 
2829 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2830 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
2831 {
2832   return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
2833                                                     (__v16sf) __B,
2834                                                     -(__v16sf) __C,
2835                                                     (__mmask16) __U,
2836                                                     _MM_FROUND_CUR_DIRECTION);
2837 }
2838 
/* Double-precision fmaddsub with an explicit rounding argument. Each macro
 * passes A, B and C unmodified (as __v8df) to one of the vfmaddsubpd512
 * builtins, followed by an 8-bit mask and R converted to int. */

/* Unmasked: _mask builtin with an all-ones mask. */
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)-1, (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
      (__mmask8)(U), (int)(R)))
2865 
2866 
2867 #define _mm512_fmsubadd_round_pd(A, B, C, R) \
2868   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
2869                                                (__v8df)(__m512d)(B), \
2870                                                -(__v8df)(__m512d)(C), \
2871                                                (__mmask8)-1, (int)(R)))
2872 
2873 
/* Write-masked fmsubadd on packed doubles with explicit rounding R.
 * Implemented as the fmaddsub _mask builtin applied to (A, B, -C) under
 * mask U. NOTE(review): masked-off lanes presumably keep A -- confirm. */
2874 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
2875   ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
2876                                                (__v8df)(__m512d)(B), \
2877                                                -(__v8df)(__m512d)(C), \
2878                                                (__mmask8)(U), (int)(R)))
2879 
2880 
/* maskz variant of fmsubadd on packed doubles with explicit rounding R.
 * Implemented as the fmaddsub _maskz builtin applied to (A, B, -C) under
 * mask U. NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
2881 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
2882   ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
2883                                                 (__v8df)(__m512d)(B), \
2884                                                 -(__v8df)(__m512d)(C), \
2885                                                 (__mmask8)(U), (int)(R)))
2886 
2887 
/* Unmasked fmaddsub on packed doubles: passes __A, __B, __C (as __v8df) to
 * __builtin_ia32_vfmaddsubpd512_mask with an all-ones __mmask8 and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
2888 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2889 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
2890 {
2891   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2892                                                       (__v8df) __B,
2893                                                       (__v8df) __C,
2894                                                       (__mmask8) -1,
2895                                                       _MM_FROUND_CUR_DIRECTION);
2896 }
2897 
/* Write-masked fmaddsub on packed doubles: same builtin as the unmasked
 * form with __U as the __mmask8 and _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
2898 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2899 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2900 {
2901   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2902                                                       (__v8df) __B,
2903                                                       (__v8df) __C,
2904                                                       (__mmask8) __U,
2905                                                       _MM_FROUND_CUR_DIRECTION);
2906 }
2907 
/* mask3 variant of fmaddsub on packed doubles: forwards to
 * __builtin_ia32_vfmaddsubpd512_mask3 with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
2908 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2909 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
2910 {
2911   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
2912                                                        (__v8df) __B,
2913                                                        (__v8df) __C,
2914                                                        (__mmask8) __U,
2915                                                        _MM_FROUND_CUR_DIRECTION);
2916 }
2917 
/* maskz variant of fmaddsub on packed doubles: forwards to
 * __builtin_ia32_vfmaddsubpd512_maskz with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
2918 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2919 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2920 {
2921   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
2922                                                        (__v8df) __B,
2923                                                        (__v8df) __C,
2924                                                        (__mmask8) __U,
2925                                                        _MM_FROUND_CUR_DIRECTION);
2926 }
2927 
/* Unmasked fmsubadd on packed doubles. Expressed through the fmaddsub
 * builtin by negating __C; the mask is all ones and rounding is
 * _MM_FROUND_CUR_DIRECTION. */
2928 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2929 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
2930 {
2931   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2932                                                        (__v8df) __B,
2933                                                        -(__v8df) __C,
2934                                                        (__mmask8) -1,
2935                                                        _MM_FROUND_CUR_DIRECTION);
2936 }
2937 
/* Write-masked fmsubadd on packed doubles. Expressed as the fmaddsub _mask
 * builtin applied to (__A, __B, -__C) under mask __U, with
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
2938 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2939 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
2940 {
2941   return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
2942                                                        (__v8df) __B,
2943                                                        -(__v8df) __C,
2944                                                        (__mmask8) __U,
2945                                                        _MM_FROUND_CUR_DIRECTION);
2946 }
2947 
/* maskz variant of fmsubadd on packed doubles. Expressed as the fmaddsub
 * _maskz builtin applied to (__A, __B, -__C) under mask __U, with
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
2948 static __inline__ __m512d __DEFAULT_FN_ATTRS512
2949 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
2950 {
2951   return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
2952                                                         (__v8df) __B,
2953                                                         -(__v8df) __C,
2954                                                         (__mmask8) __U,
2955                                                         _MM_FROUND_CUR_DIRECTION);
2956 }
2957 
/* Unmasked fmaddsub on 16 packed floats with an explicit rounding argument.
 * A, B and C are cast to __v16sf and handed to
 * __builtin_ia32_vfmaddsubps512_mask together with an all-ones __mmask16;
 * R is forwarded as the int rounding argument. Result is cast to __m512. */
2958 #define _mm512_fmaddsub_round_ps(A, B, C, R) \
2959   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
2960                                               (__v16sf)(__m512)(B), \
2961                                               (__v16sf)(__m512)(C), \
2962                                               (__mmask16)-1, (int)(R)))
2963 
2964 
/* Write-masked fmaddsub on packed floats with explicit rounding R.
 * Same builtin as the unmasked form, but U (cast to __mmask16) is the mask.
 * NOTE(review): masked-off lanes presumably keep A -- confirm. */
2965 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
2966   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
2967                                               (__v16sf)(__m512)(B), \
2968                                               (__v16sf)(__m512)(C), \
2969                                               (__mmask16)(U), (int)(R)))
2970 
2971 
/* mask3 variant of fmaddsub on packed floats with explicit rounding R:
 * forwards A, B, C, U to __builtin_ia32_vfmaddsubps512_mask3.
 * NOTE(review): masked-off lanes presumably keep C -- confirm. */
2972 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
2973   ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
2974                                                (__v16sf)(__m512)(B), \
2975                                                (__v16sf)(__m512)(C), \
2976                                                (__mmask16)(U), (int)(R)))
2977 
2978 
/* maskz variant of fmaddsub on packed floats with explicit rounding R:
 * forwards A, B, C, U to __builtin_ia32_vfmaddsubps512_maskz.
 * NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
2979 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
2980   ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
2981                                                (__v16sf)(__m512)(B), \
2982                                                (__v16sf)(__m512)(C), \
2983                                                (__mmask16)(U), (int)(R)))
2984 
2985 
/* Unmasked fmsubadd on packed floats with explicit rounding R.
 * The macro negates C and reuses __builtin_ia32_vfmaddsubps512_mask with an
 * all-ones __mmask16. */
2986 #define _mm512_fmsubadd_round_ps(A, B, C, R) \
2987   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
2988                                               (__v16sf)(__m512)(B), \
2989                                               -(__v16sf)(__m512)(C), \
2990                                               (__mmask16)-1, (int)(R)))
2991 
2992 
/* Write-masked fmsubadd on packed floats with explicit rounding R.
 * Implemented as the fmaddsub _mask builtin applied to (A, B, -C) under
 * mask U. NOTE(review): masked-off lanes presumably keep A -- confirm. */
2993 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
2994   ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
2995                                               (__v16sf)(__m512)(B), \
2996                                               -(__v16sf)(__m512)(C), \
2997                                               (__mmask16)(U), (int)(R)))
2998 
2999 
/* maskz variant of fmsubadd on packed floats with explicit rounding R.
 * Implemented as the fmaddsub _maskz builtin applied to (A, B, -C) under
 * mask U. NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
3000 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
3001   ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
3002                                                (__v16sf)(__m512)(B), \
3003                                                -(__v16sf)(__m512)(C), \
3004                                                (__mmask16)(U), (int)(R)))
3005 
3006 
/* Unmasked fmaddsub on packed floats: passes __A, __B, __C (as __v16sf) to
 * __builtin_ia32_vfmaddsubps512_mask with an all-ones __mmask16 and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
3007 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3008 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
3009 {
3010   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3011                                                       (__v16sf) __B,
3012                                                       (__v16sf) __C,
3013                                                       (__mmask16) -1,
3014                                                       _MM_FROUND_CUR_DIRECTION);
3015 }
3016 
/* Write-masked fmaddsub on packed floats: same builtin as the unmasked form
 * with __U as the __mmask16 and _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3017 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3018 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3019 {
3020   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3021                                                       (__v16sf) __B,
3022                                                       (__v16sf) __C,
3023                                                       (__mmask16) __U,
3024                                                       _MM_FROUND_CUR_DIRECTION);
3025 }
3026 
/* mask3 variant of fmaddsub on packed floats: forwards to
 * __builtin_ia32_vfmaddsubps512_mask3 with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3027 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3028 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3029 {
3030   return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
3031                                                        (__v16sf) __B,
3032                                                        (__v16sf) __C,
3033                                                        (__mmask16) __U,
3034                                                        _MM_FROUND_CUR_DIRECTION);
3035 }
3036 
/* maskz variant of fmaddsub on packed floats: forwards to
 * __builtin_ia32_vfmaddsubps512_maskz with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
3037 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3038 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3039 {
3040   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3041                                                        (__v16sf) __B,
3042                                                        (__v16sf) __C,
3043                                                        (__mmask16) __U,
3044                                                        _MM_FROUND_CUR_DIRECTION);
3045 }
3046 
/* Unmasked fmsubadd on packed floats. Expressed through the fmaddsub
 * builtin by negating __C; the mask is all ones and rounding is
 * _MM_FROUND_CUR_DIRECTION. */
3047 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3048 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
3049 {
3050   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3051                                                       (__v16sf) __B,
3052                                                       -(__v16sf) __C,
3053                                                       (__mmask16) -1,
3054                                                       _MM_FROUND_CUR_DIRECTION);
3055 }
3056 
/* Write-masked fmsubadd on packed floats. Expressed as the fmaddsub _mask
 * builtin applied to (__A, __B, -__C) under mask __U, with
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3057 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3058 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3059 {
3060   return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3061                                                       (__v16sf) __B,
3062                                                       -(__v16sf) __C,
3063                                                       (__mmask16) __U,
3064                                                       _MM_FROUND_CUR_DIRECTION);
3065 }
3066 
/* maskz variant of fmsubadd on packed floats. Expressed as the fmaddsub
 * _maskz builtin applied to (__A, __B, -__C) under mask __U, with
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes are presumably zeroed -- confirm. */
3067 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3068 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3069 {
3070   return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3071                                                        (__v16sf) __B,
3072                                                        -(__v16sf) __C,
3073                                                        (__mmask16) __U,
3074                                                        _MM_FROUND_CUR_DIRECTION);
3075 }
3076 
/* mask3 fmsub on packed doubles with explicit rounding R: A, B, C are
 * forwarded unchanged (as __v8df) to __builtin_ia32_vfmsubpd512_mask3 with
 * mask U. NOTE(review): masked-off lanes presumably keep C -- confirm. */
3077 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
3078   ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
3079                                              (__v8df)(__m512d)(B), \
3080                                              (__v8df)(__m512d)(C), \
3081                                              (__mmask8)(U), (int)(R)))
3082 
3083 
/* mask3 fmsub on packed doubles: forwards __A, __B, __C unchanged to
 * __builtin_ia32_vfmsubpd512_mask3 with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3084 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3085 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3086 {
3087   return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
3088                                                     (__v8df) __B,
3089                                                     (__v8df) __C,
3090                                                     (__mmask8) __U,
3091                                                     _MM_FROUND_CUR_DIRECTION);
3092 }
3093 
/* mask3 fmsub on packed floats with explicit rounding R: A, B, C are
 * forwarded unchanged (as __v16sf) to __builtin_ia32_vfmsubps512_mask3 with
 * mask U. NOTE(review): masked-off lanes presumably keep C -- confirm. */
3094 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
3095   ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
3096                                             (__v16sf)(__m512)(B), \
3097                                             (__v16sf)(__m512)(C), \
3098                                             (__mmask16)(U), (int)(R)))
3099 
/* mask3 fmsub on packed floats: forwards __A, __B, __C unchanged to
 * __builtin_ia32_vfmsubps512_mask3 with mask __U and
 * _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3100 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3101 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3102 {
3103   return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
3104                                                    (__v16sf) __B,
3105                                                    (__v16sf) __C,
3106                                                    (__mmask16) __U,
3107                                                    _MM_FROUND_CUR_DIRECTION);
3108 }
3109 
/* mask3 fmsubadd on packed doubles with explicit rounding R. Unlike the
 * non-mask3 fmsubadd macros, this one does not negate C: it calls the
 * dedicated __builtin_ia32_vfmsubaddpd512_mask3 with A, B, C, U.
 * NOTE(review): masked-off lanes presumably keep C -- confirm. */
3110 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
3111   ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
3112                                                 (__v8df)(__m512d)(B), \
3113                                                 (__v8df)(__m512d)(C), \
3114                                                 (__mmask8)(U), (int)(R)))
3115 
3116 
/* mask3 fmsubadd on packed doubles: calls the dedicated
 * __builtin_ia32_vfmsubaddpd512_mask3 with __A, __B, __C unchanged, mask
 * __U and _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3117 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3118 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3119 {
3120   return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
3121                                                        (__v8df) __B,
3122                                                        (__v8df) __C,
3123                                                        (__mmask8) __U,
3124                                                        _MM_FROUND_CUR_DIRECTION);
3125 }
3126 
/* mask3 fmsubadd on packed floats with explicit rounding R: calls the
 * dedicated __builtin_ia32_vfmsubaddps512_mask3 with A, B, C unchanged and
 * mask U. NOTE(review): masked-off lanes presumably keep C -- confirm. */
3127 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
3128   ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
3129                                                (__v16sf)(__m512)(B), \
3130                                                (__v16sf)(__m512)(C), \
3131                                                (__mmask16)(U), (int)(R)))
3132 
3133 
/* mask3 fmsubadd on packed floats: calls the dedicated
 * __builtin_ia32_vfmsubaddps512_mask3 with __A, __B, __C unchanged, mask
 * __U and _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3134 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3135 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3136 {
3137   return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
3138                                                       (__v16sf) __B,
3139                                                       (__v16sf) __C,
3140                                                       (__mmask16) __U,
3141                                                       _MM_FROUND_CUR_DIRECTION);
3142 }
3143 
/* Write-masked fnmadd on packed doubles with explicit rounding R.
 * Expressed as the fmadd _mask builtin applied to (A, -B, C): only the
 * second operand is negated, A stays unnegated in the first slot.
 * NOTE(review): masked-off lanes presumably keep A -- confirm. */
3144 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
3145   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
3146                                             -(__v8df)(__m512d)(B), \
3147                                             (__v8df)(__m512d)(C), \
3148                                             (__mmask8)(U), (int)(R)))
3149 
3150 
/* Write-masked fnmadd on packed doubles. Expressed as
 * __builtin_ia32_vfmaddpd512_mask applied to (__A, -__B, __C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3151 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3152 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
3153 {
3154   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
3155                                                     -(__v8df) __B,
3156                                                     (__v8df) __C,
3157                                                     (__mmask8) __U,
3158                                                     _MM_FROUND_CUR_DIRECTION);
3159 }
3160 
/* Write-masked fnmadd on packed floats with explicit rounding R.
 * Expressed as the fmadd _mask builtin applied to (A, -B, C): only the
 * second operand is negated, A stays unnegated in the first slot.
 * NOTE(review): masked-off lanes presumably keep A -- confirm. */
3161 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
3162   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
3163                                            -(__v16sf)(__m512)(B), \
3164                                            (__v16sf)(__m512)(C), \
3165                                            (__mmask16)(U), (int)(R)))
3166 
3167 
/* Write-masked fnmadd on packed floats. Expressed as
 * __builtin_ia32_vfmaddps512_mask applied to (__A, -__B, __C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3168 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3169 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3170 {
3171   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
3172                                                    -(__v16sf) __B,
3173                                                    (__v16sf) __C,
3174                                                    (__mmask16) __U,
3175                                                    _MM_FROUND_CUR_DIRECTION);
3176 }
3177 
/* Write-masked fnmsub on packed doubles with explicit rounding R.
 * Expressed as the fmadd _mask builtin applied to (A, -B, -C): both the
 * second operand and the addend are negated, A is passed unnegated.
 * NOTE(review): masked-off lanes presumably keep A -- confirm. */
3178 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
3179   ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
3180                                             -(__v8df)(__m512d)(B), \
3181                                             -(__v8df)(__m512d)(C), \
3182                                             (__mmask8)(U), (int)(R)))
3183 
3184 
/* mask3 fnmsub on packed doubles with explicit rounding R.
 * Expressed as the fmsub _mask3 builtin applied to (-A, B, C): here the
 * first operand is negated and C is passed unnegated.
 * NOTE(review): masked-off lanes presumably keep C -- confirm. */
3185 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
3186   ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
3187                                              (__v8df)(__m512d)(B), \
3188                                              (__v8df)(__m512d)(C), \
3189                                              (__mmask8)(U), (int)(R)))
3190 
3191 
/* Write-masked fnmsub on packed doubles. Expressed as
 * __builtin_ia32_vfmaddpd512_mask applied to (__A, -__B, -__C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3192 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3193 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
3194 {
3195   return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
3196                                                     -(__v8df) __B,
3197                                                     -(__v8df) __C,
3198                                                     (__mmask8) __U,
3199                                                     _MM_FROUND_CUR_DIRECTION);
3200 }
3201 
/* mask3 fnmsub on packed doubles. Expressed as
 * __builtin_ia32_vfmsubpd512_mask3 applied to (-__A, __B, __C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3202 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3203 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
3204 {
3205   return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
3206                                                      (__v8df) __B,
3207                                                      (__v8df) __C,
3208                                                      (__mmask8) __U,
3209                                                      _MM_FROUND_CUR_DIRECTION);
3210 }
3211 
/* Write-masked fnmsub on packed floats with explicit rounding R.
 * Expressed as the fmadd _mask builtin applied to (A, -B, -C): both the
 * second operand and the addend are negated, A is passed unnegated.
 * NOTE(review): masked-off lanes presumably keep A -- confirm. */
3212 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
3213   ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
3214                                            -(__v16sf)(__m512)(B), \
3215                                            -(__v16sf)(__m512)(C), \
3216                                            (__mmask16)(U), (int)(R)))
3217 
3218 
/* mask3 fnmsub on packed floats with explicit rounding R.
 * Expressed as the fmsub _mask3 builtin applied to (-A, B, C): here the
 * first operand is negated and C is passed unnegated.
 * NOTE(review): masked-off lanes presumably keep C -- confirm. */
3219 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
3220   ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
3221                                             (__v16sf)(__m512)(B), \
3222                                             (__v16sf)(__m512)(C), \
3223                                             (__mmask16)(U), (int)(R)))
3224 
3225 
/* Write-masked fnmsub on packed floats. Expressed as
 * __builtin_ia32_vfmaddps512_mask applied to (__A, -__B, -__C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __A -- confirm. */
3226 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3227 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3228 {
3229   return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
3230                                                    -(__v16sf) __B,
3231                                                    -(__v16sf) __C,
3232                                                    (__mmask16) __U,
3233                                                    _MM_FROUND_CUR_DIRECTION);
3234 }
3235 
/* mask3 fnmsub on packed floats. Expressed as
 * __builtin_ia32_vfmsubps512_mask3 applied to (-__A, __B, __C) under mask
 * __U, with _MM_FROUND_CUR_DIRECTION as rounding.
 * NOTE(review): masked-off lanes presumably keep __C -- confirm. */
3236 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3237 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3238 {
3239   return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
3240                                                     (__v16sf) __B,
3241                                                     (__v16sf) __C,
3242                                                     (__mmask16) __U,
3243                                                     _MM_FROUND_CUR_DIRECTION);
3244 }
3245 
3246 
3247 
3248 /* Vector permutations */
3249 
/* Unmasked two-source permute on 32-bit elements: reinterprets __A, __I and
 * __B as __v16si and calls __builtin_ia32_vpermi2vard512(__A, __I, __B).
 * NOTE(review): __I is presumably the per-element index vector selecting
 * from the __A/__B pair -- confirm against the instruction reference. */
3250 static __inline __m512i __DEFAULT_FN_ATTRS512
3251 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
3252 {
3253   return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
3254                                                 (__v16si) __B);
3255 }
3256 
/* Masked two-source permute on 32-bit elements. Computes the unmasked
 * _mm512_permutex2var_epi32(__A, __I, __B) and feeds it, together with __A
 * as the alternative operand, to __builtin_ia32_selectd_512 under __U.
 * NOTE(review): lanes with a clear bit in __U presumably take __A. */
3257 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3258 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
3259                                __m512i __B)
3260 {
3261   return (__m512i)__builtin_ia32_selectd_512(__U,
3262                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3263                               (__v16si)__A);
3264 }
3265 
/* mask2 form of the two-source permute on 32-bit elements. Same as the
 * mask form except that the alternative operand handed to
 * __builtin_ia32_selectd_512 is the index vector __I rather than __A.
 * NOTE(review): lanes with a clear bit in __U presumably take __I. */
3266 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3267 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
3268                                 __m512i __B)
3269 {
3270   return (__m512i)__builtin_ia32_selectd_512(__U,
3271                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3272                               (__v16si)__I);
3273 }
3274 
/* Zero-masked two-source permute on 32-bit elements. The permute result and
 * an all-zero vector (_mm512_setzero_si512) are the two operands of
 * __builtin_ia32_selectd_512 under __U.
 * NOTE(review): lanes with a clear bit in __U presumably become zero. */
3275 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3276 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
3277                                 __m512i __B)
3278 {
3279   return (__m512i)__builtin_ia32_selectd_512(__U,
3280                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
3281                               (__v16si)_mm512_setzero_si512());
3282 }
3283 
/* Unmasked two-source permute on 64-bit elements: reinterprets __A, __I and
 * __B as __v8di and calls __builtin_ia32_vpermi2varq512(__A, __I, __B).
 * NOTE(review): __I is presumably the per-element index vector selecting
 * from the __A/__B pair -- confirm against the instruction reference. */
3284 static __inline __m512i __DEFAULT_FN_ATTRS512
3285 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
3286 {
3287   return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
3288                                                 (__v8di) __B);
3289 }
3290 
/* Masked two-source permute on 64-bit elements. Computes the unmasked
 * _mm512_permutex2var_epi64(__A, __I, __B) and feeds it, together with __A
 * as the alternative operand, to __builtin_ia32_selectq_512 under __U.
 * NOTE(review): lanes with a clear bit in __U presumably take __A. */
3291 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3292 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
3293                                __m512i __B)
3294 {
3295   return (__m512i)__builtin_ia32_selectq_512(__U,
3296                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3297                                (__v8di)__A);
3298 }
3299 
/* mask2 form of the two-source permute on 64-bit elements. Same as the
 * mask form except that the alternative operand handed to
 * __builtin_ia32_selectq_512 is the index vector __I rather than __A.
 * NOTE(review): lanes with a clear bit in __U presumably take __I. */
3300 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3301 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
3302                                 __m512i __B)
3303 {
3304   return (__m512i)__builtin_ia32_selectq_512(__U,
3305                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3306                                (__v8di)__I);
3307 }
3308 
/* Zero-masked two-source permute on 64-bit elements. The permute result and
 * an all-zero vector (_mm512_setzero_si512) are the two operands of
 * __builtin_ia32_selectq_512 under __U.
 * NOTE(review): lanes with a clear bit in __U presumably become zero. */
3309 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3310 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
3311                                 __m512i __B)
3312 {
3313   return (__m512i)__builtin_ia32_selectq_512(__U,
3314                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
3315                                (__v8di)_mm512_setzero_si512());
3316 }
3317 
/* Unmasked alignr on 64-bit elements: forwards A and B (as __v8di) and I
 * (as int) to __builtin_ia32_alignq512; result is cast to __m512i.
 * NOTE(review): I is presumably an element-count immediate -- confirm. */
3318 #define _mm512_alignr_epi64(A, B, I) \
3319   ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
3320                                      (__v8di)(__m512i)(B), (int)(I)))
3321 
/* Masked alignr on 64-bit elements: the result of
 * _mm512_alignr_epi64(A, B, imm) and W are the two operands of
 * __builtin_ia32_selectq_512 under mask U (cast to __mmask8).
 * NOTE(review): lanes with a clear bit in U presumably take W. */
3322 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
3323   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
3324                                   (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
3325                                   (__v8di)(__m512i)(W)))
3326 
/* Zero-masked alignr on 64-bit elements: the result of
 * _mm512_alignr_epi64(A, B, imm) and an all-zero vector are the two
 * operands of __builtin_ia32_selectq_512 under mask U.
 * NOTE(review): lanes with a clear bit in U presumably become zero. */
3327 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \
3328   ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
3329                                   (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
3330                                   (__v8di)_mm512_setzero_si512()))
3331 
/* Unmasked alignr on 32-bit elements: forwards A and B (as __v16si) and I
 * (as int) to __builtin_ia32_alignd512; result is cast to __m512i.
 * NOTE(review): I is presumably an element-count immediate -- confirm. */
3332 #define _mm512_alignr_epi32(A, B, I) \
3333   ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
3334                                      (__v16si)(__m512i)(B), (int)(I)))
3335 
/* Masked alignr on 32-bit elements: the result of
 * _mm512_alignr_epi32(A, B, imm) and W are the two operands of
 * __builtin_ia32_selectd_512 under mask U (cast to __mmask16).
 * NOTE(review): lanes with a clear bit in U presumably take W. */
3336 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
3337   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
3338                                  (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
3339                                  (__v16si)(__m512i)(W)))
3340 
/* Zero-masked alignr on 32-bit elements: the result of
 * _mm512_alignr_epi32(A, B, imm) and an all-zero vector are the two
 * operands of __builtin_ia32_selectd_512 under mask U.
 * NOTE(review): lanes with a clear bit in U presumably become zero. */
3341 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \
3342   ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
3343                                  (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
3344                                  (__v16si)_mm512_setzero_si512()))
3345 /* Vector Extract */
3346 
/* Unmasked extract of a __m256d from a __m512d: calls
 * __builtin_ia32_extractf64x4_mask with A (as __v8df), I (as int), an
 * undefined __v4df as the third operand and an all-ones __mmask8.
 * NOTE(review): I presumably selects which 256-bit half -- confirm. */
3347 #define _mm512_extractf64x4_pd(A, I) \
3348   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
3349                                              (__v4df)_mm256_undefined_pd(), \
3350                                              (__mmask8)-1))
3351 
/* Masked extract of a __m256d from a __m512d: same builtin as the unmasked
 * form, with W (as __v4df) as the third operand and U as the __mmask8.
 * NOTE(review): masked-off lanes presumably come from W -- confirm. */
3352 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
3353   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
3354                                              (__v4df)(__m256d)(W), \
3355                                              (__mmask8)(U)))
3356 
/* Zero-masked extract of a __m256d from a __m512d: same builtin as the
 * unmasked form, with _mm256_setzero_pd() as the third operand and U as the
 * __mmask8. NOTE(review): masked-off lanes presumably become zero. */
3357 #define _mm512_maskz_extractf64x4_pd(U, A, imm) \
3358   ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
3359                                              (__v4df)_mm256_setzero_pd(), \
3360                                              (__mmask8)(U)))
3361 
/* Unmasked extract of a __m128 from a __m512: calls
 * __builtin_ia32_extractf32x4_mask with A (as __v16sf), I (as int), an
 * undefined __v4sf as the third operand and an all-ones __mmask8.
 * NOTE(review): I presumably selects which 128-bit quarter -- confirm. */
3362 #define _mm512_extractf32x4_ps(A, I) \
3363   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
3364                                             (__v4sf)_mm_undefined_ps(), \
3365                                             (__mmask8)-1))
3366 
/* Masked extract of a __m128 from a __m512: same builtin as the unmasked
 * form, with W (as __v4sf) as the third operand and U as the __mmask8.
 * NOTE(review): masked-off lanes presumably come from W -- confirm. */
3367 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
3368   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
3369                                             (__v4sf)(__m128)(W), \
3370                                             (__mmask8)(U)))
3371 
/* Zero-masked extract of a __m128 from a __m512: same builtin as the
 * unmasked form, with _mm_setzero_ps() as the third operand and U as the
 * __mmask8. NOTE(review): masked-off lanes presumably become zero. */
3372 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \
3373   ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
3374                                             (__v4sf)_mm_setzero_ps(), \
3375                                             (__mmask8)(U)))
3376 
3377 /* Vector Blend */
3378 
/* Mask-controlled blend of two packed-double vectors. Note the operand
 * order: __W is passed first and __A second to
 * __builtin_ia32_selectpd_512, the reverse of the parameter order.
 * NOTE(review): a set bit in __U presumably picks the lane from __W and a
 * clear bit the lane from __A -- confirm. */
3379 static __inline __m512d __DEFAULT_FN_ATTRS512
3380 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
3381 {
3382   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
3383                  (__v8df) __W,
3384                  (__v8df) __A);
3385 }
3386 
/* Mask-controlled blend of two packed-float vectors. Note the operand
 * order: __W is passed first and __A second to
 * __builtin_ia32_selectps_512, the reverse of the parameter order.
 * NOTE(review): a set bit in __U presumably picks the lane from __W and a
 * clear bit the lane from __A -- confirm. */
3387 static __inline __m512 __DEFAULT_FN_ATTRS512
3388 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
3389 {
3390   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
3391                 (__v16sf) __W,
3392                 (__v16sf) __A);
3393 }
3394 
/* Mask-controlled blend of two vectors of 64-bit integers. Note the operand
 * order: __W is passed first and __A second to
 * __builtin_ia32_selectq_512, the reverse of the parameter order.
 * NOTE(review): a set bit in __U presumably picks the lane from __W and a
 * clear bit the lane from __A -- confirm. */
3395 static __inline __m512i __DEFAULT_FN_ATTRS512
3396 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
3397 {
3398   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
3399                 (__v8di) __W,
3400                 (__v8di) __A);
3401 }
3402 
/* Mask-controlled blend of two vectors of 32-bit integers. Note the operand
 * order: __W is passed first and __A second to
 * __builtin_ia32_selectd_512, the reverse of the parameter order.
 * NOTE(review): a set bit in __U presumably picks the lane from __W and a
 * clear bit the lane from __A -- confirm. */
3403 static __inline __m512i __DEFAULT_FN_ATTRS512
3404 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
3405 {
3406   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
3407                 (__v16si) __W,
3408                 (__v16si) __A);
3409 }
3410 
3411 /* Compare */
3412 
/* Unmasked packed-float compare producing a __mmask16. A and B are passed
 * as __v16sf, P as the int comparison predicate and R as the int rounding
 * argument to __builtin_ia32_cmpps512_mask, with an all-ones input mask. */
3413 #define _mm512_cmp_round_ps_mask(A, B, P, R) \
3414   ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
3415                                            (__v16sf)(__m512)(B), (int)(P), \
3416                                            (__mmask16)-1, (int)(R)))
3417 
/* Masked packed-float compare producing a __mmask16: same builtin as the
 * unmasked form, with U (cast to __mmask16) as the input mask.
 * NOTE(review): result bits for lanes cleared in U are presumably 0. */
3418 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
3419   ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
3420                                            (__v16sf)(__m512)(B), (int)(P), \
3421                                            (__mmask16)(U), (int)(R)))
3422 
/* Packed-float compares without an explicit rounding argument: the unmasked
 * and masked forms expand to the corresponding *_round_ps_mask macro with
 * _MM_FROUND_CUR_DIRECTION supplied as R. */
3423 #define _mm512_cmp_ps_mask(A, B, P) \
3424   _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3425 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \
3426   _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3427 
/* Packed-float compare aliases with the predicate fixed to _CMP_EQ_OQ
 * (unmasked form, then the form taking input mask k). */
3428 #define _mm512_cmpeq_ps_mask(A, B) \
3429     _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
3430 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \
3431     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
3432 
/* Packed-float compare aliases with the predicate fixed to _CMP_LT_OS
 * (unmasked form, then the form taking input mask k). */
3433 #define _mm512_cmplt_ps_mask(A, B) \
3434     _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
3435 #define _mm512_mask_cmplt_ps_mask(k, A, B) \
3436     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
3437 
/* Packed-float compare aliases with the predicate fixed to _CMP_LE_OS
 * (unmasked form, then the form taking input mask k). */
3438 #define _mm512_cmple_ps_mask(A, B) \
3439     _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
3440 #define _mm512_mask_cmple_ps_mask(k, A, B) \
3441     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
3442 
/* Packed-float compare aliases with the predicate fixed to _CMP_UNORD_Q
 * (unmasked form, then the form taking input mask k). */
3443 #define _mm512_cmpunord_ps_mask(A, B) \
3444     _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
3445 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \
3446     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
3447 
/* Packed-float compare aliases with the predicate fixed to _CMP_NEQ_UQ
 * (unmasked form, then the form taking input mask k). */
3448 #define _mm512_cmpneq_ps_mask(A, B) \
3449     _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
3450 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \
3451     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
3452 
/* Packed-float compare aliases with the predicate fixed to _CMP_NLT_US
 * (unmasked form, then the form taking input mask k). */
3453 #define _mm512_cmpnlt_ps_mask(A, B) \
3454     _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
3455 #define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
3456     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
3457 
/* Packed-float compare aliases with the predicate fixed to _CMP_NLE_US
 * (unmasked form, then the form taking input mask k). */
3458 #define _mm512_cmpnle_ps_mask(A, B) \
3459     _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
3460 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \
3461     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
3462 
/* Packed-float compare aliases with the predicate fixed to _CMP_ORD_Q
 * (unmasked form, then the form taking input mask k). */
3463 #define _mm512_cmpord_ps_mask(A, B) \
3464     _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
3465 #define _mm512_mask_cmpord_ps_mask(k, A, B) \
3466     _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
3467 
/* Unmasked packed-double compare producing a __mmask8. A and B are passed
 * as __v8df, P as the int comparison predicate and R as the int rounding
 * argument to __builtin_ia32_cmppd512_mask, with an all-ones input mask. */
3468 #define _mm512_cmp_round_pd_mask(A, B, P, R) \
3469   ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
3470                                           (__v8df)(__m512d)(B), (int)(P), \
3471                                           (__mmask8)-1, (int)(R)))
3472 
/* Masked packed-double compare producing a __mmask8: same builtin as the
 * unmasked form, with U (cast to __mmask8) as the input mask.
 * NOTE(review): result bits for lanes cleared in U are presumably 0. */
3473 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
3474   ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
3475                                           (__v8df)(__m512d)(B), (int)(P), \
3476                                           (__mmask8)(U), (int)(R)))
3477 
/* Packed-double compares without an explicit rounding argument: the
 * unmasked and masked forms expand to the corresponding *_round_pd_mask
 * macro with _MM_FROUND_CUR_DIRECTION supplied as R. */
3478 #define _mm512_cmp_pd_mask(A, B, P) \
3479   _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3480 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \
3481   _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
3482 
/* Packed-double compare aliases with the predicate fixed to _CMP_EQ_OQ
 * (unmasked form, then the form taking input mask k). */
3483 #define _mm512_cmpeq_pd_mask(A, B) \
3484     _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
3485 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \
3486     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
3487 
/* Packed-double compare aliases with the predicate fixed to _CMP_LT_OS
 * (unmasked form, then the form taking input mask k). */
3488 #define _mm512_cmplt_pd_mask(A, B) \
3489     _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
3490 #define _mm512_mask_cmplt_pd_mask(k, A, B) \
3491     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
3492 
/* Packed-double compare aliases with the predicate fixed to _CMP_LE_OS
 * (unmasked form, then the form taking input mask k). */
3493 #define _mm512_cmple_pd_mask(A, B) \
3494     _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
3495 #define _mm512_mask_cmple_pd_mask(k, A, B) \
3496     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
3497 
/* Packed-double compare aliases with the predicate fixed to _CMP_UNORD_Q
 * (unmasked form, then the form taking input mask k). */
3498 #define _mm512_cmpunord_pd_mask(A, B) \
3499     _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
3500 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \
3501     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
3502 
/* Packed-double compare aliases with the predicate fixed to _CMP_NEQ_UQ
 * (unmasked form, then the form taking input mask k). */
3503 #define _mm512_cmpneq_pd_mask(A, B) \
3504     _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
3505 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \
3506     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
3507 
/* Packed-double compare aliases with the predicate fixed to _CMP_NLT_US
 * (unmasked form, then the form taking input mask k). */
3508 #define _mm512_cmpnlt_pd_mask(A, B) \
3509     _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
3510 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
3511     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
3512 
/* Packed-double compare aliases with the predicate fixed to _CMP_NLE_US
 * (unmasked form, then the form taking input mask k). */
3513 #define _mm512_cmpnle_pd_mask(A, B) \
3514     _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
3515 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \
3516     _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
3517 
/* Shorthand for a packed-double compare with predicate _CMP_ORD_Q
 * ("ordered"); the mask_ form additionally takes an incoming mask U. */
#define _mm512_cmpord_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(U, A, B) \
  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_ORD_Q)
3522 
3523 /* Conversion */
3524 
/* cvtt_roundps_epu32 family.  Every form forwards the __m512 source A and
 * the int R to __builtin_ia32_cvttps2udq512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_undefined_epi32(), all mask bits set
 *   mask  : W,                        mask U
 *   maskz : _mm512_setzero_si512(),   mask U */
#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(U), (int)(R)))
3539 
3540 
3541 static __inline __m512i __DEFAULT_FN_ATTRS512
3542 _mm512_cvttps_epu32(__m512 __A)
3543 {
3544   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3545                   (__v16si)
3546                   _mm512_setzero_si512 (),
3547                   (__mmask16) -1,
3548                   _MM_FROUND_CUR_DIRECTION);
3549 }
3550 
3551 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3552 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
3553 {
3554   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3555                    (__v16si) __W,
3556                    (__mmask16) __U,
3557                    _MM_FROUND_CUR_DIRECTION);
3558 }
3559 
3560 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3561 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
3562 {
3563   return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
3564                    (__v16si) _mm512_setzero_si512 (),
3565                    (__mmask16) __U,
3566                    _MM_FROUND_CUR_DIRECTION);
3567 }
3568 
/* cvt_roundepi32_ps family.  Every form forwards the __m512i source A and
 * the int R to __builtin_ia32_cvtdq2ps512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_setzero_ps(), all mask bits set
 *   mask  : W,                   mask U
 *   maskz : _mm512_setzero_ps(), mask U */
#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))
3583 
/* cvt_roundepu32_ps family.  Every form forwards the __m512i source A and
 * the int R to __builtin_ia32_cvtudq2ps512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_setzero_ps(), all mask bits set
 *   mask  : W,                   mask U
 *   maskz : _mm512_setzero_ps(), mask U */
#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask( \
      (__v16si)(__m512i)(A), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))
3598 
3599 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3600 _mm512_cvtepu32_ps (__m512i __A)
3601 {
3602   return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
3603 }
3604 
3605 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3606 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3607 {
3608   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3609                                              (__v16sf)_mm512_cvtepu32_ps(__A),
3610                                              (__v16sf)__W);
3611 }
3612 
3613 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3614 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
3615 {
3616   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3617                                              (__v16sf)_mm512_cvtepu32_ps(__A),
3618                                              (__v16sf)_mm512_setzero_ps());
3619 }
3620 
3621 static __inline __m512d __DEFAULT_FN_ATTRS512
3622 _mm512_cvtepi32_pd(__m256i __A)
3623 {
3624   return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
3625 }
3626 
3627 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3628 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3629 {
3630   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3631                                               (__v8df)_mm512_cvtepi32_pd(__A),
3632                                               (__v8df)__W);
3633 }
3634 
3635 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3636 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
3637 {
3638   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3639                                               (__v8df)_mm512_cvtepi32_pd(__A),
3640                                               (__v8df)_mm512_setzero_pd());
3641 }
3642 
3643 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3644 _mm512_cvtepi32lo_pd(__m512i __A)
3645 {
3646   return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
3647 }
3648 
3649 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3650 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
3651 {
3652   return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
3653 }
3654 
3655 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3656 _mm512_cvtepi32_ps (__m512i __A)
3657 {
3658   return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
3659 }
3660 
3661 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3662 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3663 {
3664   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3665                                              (__v16sf)_mm512_cvtepi32_ps(__A),
3666                                              (__v16sf)__W);
3667 }
3668 
3669 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3670 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
3671 {
3672   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3673                                              (__v16sf)_mm512_cvtepi32_ps(__A),
3674                                              (__v16sf)_mm512_setzero_ps());
3675 }
3676 
3677 static __inline __m512d __DEFAULT_FN_ATTRS512
3678 _mm512_cvtepu32_pd(__m256i __A)
3679 {
3680   return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
3681 }
3682 
3683 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3684 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3685 {
3686   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3687                                               (__v8df)_mm512_cvtepu32_pd(__A),
3688                                               (__v8df)__W);
3689 }
3690 
3691 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3692 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
3693 {
3694   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3695                                               (__v8df)_mm512_cvtepu32_pd(__A),
3696                                               (__v8df)_mm512_setzero_pd());
3697 }
3698 
3699 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3700 _mm512_cvtepu32lo_pd(__m512i __A)
3701 {
3702   return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
3703 }
3704 
3705 static __inline__ __m512d __DEFAULT_FN_ATTRS512
3706 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
3707 {
3708   return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
3709 }
3710 
/* cvt_roundpd_ps family.  Every form forwards the __m512d source A and the
 * int R to __builtin_ia32_cvtpd2ps512_mask and yields an __m256; they
 * differ only in the second (pass-through) operand and the mask:
 *   plain : _mm256_setzero_ps(), all mask bits set
 *   mask  : W,                   mask U
 *   maskz : _mm256_setzero_ps(), mask U */
#define _mm512_cvt_roundpd_ps(A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask( \
      (__v8df)(__m512d)(A), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
3725 
3726 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3727 _mm512_cvtpd_ps (__m512d __A)
3728 {
3729   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3730                 (__v8sf) _mm256_undefined_ps (),
3731                 (__mmask8) -1,
3732                 _MM_FROUND_CUR_DIRECTION);
3733 }
3734 
3735 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3736 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
3737 {
3738   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3739                 (__v8sf) __W,
3740                 (__mmask8) __U,
3741                 _MM_FROUND_CUR_DIRECTION);
3742 }
3743 
3744 static __inline__ __m256 __DEFAULT_FN_ATTRS512
3745 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
3746 {
3747   return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3748                 (__v8sf) _mm256_setzero_ps (),
3749                 (__mmask8) __U,
3750                 _MM_FROUND_CUR_DIRECTION);
3751 }
3752 
3753 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3754 _mm512_cvtpd_pslo (__m512d __A)
3755 {
3756   return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
3757                 (__v8sf) _mm256_setzero_ps (),
3758                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3759 }
3760 
3761 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3762 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
3763 {
3764   return (__m512) __builtin_shufflevector (
3765                 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
3766                                                __U, __A),
3767                 (__v8sf) _mm256_setzero_ps (),
3768                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3769 }
3770 
/* cvt_roundps_ph family.  Every form forwards the __m512 source A and the
 * int I to __builtin_ia32_vcvtps2ph512_mask and yields an __m256i; they
 * differ only in the third (pass-through) operand and the mask:
 *   plain : _mm256_undefined_si256(), all mask bits set
 *   mask  : W (first macro argument), mask U (second macro argument)
 *   maskz : _mm256_setzero_si256(),   mask U */
#define _mm512_cvt_roundps_ph(A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
      (__v16sf)(__m512)(A), (int)(I), \
      (__v16hi)_mm256_undefined_si256(), (__mmask16)-1))

#define _mm512_mask_cvt_roundps_ph(W, U, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
      (__v16sf)(__m512)(A), (int)(I), \
      (__v16hi)(__m256i)(W), (__mmask16)(U)))

#define _mm512_maskz_cvt_roundps_ph(U, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
      (__v16sf)(__m512)(A), (int)(I), \
      (__v16hi)_mm256_setzero_si256(), (__mmask16)(U)))

/* The non-"round" spellings are plain aliases of the macros above and take
 * the same argument lists. */
#define _mm512_cvtps_ph _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
3789 
/* cvt_roundph_ps family.  Every form forwards the __m256i source A and the
 * int R to __builtin_ia32_vcvtph2ps512_mask and yields an __m512; they
 * differ only in the second (pass-through) operand and the mask:
 *   plain : _mm512_undefined_ps(), all mask bits set
 *   mask  : W,                     mask U
 *   maskz : _mm512_setzero_ps(),   mask U */
#define _mm512_cvt_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask( \
      (__v16hi)(__m256i)(A), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))
3804 
3805 
3806 static  __inline __m512 __DEFAULT_FN_ATTRS512
3807 _mm512_cvtph_ps(__m256i __A)
3808 {
3809   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3810                 (__v16sf)
3811                 _mm512_setzero_ps (),
3812                 (__mmask16) -1,
3813                 _MM_FROUND_CUR_DIRECTION);
3814 }
3815 
3816 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3817 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
3818 {
3819   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3820                  (__v16sf) __W,
3821                  (__mmask16) __U,
3822                  _MM_FROUND_CUR_DIRECTION);
3823 }
3824 
3825 static __inline__ __m512 __DEFAULT_FN_ATTRS512
3826 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
3827 {
3828   return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3829                  (__v16sf) _mm512_setzero_ps (),
3830                  (__mmask16) __U,
3831                  _MM_FROUND_CUR_DIRECTION);
3832 }
3833 
/* cvtt_roundpd_epi32 family.  Every form forwards the __m512d source A and
 * the int R to __builtin_ia32_cvttpd2dq512_mask and yields an __m256i; they
 * differ only in the second (pass-through) operand and the mask:
 *   plain : _mm256_setzero_si256(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm256_setzero_si256(), mask U */
#define _mm512_cvtt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
3848 
3849 static __inline __m256i __DEFAULT_FN_ATTRS512
3850 _mm512_cvttpd_epi32(__m512d __a)
3851 {
3852   return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
3853                                                    (__v8si)_mm256_setzero_si256(),
3854                                                    (__mmask8) -1,
3855                                                     _MM_FROUND_CUR_DIRECTION);
3856 }
3857 
3858 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3859 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
3860 {
3861   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3862                   (__v8si) __W,
3863                   (__mmask8) __U,
3864                   _MM_FROUND_CUR_DIRECTION);
3865 }
3866 
3867 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3868 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
3869 {
3870   return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3871                   (__v8si) _mm256_setzero_si256 (),
3872                   (__mmask8) __U,
3873                   _MM_FROUND_CUR_DIRECTION);
3874 }
3875 
/* cvtt_roundps_epi32 family.  Every form forwards the __m512 source A and
 * the int R to __builtin_ia32_cvttps2dq512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_setzero_si512(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm512_setzero_si512(), mask U */
#define _mm512_cvtt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(U), (int)(R)))
3890 
3891 static __inline __m512i __DEFAULT_FN_ATTRS512
3892 _mm512_cvttps_epi32(__m512 __a)
3893 {
3894   return (__m512i)
3895     __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
3896                                      (__v16si) _mm512_setzero_si512 (),
3897                                      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
3898 }
3899 
3900 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3901 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
3902 {
3903   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3904                   (__v16si) __W,
3905                   (__mmask16) __U,
3906                   _MM_FROUND_CUR_DIRECTION);
3907 }
3908 
3909 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3910 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
3911 {
3912   return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3913                   (__v16si) _mm512_setzero_si512 (),
3914                   (__mmask16) __U,
3915                   _MM_FROUND_CUR_DIRECTION);
3916 }
3917 
/* cvt_roundps_epi32 family.  Every form forwards the __m512 source A and
 * the int R to __builtin_ia32_cvtps2dq512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_setzero_si512(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm512_setzero_si512(), mask U */
#define _mm512_cvt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(U), (int)(R)))
3932 
3933 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3934 _mm512_cvtps_epi32 (__m512 __A)
3935 {
3936   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3937                  (__v16si) _mm512_undefined_epi32 (),
3938                  (__mmask16) -1,
3939                  _MM_FROUND_CUR_DIRECTION);
3940 }
3941 
3942 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3943 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
3944 {
3945   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3946                  (__v16si) __W,
3947                  (__mmask16) __U,
3948                  _MM_FROUND_CUR_DIRECTION);
3949 }
3950 
3951 static __inline__ __m512i __DEFAULT_FN_ATTRS512
3952 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
3953 {
3954   return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3955                  (__v16si)
3956                  _mm512_setzero_si512 (),
3957                  (__mmask16) __U,
3958                  _MM_FROUND_CUR_DIRECTION);
3959 }
3960 
/* cvt_roundpd_epi32 family.  Every form forwards the __m512d source A and
 * the int R to __builtin_ia32_cvtpd2dq512_mask and yields an __m256i; they
 * differ only in the second (pass-through) operand and the mask:
 *   plain : _mm256_setzero_si256(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm256_setzero_si256(), mask U */
#define _mm512_cvt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
3975 
3976 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3977 _mm512_cvtpd_epi32 (__m512d __A)
3978 {
3979   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3980                  (__v8si)
3981                  _mm256_undefined_si256 (),
3982                  (__mmask8) -1,
3983                  _MM_FROUND_CUR_DIRECTION);
3984 }
3985 
3986 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3987 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
3988 {
3989   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3990                  (__v8si) __W,
3991                  (__mmask8) __U,
3992                  _MM_FROUND_CUR_DIRECTION);
3993 }
3994 
3995 static __inline__ __m256i __DEFAULT_FN_ATTRS512
3996 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
3997 {
3998   return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3999                  (__v8si)
4000                  _mm256_setzero_si256 (),
4001                  (__mmask8) __U,
4002                  _MM_FROUND_CUR_DIRECTION);
4003 }
4004 
/* cvt_roundps_epu32 family.  Every form forwards the __m512 source A and
 * the int R to __builtin_ia32_cvtps2udq512_mask; they differ only in the
 * second (pass-through) operand and the mask:
 *   plain : _mm512_setzero_si512(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm512_setzero_si512(), mask U */
#define _mm512_cvt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask( \
      (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(U), (int)(R)))
4019 
4020 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4021 _mm512_cvtps_epu32 ( __m512 __A)
4022 {
4023   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
4024                   (__v16si)\
4025                   _mm512_undefined_epi32 (),
4026                   (__mmask16) -1,\
4027                   _MM_FROUND_CUR_DIRECTION);
4028 }
4029 
4030 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4031 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
4032 {
4033   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4034                   (__v16si) __W,
4035                   (__mmask16) __U,
4036                   _MM_FROUND_CUR_DIRECTION);
4037 }
4038 
4039 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4040 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
4041 {
4042   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4043                   (__v16si)
4044                   _mm512_setzero_si512 (),
4045                   (__mmask16) __U ,
4046                   _MM_FROUND_CUR_DIRECTION);
4047 }
4048 
/* cvt_roundpd_epu32 family.  Every form forwards the __m512d source A and
 * the int R to __builtin_ia32_cvtpd2udq512_mask and yields an __m256i; they
 * differ only in the second (pass-through) operand and the mask:
 *   plain : _mm256_setzero_si256(), all mask bits set
 *   mask  : W,                      mask U
 *   maskz : _mm256_setzero_si256(), mask U */
#define _mm512_cvt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \
      (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
4063 
4064 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4065 _mm512_cvtpd_epu32 (__m512d __A)
4066 {
4067   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4068                   (__v8si)
4069                   _mm256_undefined_si256 (),
4070                   (__mmask8) -1,
4071                   _MM_FROUND_CUR_DIRECTION);
4072 }
4073 
4074 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4075 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
4076 {
4077   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4078                   (__v8si) __W,
4079                   (__mmask8) __U,
4080                   _MM_FROUND_CUR_DIRECTION);
4081 }
4082 
4083 static __inline__ __m256i __DEFAULT_FN_ATTRS512
4084 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
4085 {
4086   return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4087                   (__v8si)
4088                   _mm256_setzero_si256 (),
4089                   (__mmask8) __U,
4090                   _MM_FROUND_CUR_DIRECTION);
4091 }
4092 
4093 static __inline__ double __DEFAULT_FN_ATTRS512
4094 _mm512_cvtsd_f64(__m512d __a)
4095 {
4096   return __a[0];
4097 }
4098 
4099 static __inline__ float __DEFAULT_FN_ATTRS512
4100 _mm512_cvtss_f32(__m512 __a)
4101 {
4102   return __a[0];
4103 }
4104 
4105 /* Unpack and Interleave */
4106 
4107 static __inline __m512d __DEFAULT_FN_ATTRS512
4108 _mm512_unpackhi_pd(__m512d __a, __m512d __b)
4109 {
4110   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4111                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4112 }
4113 
4114 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4115 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
4116 {
4117   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4118                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
4119                                            (__v8df)__W);
4120 }
4121 
4122 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4123 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
4124 {
4125   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4126                                            (__v8df)_mm512_unpackhi_pd(__A, __B),
4127                                            (__v8df)_mm512_setzero_pd());
4128 }
4129 
4130 static __inline __m512d __DEFAULT_FN_ATTRS512
4131 _mm512_unpacklo_pd(__m512d __a, __m512d __b)
4132 {
4133   return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4134                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4135 }
4136 
4137 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4138 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
4139 {
4140   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4141                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
4142                                            (__v8df)__W);
4143 }
4144 
4145 static __inline__ __m512d __DEFAULT_FN_ATTRS512
4146 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
4147 {
4148   return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4149                                            (__v8df)_mm512_unpacklo_pd(__A, __B),
4150                                            (__v8df)_mm512_setzero_pd());
4151 }
4152 
4153 static __inline __m512 __DEFAULT_FN_ATTRS512
4154 _mm512_unpackhi_ps(__m512 __a, __m512 __b)
4155 {
4156   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4157                                          2,    18,    3,    19,
4158                                          2+4,  18+4,  3+4,  19+4,
4159                                          2+8,  18+8,  3+8,  19+8,
4160                                          2+12, 18+12, 3+12, 19+12);
4161 }
4162 
4163 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4164 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
4165 {
4166   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4167                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
4168                                           (__v16sf)__W);
4169 }
4170 
4171 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4172 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
4173 {
4174   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4175                                           (__v16sf)_mm512_unpackhi_ps(__A, __B),
4176                                           (__v16sf)_mm512_setzero_ps());
4177 }
4178 
4179 static __inline __m512 __DEFAULT_FN_ATTRS512
4180 _mm512_unpacklo_ps(__m512 __a, __m512 __b)
4181 {
4182   return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4183                                          0,    16,    1,    17,
4184                                          0+4,  16+4,  1+4,  17+4,
4185                                          0+8,  16+8,  1+8,  17+8,
4186                                          0+12, 16+12, 1+12, 17+12);
4187 }
4188 
4189 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4190 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
4191 {
4192   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4193                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
4194                                           (__v16sf)__W);
4195 }
4196 
4197 static __inline__ __m512 __DEFAULT_FN_ATTRS512
4198 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
4199 {
4200   return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4201                                           (__v16sf)_mm512_unpacklo_ps(__A, __B),
4202                                           (__v16sf)_mm512_setzero_ps());
4203 }
4204 
4205 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4206 _mm512_unpackhi_epi32(__m512i __A, __m512i __B)
4207 {
4208   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4209                                           2,    18,    3,    19,
4210                                           2+4,  18+4,  3+4,  19+4,
4211                                           2+8,  18+8,  3+8,  19+8,
4212                                           2+12, 18+12, 3+12, 19+12);
4213 }
4214 
4215 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4216 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4217 {
4218   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4219                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
4220                                        (__v16si)__W);
4221 }
4222 
4223 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4224 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
4225 {
4226   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4227                                        (__v16si)_mm512_unpackhi_epi32(__A, __B),
4228                                        (__v16si)_mm512_setzero_si512());
4229 }
4230 
4231 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4232 _mm512_unpacklo_epi32(__m512i __A, __m512i __B)
4233 {
4234   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4235                                           0,    16,    1,    17,
4236                                           0+4,  16+4,  1+4,  17+4,
4237                                           0+8,  16+8,  1+8,  17+8,
4238                                           0+12, 16+12, 1+12, 17+12);
4239 }
4240 
4241 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4242 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4243 {
4244   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4245                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
4246                                        (__v16si)__W);
4247 }
4248 
4249 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4250 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
4251 {
4252   return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4253                                        (__v16si)_mm512_unpacklo_epi32(__A, __B),
4254                                        (__v16si)_mm512_setzero_si512());
4255 }
4256 
4257 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4258 _mm512_unpackhi_epi64(__m512i __A, __m512i __B)
4259 {
4260   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4261                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4262 }
4263 
4264 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4265 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4266 {
4267   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4268                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
4269                                         (__v8di)__W);
4270 }
4271 
4272 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4273 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
4274 {
4275   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4276                                         (__v8di)_mm512_unpackhi_epi64(__A, __B),
4277                                         (__v8di)_mm512_setzero_si512());
4278 }
4279 
4280 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4281 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
4282 {
4283   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4284                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4285 }
4286 
4287 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4288 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4289 {
4290   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4291                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
4292                                         (__v8di)__W);
4293 }
4294 
4295 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4296 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
4297 {
4298   return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4299                                         (__v8di)_mm512_unpacklo_epi64(__A, __B),
4300                                         (__v8di)_mm512_setzero_si512());
4301 }
4302 
4303 
4304 /* SIMD load ops */
4305 
4306 static __inline __m512i __DEFAULT_FN_ATTRS512
4307 _mm512_loadu_si512 (void const *__P)
4308 {
4309   struct __loadu_si512 {
4310     __m512i_u __v;
4311   } __attribute__((__packed__, __may_alias__));
4312   return ((const struct __loadu_si512*)__P)->__v;
4313 }
4314 
4315 static __inline __m512i __DEFAULT_FN_ATTRS512
4316 _mm512_loadu_epi32 (void const *__P)
4317 {
4318   struct __loadu_epi32 {
4319     __m512i_u __v;
4320   } __attribute__((__packed__, __may_alias__));
4321   return ((const struct __loadu_epi32*)__P)->__v;
4322 }
4323 
4324 static __inline __m512i __DEFAULT_FN_ATTRS512
4325 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
4326 {
4327   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
4328                   (__v16si) __W,
4329                   (__mmask16) __U);
4330 }
4331 
4332 
4333 static __inline __m512i __DEFAULT_FN_ATTRS512
4334 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
4335 {
4336   return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
4337                                                      (__v16si)
4338                                                      _mm512_setzero_si512 (),
4339                                                      (__mmask16) __U);
4340 }
4341 
4342 static __inline __m512i __DEFAULT_FN_ATTRS512
4343 _mm512_loadu_epi64 (void const *__P)
4344 {
4345   struct __loadu_epi64 {
4346     __m512i_u __v;
4347   } __attribute__((__packed__, __may_alias__));
4348   return ((const struct __loadu_epi64*)__P)->__v;
4349 }
4350 
4351 static __inline __m512i __DEFAULT_FN_ATTRS512
4352 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
4353 {
4354   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
4355                   (__v8di) __W,
4356                   (__mmask8) __U);
4357 }
4358 
4359 static __inline __m512i __DEFAULT_FN_ATTRS512
4360 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
4361 {
4362   return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
4363                                                      (__v8di)
4364                                                      _mm512_setzero_si512 (),
4365                                                      (__mmask8) __U);
4366 }
4367 
4368 static __inline __m512 __DEFAULT_FN_ATTRS512
4369 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
4370 {
4371   return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
4372                    (__v16sf) __W,
4373                    (__mmask16) __U);
4374 }
4375 
4376 static __inline __m512 __DEFAULT_FN_ATTRS512
4377 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
4378 {
4379   return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
4380                                                   (__v16sf)
4381                                                   _mm512_setzero_ps (),
4382                                                   (__mmask16) __U);
4383 }
4384 
4385 static __inline __m512d __DEFAULT_FN_ATTRS512
4386 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
4387 {
4388   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
4389                 (__v8df) __W,
4390                 (__mmask8) __U);
4391 }
4392 
4393 static __inline __m512d __DEFAULT_FN_ATTRS512
4394 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
4395 {
4396   return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
4397                                                    (__v8df)
4398                                                    _mm512_setzero_pd (),
4399                                                    (__mmask8) __U);
4400 }
4401 
4402 static __inline __m512d __DEFAULT_FN_ATTRS512
4403 _mm512_loadu_pd(void const *__p)
4404 {
4405   struct __loadu_pd {
4406     __m512d_u __v;
4407   } __attribute__((__packed__, __may_alias__));
4408   return ((const struct __loadu_pd*)__p)->__v;
4409 }
4410 
4411 static __inline __m512 __DEFAULT_FN_ATTRS512
4412 _mm512_loadu_ps(void const *__p)
4413 {
4414   struct __loadu_ps {
4415     __m512_u __v;
4416   } __attribute__((__packed__, __may_alias__));
4417   return ((const struct __loadu_ps*)__p)->__v;
4418 }
4419 
4420 static __inline __m512 __DEFAULT_FN_ATTRS512
4421 _mm512_load_ps(void const *__p)
4422 {
4423   return *(const __m512*)__p;
4424 }
4425 
4426 static __inline __m512 __DEFAULT_FN_ATTRS512
4427 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
4428 {
4429   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
4430                    (__v16sf) __W,
4431                    (__mmask16) __U);
4432 }
4433 
4434 static __inline __m512 __DEFAULT_FN_ATTRS512
4435 _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
4436 {
4437   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
4438                                                   (__v16sf)
4439                                                   _mm512_setzero_ps (),
4440                                                   (__mmask16) __U);
4441 }
4442 
4443 static __inline __m512d __DEFAULT_FN_ATTRS512
4444 _mm512_load_pd(void const *__p)
4445 {
4446   return *(const __m512d*)__p;
4447 }
4448 
4449 static __inline __m512d __DEFAULT_FN_ATTRS512
4450 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
4451 {
4452   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
4453                           (__v8df) __W,
4454                           (__mmask8) __U);
4455 }
4456 
4457 static __inline __m512d __DEFAULT_FN_ATTRS512
4458 _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
4459 {
4460   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
4461                                                    (__v8df)
4462                                                    _mm512_setzero_pd (),
4463                                                    (__mmask8) __U);
4464 }
4465 
4466 static __inline __m512i __DEFAULT_FN_ATTRS512
4467 _mm512_load_si512 (void const *__P)
4468 {
4469   return *(const __m512i *) __P;
4470 }
4471 
4472 static __inline __m512i __DEFAULT_FN_ATTRS512
4473 _mm512_load_epi32 (void const *__P)
4474 {
4475   return *(const __m512i *) __P;
4476 }
4477 
4478 static __inline __m512i __DEFAULT_FN_ATTRS512
4479 _mm512_load_epi64 (void const *__P)
4480 {
4481   return *(const __m512i *) __P;
4482 }
4483 
4484 /* SIMD store ops */
4485 
4486 static __inline void __DEFAULT_FN_ATTRS512
4487 _mm512_storeu_epi64 (void *__P, __m512i __A)
4488 {
4489   struct __storeu_epi64 {
4490     __m512i_u __v;
4491   } __attribute__((__packed__, __may_alias__));
4492   ((struct __storeu_epi64*)__P)->__v = __A;
4493 }
4494 
4495 static __inline void __DEFAULT_FN_ATTRS512
4496 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
4497 {
4498   __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
4499                                      (__mmask8) __U);
4500 }
4501 
4502 static __inline void __DEFAULT_FN_ATTRS512
4503 _mm512_storeu_si512 (void *__P, __m512i __A)
4504 {
4505   struct __storeu_si512 {
4506     __m512i_u __v;
4507   } __attribute__((__packed__, __may_alias__));
4508   ((struct __storeu_si512*)__P)->__v = __A;
4509 }
4510 
4511 static __inline void __DEFAULT_FN_ATTRS512
4512 _mm512_storeu_epi32 (void *__P, __m512i __A)
4513 {
4514   struct __storeu_epi32 {
4515     __m512i_u __v;
4516   } __attribute__((__packed__, __may_alias__));
4517   ((struct __storeu_epi32*)__P)->__v = __A;
4518 }
4519 
4520 static __inline void __DEFAULT_FN_ATTRS512
4521 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
4522 {
4523   __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
4524                                      (__mmask16) __U);
4525 }
4526 
4527 static __inline void __DEFAULT_FN_ATTRS512
4528 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
4529 {
4530   __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
4531 }
4532 
4533 static __inline void __DEFAULT_FN_ATTRS512
4534 _mm512_storeu_pd(void *__P, __m512d __A)
4535 {
4536   struct __storeu_pd {
4537     __m512d_u __v;
4538   } __attribute__((__packed__, __may_alias__));
4539   ((struct __storeu_pd*)__P)->__v = __A;
4540 }
4541 
4542 static __inline void __DEFAULT_FN_ATTRS512
4543 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
4544 {
4545   __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
4546                                    (__mmask16) __U);
4547 }
4548 
4549 static __inline void __DEFAULT_FN_ATTRS512
4550 _mm512_storeu_ps(void *__P, __m512 __A)
4551 {
4552   struct __storeu_ps {
4553     __m512_u __v;
4554   } __attribute__((__packed__, __may_alias__));
4555   ((struct __storeu_ps*)__P)->__v = __A;
4556 }
4557 
4558 static __inline void __DEFAULT_FN_ATTRS512
4559 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
4560 {
4561   __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
4562 }
4563 
4564 static __inline void __DEFAULT_FN_ATTRS512
4565 _mm512_store_pd(void *__P, __m512d __A)
4566 {
4567   *(__m512d*)__P = __A;
4568 }
4569 
4570 static __inline void __DEFAULT_FN_ATTRS512
4571 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
4572 {
4573   __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
4574                                    (__mmask16) __U);
4575 }
4576 
4577 static __inline void __DEFAULT_FN_ATTRS512
4578 _mm512_store_ps(void *__P, __m512 __A)
4579 {
4580   *(__m512*)__P = __A;
4581 }
4582 
4583 static __inline void __DEFAULT_FN_ATTRS512
4584 _mm512_store_si512 (void *__P, __m512i __A)
4585 {
4586   *(__m512i *) __P = __A;
4587 }
4588 
4589 static __inline void __DEFAULT_FN_ATTRS512
4590 _mm512_store_epi32 (void *__P, __m512i __A)
4591 {
4592   *(__m512i *) __P = __A;
4593 }
4594 
4595 static __inline void __DEFAULT_FN_ATTRS512
4596 _mm512_store_epi64 (void *__P, __m512i __A)
4597 {
4598   *(__m512i *) __P = __A;
4599 }
4600 
4601 /* Mask ops */
4602 
4603 static __inline __mmask16 __DEFAULT_FN_ATTRS
4604 _mm512_knot(__mmask16 __M)
4605 {
4606   return __builtin_ia32_knothi(__M);
4607 }
4608 
4609 /* Integer compare */
4610 
/* Named epi32 comparisons: each macro forwards to _mm512_cmp_epi32_mask (or
   to _mm512_mask_cmp_epi32_mask for the masked form) with the matching
   _MM_CMPINT_* predicate. */
#define _mm512_cmpeq_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_GT)
#define _mm512_cmple_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_LE)
#define _mm512_cmplt_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi32_mask(a, b) \
  _mm512_cmp_epi32_mask((a), (b), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi32_mask(m, a, b) \
  _mm512_mask_cmp_epi32_mask((m), (a), (b), _MM_CMPINT_NE)
4635 
/* Named epu32 comparisons: each macro forwards to _mm512_cmp_epu32_mask (or
   to _mm512_mask_cmp_epu32_mask for the masked form) with the matching
   _MM_CMPINT_* predicate. */
#define _mm512_cmpeq_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_GT)
#define _mm512_cmple_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_LE)
#define _mm512_cmplt_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu32_mask(a, b) \
  _mm512_cmp_epu32_mask((a), (b), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu32_mask(m, a, b) \
  _mm512_mask_cmp_epu32_mask((m), (a), (b), _MM_CMPINT_NE)
4660 
/* Named epi64 comparisons: each macro forwards to _mm512_cmp_epi64_mask (or
   to _mm512_mask_cmp_epi64_mask for the masked form) with the matching
   _MM_CMPINT_* predicate. */
#define _mm512_cmpeq_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_GT)
#define _mm512_cmple_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_LE)
#define _mm512_cmplt_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi64_mask(a, b) \
  _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi64_mask(m, a, b) \
  _mm512_mask_cmp_epi64_mask((m), (a), (b), _MM_CMPINT_NE)
4685 
/* Named epu64 comparisons: each macro forwards to _mm512_cmp_epu64_mask (or
   to _mm512_mask_cmp_epu64_mask for the masked form) with the matching
   _MM_CMPINT_* predicate. */
#define _mm512_cmpeq_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_GT)
#define _mm512_cmple_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_LE)
#define _mm512_cmplt_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu64_mask(a, b) \
  _mm512_cmp_epu64_mask((a), (b), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu64_mask(m, a, b) \
  _mm512_mask_cmp_epu64_mask((m), (a), (b), _MM_CMPINT_NE)
4710 
4711 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4712 _mm512_cvtepi8_epi32(__m128i __A)
4713 {
4714   /* This function always performs a signed extension, but __v16qi is a char
4715      which may be signed or unsigned, so use __v16qs. */
4716   return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
4717 }
4718 
4719 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4720 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
4721 {
4722   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4723                                              (__v16si)_mm512_cvtepi8_epi32(__A),
4724                                              (__v16si)__W);
4725 }
4726 
4727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4728 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
4729 {
4730   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4731                                              (__v16si)_mm512_cvtepi8_epi32(__A),
4732                                              (__v16si)_mm512_setzero_si512());
4733 }
4734 
4735 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4736 _mm512_cvtepi8_epi64(__m128i __A)
4737 {
4738   /* This function always performs a signed extension, but __v16qi is a char
4739      which may be signed or unsigned, so use __v16qs. */
4740   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4741 }
4742 
4743 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4744 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4745 {
4746   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4747                                              (__v8di)_mm512_cvtepi8_epi64(__A),
4748                                              (__v8di)__W);
4749 }
4750 
4751 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4752 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
4753 {
4754   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4755                                              (__v8di)_mm512_cvtepi8_epi64(__A),
4756                                              (__v8di)_mm512_setzero_si512 ());
4757 }
4758 
4759 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4760 _mm512_cvtepi32_epi64(__m256i __X)
4761 {
4762   return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
4763 }
4764 
4765 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4766 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
4767 {
4768   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4769                                              (__v8di)_mm512_cvtepi32_epi64(__X),
4770                                              (__v8di)__W);
4771 }
4772 
4773 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4774 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
4775 {
4776   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4777                                              (__v8di)_mm512_cvtepi32_epi64(__X),
4778                                              (__v8di)_mm512_setzero_si512());
4779 }
4780 
4781 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4782 _mm512_cvtepi16_epi32(__m256i __A)
4783 {
4784   return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
4785 }
4786 
4787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4788 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
4789 {
4790   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4791                                             (__v16si)_mm512_cvtepi16_epi32(__A),
4792                                             (__v16si)__W);
4793 }
4794 
4795 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4796 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
4797 {
4798   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4799                                             (__v16si)_mm512_cvtepi16_epi32(__A),
4800                                             (__v16si)_mm512_setzero_si512 ());
4801 }
4802 
4803 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4804 _mm512_cvtepi16_epi64(__m128i __A)
4805 {
4806   return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
4807 }
4808 
4809 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4810 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4811 {
4812   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4813                                              (__v8di)_mm512_cvtepi16_epi64(__A),
4814                                              (__v8di)__W);
4815 }
4816 
4817 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4818 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
4819 {
4820   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4821                                              (__v8di)_mm512_cvtepi16_epi64(__A),
4822                                              (__v8di)_mm512_setzero_si512());
4823 }
4824 
4825 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4826 _mm512_cvtepu8_epi32(__m128i __A)
4827 {
4828   return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
4829 }
4830 
4831 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4832 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
4833 {
4834   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4835                                              (__v16si)_mm512_cvtepu8_epi32(__A),
4836                                              (__v16si)__W);
4837 }
4838 
4839 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4840 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
4841 {
4842   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4843                                              (__v16si)_mm512_cvtepu8_epi32(__A),
4844                                              (__v16si)_mm512_setzero_si512());
4845 }
4846 
4847 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4848 _mm512_cvtepu8_epi64(__m128i __A)
4849 {
4850   return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4851 }
4852 
4853 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4854 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4855 {
4856   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4857                                              (__v8di)_mm512_cvtepu8_epi64(__A),
4858                                              (__v8di)__W);
4859 }
4860 
4861 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4862 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
4863 {
4864   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4865                                              (__v8di)_mm512_cvtepu8_epi64(__A),
4866                                              (__v8di)_mm512_setzero_si512());
4867 }
4868 
4869 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4870 _mm512_cvtepu32_epi64(__m256i __X)
4871 {
4872   return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
4873 }
4874 
4875 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4876 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
4877 {
4878   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4879                                              (__v8di)_mm512_cvtepu32_epi64(__X),
4880                                              (__v8di)__W);
4881 }
4882 
4883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4884 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
4885 {
4886   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4887                                              (__v8di)_mm512_cvtepu32_epi64(__X),
4888                                              (__v8di)_mm512_setzero_si512());
4889 }
4890 
4891 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4892 _mm512_cvtepu16_epi32(__m256i __A)
4893 {
4894   return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
4895 }
4896 
4897 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4898 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
4899 {
4900   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4901                                             (__v16si)_mm512_cvtepu16_epi32(__A),
4902                                             (__v16si)__W);
4903 }
4904 
4905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4906 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
4907 {
4908   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4909                                             (__v16si)_mm512_cvtepu16_epi32(__A),
4910                                             (__v16si)_mm512_setzero_si512());
4911 }
4912 
4913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4914 _mm512_cvtepu16_epi64(__m128i __A)
4915 {
4916   return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
4917 }
4918 
4919 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4920 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
4921 {
4922   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4923                                              (__v8di)_mm512_cvtepu16_epi64(__A),
4924                                              (__v8di)__W);
4925 }
4926 
4927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4928 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
4929 {
4930   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4931                                              (__v8di)_mm512_cvtepu16_epi64(__A),
4932                                              (__v8di)_mm512_setzero_si512());
4933 }
4934 
4935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4936 _mm512_rorv_epi32 (__m512i __A, __m512i __B)
4937 {
4938   return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
4939 }
4940 
4941 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4942 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4943 {
4944   return (__m512i)__builtin_ia32_selectd_512(__U,
4945                                            (__v16si)_mm512_rorv_epi32(__A, __B),
4946                                            (__v16si)__W);
4947 }
4948 
4949 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4950 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
4951 {
4952   return (__m512i)__builtin_ia32_selectd_512(__U,
4953                                            (__v16si)_mm512_rorv_epi32(__A, __B),
4954                                            (__v16si)_mm512_setzero_si512());
4955 }
4956 
4957 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4958 _mm512_rorv_epi64 (__m512i __A, __m512i __B)
4959 {
4960   return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
4961 }
4962 
4963 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4964 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
4965 {
4966   return (__m512i)__builtin_ia32_selectq_512(__U,
4967                                             (__v8di)_mm512_rorv_epi64(__A, __B),
4968                                             (__v8di)__W);
4969 }
4970 
4971 static __inline__ __m512i __DEFAULT_FN_ATTRS512
4972 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
4973 {
4974   return (__m512i)__builtin_ia32_selectq_512(__U,
4975                                             (__v8di)_mm512_rorv_epi64(__A, __B),
4976                                             (__v8di)_mm512_setzero_si512());
4977 }
4978 
4979 
4980 
/* Generic integer compares taking the predicate P as an argument.  The
   unmasked forms pass an all-ones mask ((__mmask16)-1 or (__mmask8)-1) as
   the builtin's last operand. */
#define _mm512_cmp_epi32_mask(A, B, P) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(A), \
                                          (__v16si)(__m512i)(B), (int)(P), \
                                          (__mmask16)-1))

#define _mm512_cmp_epu32_mask(A, B, P) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(A), \
                                           (__v16si)(__m512i)(B), (int)(P), \
                                           (__mmask16)-1))

#define _mm512_cmp_epi64_mask(A, B, P) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(A), \
                                         (__v8di)(__m512i)(B), (int)(P), \
                                         (__mmask8)-1))

#define _mm512_cmp_epu64_mask(A, B, P) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(A), \
                                          (__v8di)(__m512i)(B), (int)(P), \
                                          (__mmask8)-1))
5000 
/* Masked generic integer compares: same builtins as the unmasked forms, with
   the caller's mask M as the last operand instead of all ones. */
#define _mm512_mask_cmp_epi32_mask(M, A, B, P) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(A), \
                                          (__v16si)(__m512i)(B), (int)(P), \
                                          (__mmask16)(M)))

#define _mm512_mask_cmp_epu32_mask(M, A, B, P) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(A), \
                                           (__v16si)(__m512i)(B), (int)(P), \
                                           (__mmask16)(M)))

#define _mm512_mask_cmp_epi64_mask(M, A, B, P) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(A), \
                                         (__v8di)(__m512i)(B), (int)(P), \
                                         (__mmask8)(M)))

#define _mm512_mask_cmp_epu64_mask(M, A, B, P) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(A), \
                                          (__v8di)(__m512i)(B), (int)(P), \
                                          (__mmask8)(M)))
5020 
5021 #define _mm512_rol_epi32(a, b) \
5022   ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
5023 
/* Blends _mm512_rol_epi32(a, b) with W under mask U through
 * __builtin_ia32_selectd_512. */
#define _mm512_mask_rol_epi32(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_rol_epi32((a), (b)), \
      (__v16si)(__m512i)(W)))
5028 
/* Blends _mm512_rol_epi32(a, b) with _mm512_setzero_si512() under mask U
 * through __builtin_ia32_selectd_512. */
#define _mm512_maskz_rol_epi32(U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_rol_epi32((a), (b)), \
      (__v16si)_mm512_setzero_si512()))
5033 
/* Expands to __builtin_ia32_prolq512 on a (viewed as __v8di) with b
 * converted to int; the result is cast to __m512i. */
#define _mm512_rol_epi64(a, b) \
  ((__m512i)__builtin_ia32_prolq512( \
      (__v8di)(__m512i)(a), \
      (int)(b)))
5036 
/* Blends _mm512_rol_epi64(a, b) with W under mask U through
 * __builtin_ia32_selectq_512. */
#define _mm512_mask_rol_epi64(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_rol_epi64((a), (b)), \
      (__v8di)(__m512i)(W)))
5041 
/* Blends _mm512_rol_epi64(a, b) with _mm512_setzero_si512() under mask U
 * through __builtin_ia32_selectq_512. */
#define _mm512_maskz_rol_epi64(U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_rol_epi64((a), (b)), \
      (__v8di)_mm512_setzero_si512()))
5046 
5047 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5048 _mm512_rolv_epi32 (__m512i __A, __m512i __B)
5049 {
5050   return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
5051 }
5052 
5053 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5054 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
5055 {
5056   return (__m512i)__builtin_ia32_selectd_512(__U,
5057                                            (__v16si)_mm512_rolv_epi32(__A, __B),
5058                                            (__v16si)__W);
5059 }
5060 
5061 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5062 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
5063 {
5064   return (__m512i)__builtin_ia32_selectd_512(__U,
5065                                            (__v16si)_mm512_rolv_epi32(__A, __B),
5066                                            (__v16si)_mm512_setzero_si512());
5067 }
5068 
5069 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5070 _mm512_rolv_epi64 (__m512i __A, __m512i __B)
5071 {
5072   return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
5073 }
5074 
5075 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5076 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
5077 {
5078   return (__m512i)__builtin_ia32_selectq_512(__U,
5079                                             (__v8di)_mm512_rolv_epi64(__A, __B),
5080                                             (__v8di)__W);
5081 }
5082 
5083 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5084 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
5085 {
5086   return (__m512i)__builtin_ia32_selectq_512(__U,
5087                                             (__v8di)_mm512_rolv_epi64(__A, __B),
5088                                             (__v8di)_mm512_setzero_si512());
5089 }
5090 
/* Expands to __builtin_ia32_prord512 on A (viewed as __v16si) with B
 * converted to int; the result is cast to __m512i. */
#define _mm512_ror_epi32(A, B) \
  ((__m512i)__builtin_ia32_prord512( \
      (__v16si)(__m512i)(A), \
      (int)(B)))
5093 
/* Blends _mm512_ror_epi32(A, B) with W under mask U through
 * __builtin_ia32_selectd_512. */
#define _mm512_mask_ror_epi32(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_ror_epi32((A), (B)), \
      (__v16si)(__m512i)(W)))
5098 
/* Blends _mm512_ror_epi32(A, B) with _mm512_setzero_si512() under mask U
 * through __builtin_ia32_selectd_512. */
#define _mm512_maskz_ror_epi32(U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_ror_epi32((A), (B)), \
      (__v16si)_mm512_setzero_si512()))
5103 
/* Expands to __builtin_ia32_prorq512 on A (viewed as __v8di) with B
 * converted to int; the result is cast to __m512i. */
#define _mm512_ror_epi64(A, B) \
  ((__m512i)__builtin_ia32_prorq512( \
      (__v8di)(__m512i)(A), \
      (int)(B)))
5106 
/* Blends _mm512_ror_epi64(A, B) with W under mask U through
 * __builtin_ia32_selectq_512. */
#define _mm512_mask_ror_epi64(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_ror_epi64((A), (B)), \
      (__v8di)(__m512i)(W)))
5111 
/* Blends _mm512_ror_epi64(A, B) with _mm512_setzero_si512() under mask U
 * through __builtin_ia32_selectq_512. */
#define _mm512_maskz_ror_epi64(U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_ror_epi64((A), (B)), \
      (__v8di)_mm512_setzero_si512()))
5116 
5117 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5118 _mm512_slli_epi32(__m512i __A, unsigned int __B)
5119 {
5120   return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
5121 }
5122 
5123 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5124 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
5125                        unsigned int __B)
5126 {
5127   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5128                                          (__v16si)_mm512_slli_epi32(__A, __B),
5129                                          (__v16si)__W);
5130 }
5131 
5132 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5133 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
5134   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5135                                          (__v16si)_mm512_slli_epi32(__A, __B),
5136                                          (__v16si)_mm512_setzero_si512());
5137 }
5138 
5139 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5140 _mm512_slli_epi64(__m512i __A, unsigned int __B)
5141 {
5142   return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
5143 }
5144 
5145 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5146 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
5147 {
5148   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5149                                           (__v8di)_mm512_slli_epi64(__A, __B),
5150                                           (__v8di)__W);
5151 }
5152 
5153 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5154 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
5155 {
5156   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5157                                           (__v8di)_mm512_slli_epi64(__A, __B),
5158                                           (__v8di)_mm512_setzero_si512());
5159 }
5160 
5161 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5162 _mm512_srli_epi32(__m512i __A, unsigned int __B)
5163 {
5164   return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
5165 }
5166 
5167 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5168 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
5169                        unsigned int __B)
5170 {
5171   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5172                                          (__v16si)_mm512_srli_epi32(__A, __B),
5173                                          (__v16si)__W);
5174 }
5175 
5176 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5177 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
5178   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5179                                          (__v16si)_mm512_srli_epi32(__A, __B),
5180                                          (__v16si)_mm512_setzero_si512());
5181 }
5182 
5183 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5184 _mm512_srli_epi64(__m512i __A, unsigned int __B)
5185 {
5186   return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
5187 }
5188 
5189 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5190 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
5191                        unsigned int __B)
5192 {
5193   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5194                                           (__v8di)_mm512_srli_epi64(__A, __B),
5195                                           (__v8di)__W);
5196 }
5197 
5198 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5199 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
5200                         unsigned int __B)
5201 {
5202   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5203                                           (__v8di)_mm512_srli_epi64(__A, __B),
5204                                           (__v8di)_mm512_setzero_si512());
5205 }
5206 
5207 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5208 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
5209 {
5210   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5211               (__v16si) __W,
5212               (__mmask16) __U);
5213 }
5214 
5215 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5216 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
5217 {
5218   return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5219               (__v16si)
5220               _mm512_setzero_si512 (),
5221               (__mmask16) __U);
5222 }
5223 
5224 static __inline__ void __DEFAULT_FN_ATTRS512
5225 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
5226 {
5227   __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
5228           (__mmask16) __U);
5229 }
5230 
5231 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5232 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
5233 {
5234   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5235                  (__v16si) __A,
5236                  (__v16si) __W);
5237 }
5238 
5239 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5240 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
5241 {
5242   return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5243                  (__v16si) __A,
5244                  (__v16si) _mm512_setzero_si512 ());
5245 }
5246 
5247 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5248 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
5249 {
5250   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5251                  (__v8di) __A,
5252                  (__v8di) __W);
5253 }
5254 
5255 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5256 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
5257 {
5258   return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5259                  (__v8di) __A,
5260                  (__v8di) _mm512_setzero_si512 ());
5261 }
5262 
5263 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5264 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
5265 {
5266   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5267               (__v8di) __W,
5268               (__mmask8) __U);
5269 }
5270 
5271 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5272 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
5273 {
5274   return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5275               (__v8di)
5276               _mm512_setzero_si512 (),
5277               (__mmask8) __U);
5278 }
5279 
5280 static __inline__ void __DEFAULT_FN_ATTRS512
5281 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
5282 {
5283   __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
5284           (__mmask8) __U);
5285 }
5286 
5287 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5288 _mm512_movedup_pd (__m512d __A)
5289 {
5290   return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
5291                                           0, 0, 2, 2, 4, 4, 6, 6);
5292 }
5293 
5294 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5295 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
5296 {
5297   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
5298                                               (__v8df)_mm512_movedup_pd(__A),
5299                                               (__v8df)__W);
5300 }
5301 
5302 static __inline__ __m512d __DEFAULT_FN_ATTRS512
5303 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
5304 {
5305   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
5306                                               (__v8df)_mm512_movedup_pd(__A),
5307                                               (__v8df)_mm512_setzero_pd());
5308 }
5309 
/* Expands to __builtin_ia32_fixupimmpd512_mask on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, an all-ones mask and rounding argument R. */
#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      (int)(R)))
5315 
/* Expands to __builtin_ia32_fixupimmpd512_mask on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, mask U and rounding argument R. */
#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5321 
/* Expands to __builtin_ia32_fixupimmpd512_mask on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm512_fixupimm_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
5328 
/* Expands to __builtin_ia32_fixupimmpd512_mask on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5335 
/* Expands to __builtin_ia32_fixupimmpd512_maskz on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, mask U and rounding argument R. */
#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5342 
/* Expands to __builtin_ia32_fixupimmpd512_maskz on A, B (as __v8df) and C
 * (as __v8di) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), \
      (__v8di)(__m512i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5349 
/* Expands to __builtin_ia32_fixupimmps512_mask on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, an all-ones mask and rounding argument R. */
#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)-1, \
      (int)(R)))
5355 
/* Expands to __builtin_ia32_fixupimmps512_mask on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, mask U and rounding argument R. */
#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)(U), \
      (int)(R)))
5361 
/* Expands to __builtin_ia32_fixupimmps512_mask on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm512_fixupimm_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))
5368 
/* Expands to __builtin_ia32_fixupimmps512_mask on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))
5375 
/* Expands to __builtin_ia32_fixupimmps512_maskz on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, mask U and rounding argument R. */
#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)(U), \
      (int)(R)))
5382 
/* Expands to __builtin_ia32_fixupimmps512_maskz on A, B (as __v16sf) and C
 * (as __v16si) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), \
      (__v16si)(__m512i)(C), \
      (int)(imm), \
      (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))
5389 
/* Expands to __builtin_ia32_fixupimmsd_mask on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, an all-ones mask and rounding argument R. */
#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      (int)(R)))
5395 
/* Expands to __builtin_ia32_fixupimmsd_mask on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, mask U and rounding argument R. */
#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5401 
/* Expands to __builtin_ia32_fixupimmsd_mask on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm_fixupimm_sd(A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
5408 
/* Expands to __builtin_ia32_fixupimmsd_mask on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5415 
/* Expands to __builtin_ia32_fixupimmsd_maskz on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, mask U and rounding argument R. */
#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5421 
/* Expands to __builtin_ia32_fixupimmsd_maskz on A, B (as __v2df) and C
 * (as __v2di) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2di)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5428 
/* Expands to __builtin_ia32_fixupimmss_mask on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, an all-ones mask and rounding argument R. */
#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      (int)(R)))
5434 
/* Expands to __builtin_ia32_fixupimmss_mask on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, mask U and rounding argument R. */
#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5440 
/* Expands to __builtin_ia32_fixupimmss_mask on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, an all-ones mask and
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
#define _mm_fixupimm_ss(A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
5447 
/* Expands to __builtin_ia32_fixupimmss_mask on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5454 
/* Expands to __builtin_ia32_fixupimmss_maskz on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, mask U and rounding argument R. */
#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_maskz( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      (int)(R)))
5460 
/* Expands to __builtin_ia32_fixupimmss_maskz on A, B (as __v4sf) and C
 * (as __v4si) with immediate imm, mask U and _MM_FROUND_CUR_DIRECTION as
 * the rounding argument. */
#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_maskz( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4si)(__m128i)(C), \
      (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5467 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B (as __v2df)
 * with _mm_setzero_pd() as the pass-through operand, an all-ones mask and
 * rounding argument R. */
#define _mm_getexp_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, \
      (int)(R)))
5473 
5474 
5475 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5476 _mm_getexp_sd (__m128d __A, __m128d __B)
5477 {
5478   return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
5479                  (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
5480 }
5481 
5482 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5483 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
5484 {
5485  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
5486           (__v2df) __B,
5487           (__v2df) __W,
5488           (__mmask8) __U,
5489           _MM_FROUND_CUR_DIRECTION);
5490 }
5491 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B (as __v2df)
 * with W as the third operand, mask U and rounding argument R. */
#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), \
      (int)(R)))
5497 
5498 static __inline__ __m128d __DEFAULT_FN_ATTRS128
5499 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
5500 {
5501  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
5502           (__v2df) __B,
5503           (__v2df) _mm_setzero_pd (),
5504           (__mmask8) __U,
5505           _MM_FROUND_CUR_DIRECTION);
5506 }
5507 
/* Expands to __builtin_ia32_getexpsd128_round_mask on A and B (as __v2df)
 * with _mm_setzero_pd() as the third operand, mask U and rounding
 * argument R. */
#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
5513 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B (as __v4sf)
 * with _mm_setzero_ps() as the third operand, an all-ones mask and rounding
 * argument R. */
#define _mm_getexp_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, \
      (int)(R)))
5519 
5520 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5521 _mm_getexp_ss (__m128 __A, __m128 __B)
5522 {
5523   return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5524                 (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
5525 }
5526 
5527 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5528 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
5529 {
5530  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5531           (__v4sf) __B,
5532           (__v4sf) __W,
5533           (__mmask8) __U,
5534           _MM_FROUND_CUR_DIRECTION);
5535 }
5536 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B (as __v4sf)
 * with W as the third operand, mask U and rounding argument R. */
#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), \
      (int)(R)))
5542 
5543 static __inline__ __m128 __DEFAULT_FN_ATTRS128
5544 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
5545 {
5546  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
5547           (__v4sf) __B,
5548           (__v4sf) _mm_setzero_ps (),
5549           (__mmask8) __U,
5550           _MM_FROUND_CUR_DIRECTION);
5551 }
5552 
/* Expands to __builtin_ia32_getexpss128_round_mask on A and B (as __v4sf)
 * with _mm_setzero_ps() as the third operand, mask U and rounding
 * argument R. */
#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
5558 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_pd(), an all-ones mask and rounding argument R. */
#define _mm_getmant_round_sd(A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, \
      (int)(R)))
5565 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_pd(), an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
#define _mm_getmant_sd(A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
5573 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are W,
 * mask U and _MM_FROUND_CUR_DIRECTION. */
#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5581 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are W,
 * mask U and rounding argument R. */
#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), \
      (int)(R)))
5588 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_pd(), mask U and _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5596 
/* Expands to __builtin_ia32_getmantsd_round_mask on A and B (as __v2df).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_pd(), mask U and rounding argument R. */
#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask( \
      (__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), \
      (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
5603 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_ps(), an all-ones mask and rounding argument R. */
#define _mm_getmant_round_ss(A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, \
      (int)(R)))
5610 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_ps(), an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
#define _mm_getmant_ss(A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
5618 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are W,
 * mask U and _MM_FROUND_CUR_DIRECTION. */
#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5626 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are W,
 * mask U and rounding argument R. */
#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), \
      (int)(R)))
5633 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_ps(), mask U and _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
5641 
/* Expands to __builtin_ia32_getmantss_round_mask on A and B (as __v4sf).
 * The immediate is built as ((D)<<2) | (C); the remaining operands are
 * _mm_setzero_ps(), mask U and rounding argument R. */
#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask( \
      (__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), \
      (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
5648 
/// Returns the mask \a __A unchanged.
5649 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
5650 _mm512_kmov (__mmask16 __A)
5651 {
5652   return  __A;
5653 }
5654 
/// Expands to __builtin_ia32_vcomisd on \a A and \a B (both cast to __v2df),
/// followed by \a P and \a R as ints; the result is cast to int.
5655 #define _mm_comi_round_sd(A, B, P, R) \
5656   ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
5657                                (int)(P), (int)(R)))
5658 
/// Expands to __builtin_ia32_vcomiss on \a A and \a B (both cast to __v4sf),
/// followed by \a P and \a R as ints; the result is cast to int.
5659 #define _mm_comi_round_ss(A, B, P, R) \
5660   ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
5661                                (int)(P), (int)(R)))
5662 
5663 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvtsd2si64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to long long.
5664 #define _mm_cvt_roundsd_si64(A, R) \
5665   ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
5666 #endif
5667 
/// Thin wrapper around __builtin_ia32_pslld512: \a __A is passed as __v16si
/// and \a __B as __v4si; the result is returned as __m512i.
5668 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5669 _mm512_sll_epi32(__m512i __A, __m128i __B)
5670 {
5671   return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
5672 }
5673 
/// Masked form of _mm512_sll_epi32: __builtin_ia32_selectd_512 chooses, under
/// the 16-bit mask \a __U, between _mm512_sll_epi32(__A, __B) and \a __W.
5674 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5675 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5676 {
5677   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5678                                           (__v16si)_mm512_sll_epi32(__A, __B),
5679                                           (__v16si)__W);
5680 }
5681 
/// Masked form of _mm512_sll_epi32: __builtin_ia32_selectd_512 chooses, under
/// mask \a __U, between _mm512_sll_epi32(__A, __B) and
/// _mm512_setzero_si512().
5682 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5683 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5684 {
5685   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5686                                           (__v16si)_mm512_sll_epi32(__A, __B),
5687                                           (__v16si)_mm512_setzero_si512());
5688 }
5689 
/// Thin wrapper around __builtin_ia32_psllq512: \a __A is passed as __v8di
/// and \a __B as __v2di; the result is returned as __m512i.
5690 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5691 _mm512_sll_epi64(__m512i __A, __m128i __B)
5692 {
5693   return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
5694 }
5695 
/// Masked form of _mm512_sll_epi64: __builtin_ia32_selectq_512 chooses, under
/// the 8-bit mask \a __U, between _mm512_sll_epi64(__A, __B) and \a __W.
5696 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5697 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5698 {
5699   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5700                                              (__v8di)_mm512_sll_epi64(__A, __B),
5701                                              (__v8di)__W);
5702 }
5703 
/// Masked form of _mm512_sll_epi64: __builtin_ia32_selectq_512 chooses, under
/// mask \a __U, between _mm512_sll_epi64(__A, __B) and
/// _mm512_setzero_si512().
5704 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5705 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5706 {
5707   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5708                                            (__v8di)_mm512_sll_epi64(__A, __B),
5709                                            (__v8di)_mm512_setzero_si512());
5710 }
5711 
/// Thin wrapper around __builtin_ia32_psllv16si: both \a __X and \a __Y are
/// passed as __v16si; the result is returned as __m512i.
5712 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5713 _mm512_sllv_epi32(__m512i __X, __m512i __Y)
5714 {
5715   return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
5716 }
5717 
/// Masked form of _mm512_sllv_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_sllv_epi32(__X, __Y) and \a __W.
5718 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5719 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5720 {
5721   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5722                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
5723                                            (__v16si)__W);
5724 }
5725 
/// Masked form of _mm512_sllv_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_sllv_epi32(__X, __Y) and
/// _mm512_setzero_si512().
5726 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5727 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5728 {
5729   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5730                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
5731                                            (__v16si)_mm512_setzero_si512());
5732 }
5733 
/// Thin wrapper around __builtin_ia32_psllv8di: both \a __X and \a __Y are
/// passed as __v8di; the result is returned as __m512i.
5734 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5735 _mm512_sllv_epi64(__m512i __X, __m512i __Y)
5736 {
5737   return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
5738 }
5739 
/// Masked form of _mm512_sllv_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_sllv_epi64(__X, __Y) and \a __W.
5740 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5741 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5742 {
5743   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5744                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
5745                                             (__v8di)__W);
5746 }
5747 
/// Masked form of _mm512_sllv_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_sllv_epi64(__X, __Y) and
/// _mm512_setzero_si512().
5748 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5749 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5750 {
5751   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5752                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
5753                                             (__v8di)_mm512_setzero_si512());
5754 }
5755 
/// Thin wrapper around __builtin_ia32_psrad512: \a __A is passed as __v16si
/// and \a __B as __v4si; the result is returned as __m512i.
5756 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5757 _mm512_sra_epi32(__m512i __A, __m128i __B)
5758 {
5759   return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
5760 }
5761 
/// Masked form of _mm512_sra_epi32: __builtin_ia32_selectd_512 chooses, under
/// mask \a __U, between _mm512_sra_epi32(__A, __B) and \a __W.
5762 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5763 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5764 {
5765   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5766                                           (__v16si)_mm512_sra_epi32(__A, __B),
5767                                           (__v16si)__W);
5768 }
5769 
/// Masked form of _mm512_sra_epi32: __builtin_ia32_selectd_512 chooses, under
/// mask \a __U, between _mm512_sra_epi32(__A, __B) and
/// _mm512_setzero_si512().
5770 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5771 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5772 {
5773   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5774                                           (__v16si)_mm512_sra_epi32(__A, __B),
5775                                           (__v16si)_mm512_setzero_si512());
5776 }
5777 
/// Thin wrapper around __builtin_ia32_psraq512: \a __A is passed as __v8di
/// and \a __B as __v2di; the result is returned as __m512i.
5778 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5779 _mm512_sra_epi64(__m512i __A, __m128i __B)
5780 {
5781   return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
5782 }
5783 
/// Masked form of _mm512_sra_epi64: __builtin_ia32_selectq_512 chooses, under
/// mask \a __U, between _mm512_sra_epi64(__A, __B) and \a __W.
5784 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5785 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5786 {
5787   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5788                                            (__v8di)_mm512_sra_epi64(__A, __B),
5789                                            (__v8di)__W);
5790 }
5791 
/// Masked form of _mm512_sra_epi64: __builtin_ia32_selectq_512 chooses, under
/// mask \a __U, between _mm512_sra_epi64(__A, __B) and
/// _mm512_setzero_si512().
5792 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5793 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5794 {
5795   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5796                                            (__v8di)_mm512_sra_epi64(__A, __B),
5797                                            (__v8di)_mm512_setzero_si512());
5798 }
5799 
/// Thin wrapper around __builtin_ia32_psrav16si: both \a __X and \a __Y are
/// passed as __v16si; the result is returned as __m512i.
5800 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5801 _mm512_srav_epi32(__m512i __X, __m512i __Y)
5802 {
5803   return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
5804 }
5805 
/// Masked form of _mm512_srav_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_srav_epi32(__X, __Y) and \a __W.
5806 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5807 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5808 {
5809   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5810                                            (__v16si)_mm512_srav_epi32(__X, __Y),
5811                                            (__v16si)__W);
5812 }
5813 
/// Masked form of _mm512_srav_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_srav_epi32(__X, __Y) and
/// _mm512_setzero_si512().
5814 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5815 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5816 {
5817   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5818                                            (__v16si)_mm512_srav_epi32(__X, __Y),
5819                                            (__v16si)_mm512_setzero_si512());
5820 }
5821 
/// Thin wrapper around __builtin_ia32_psrav8di: both \a __X and \a __Y are
/// passed as __v8di; the result is returned as __m512i.
5822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5823 _mm512_srav_epi64(__m512i __X, __m512i __Y)
5824 {
5825   return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
5826 }
5827 
/// Masked form of _mm512_srav_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_srav_epi64(__X, __Y) and \a __W.
5828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5829 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5830 {
5831   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5832                                             (__v8di)_mm512_srav_epi64(__X, __Y),
5833                                             (__v8di)__W);
5834 }
5835 
/// Masked form of _mm512_srav_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_srav_epi64(__X, __Y) and
/// _mm512_setzero_si512().
5836 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5837 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5838 {
5839   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5840                                             (__v8di)_mm512_srav_epi64(__X, __Y),
5841                                             (__v8di)_mm512_setzero_si512());
5842 }
5843 
/// Thin wrapper around __builtin_ia32_psrld512: \a __A is passed as __v16si
/// and \a __B as __v4si; the result is returned as __m512i.
5844 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5845 _mm512_srl_epi32(__m512i __A, __m128i __B)
5846 {
5847   return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
5848 }
5849 
/// Masked form of _mm512_srl_epi32: __builtin_ia32_selectd_512 chooses, under
/// mask \a __U, between _mm512_srl_epi32(__A, __B) and \a __W.
5850 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5851 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5852 {
5853   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5854                                           (__v16si)_mm512_srl_epi32(__A, __B),
5855                                           (__v16si)__W);
5856 }
5857 
/// Masked form of _mm512_srl_epi32: __builtin_ia32_selectd_512 chooses, under
/// mask \a __U, between _mm512_srl_epi32(__A, __B) and
/// _mm512_setzero_si512().
5858 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5859 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5860 {
5861   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5862                                           (__v16si)_mm512_srl_epi32(__A, __B),
5863                                           (__v16si)_mm512_setzero_si512());
5864 }
5865 
/// Thin wrapper around __builtin_ia32_psrlq512: \a __A is passed as __v8di
/// and \a __B as __v2di; the result is returned as __m512i.
5866 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5867 _mm512_srl_epi64(__m512i __A, __m128i __B)
5868 {
5869   return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
5870 }
5871 
/// Masked form of _mm512_srl_epi64: __builtin_ia32_selectq_512 chooses, under
/// mask \a __U, between _mm512_srl_epi64(__A, __B) and \a __W.
5872 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5873 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
5874 {
5875   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5876                                            (__v8di)_mm512_srl_epi64(__A, __B),
5877                                            (__v8di)__W);
5878 }
5879 
/// Masked form of _mm512_srl_epi64: __builtin_ia32_selectq_512 chooses, under
/// mask \a __U, between _mm512_srl_epi64(__A, __B) and
/// _mm512_setzero_si512().
5880 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5881 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
5882 {
5883   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5884                                            (__v8di)_mm512_srl_epi64(__A, __B),
5885                                            (__v8di)_mm512_setzero_si512());
5886 }
5887 
/// Thin wrapper around __builtin_ia32_psrlv16si: both \a __X and \a __Y are
/// passed as __v16si; the result is returned as __m512i.
5888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5889 _mm512_srlv_epi32(__m512i __X, __m512i __Y)
5890 {
5891   return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
5892 }
5893 
/// Masked form of _mm512_srlv_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_srlv_epi32(__X, __Y) and \a __W.
5894 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5895 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
5896 {
5897   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5898                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
5899                                            (__v16si)__W);
5900 }
5901 
/// Masked form of _mm512_srlv_epi32: __builtin_ia32_selectd_512 chooses,
/// under mask \a __U, between _mm512_srlv_epi32(__X, __Y) and
/// _mm512_setzero_si512().
5902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5903 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
5904 {
5905   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5906                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
5907                                            (__v16si)_mm512_setzero_si512());
5908 }
5909 
/// Thin wrapper around __builtin_ia32_psrlv8di: both \a __X and \a __Y are
/// passed as __v8di; the result is returned as __m512i.
5910 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5911 _mm512_srlv_epi64 (__m512i __X, __m512i __Y)
5912 {
5913   return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
5914 }
5915 
/// Masked form of _mm512_srlv_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_srlv_epi64(__X, __Y) and \a __W.
5916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5917 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
5918 {
5919   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5920                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
5921                                             (__v8di)__W);
5922 }
5923 
/// Masked form of _mm512_srlv_epi64: __builtin_ia32_selectq_512 chooses,
/// under mask \a __U, between _mm512_srlv_epi64(__X, __Y) and
/// _mm512_setzero_si512().
5924 static __inline__ __m512i __DEFAULT_FN_ATTRS512
5925 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
5926 {
5927   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5928                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
5929                                             (__v8di)_mm512_setzero_si512());
5930 }
5931 
5932 /// \enum _MM_TERNLOG_ENUM
5933 ///    A helper to represent the ternary logic operations among vector \a A,
5934 ///    \a B and \a C. The representation is passed to \a imm.
///
///    Each constant is an 8-bit pattern: bit i of _MM_TERNLOG_A equals bit 2
///    of i, bit i of _MM_TERNLOG_B equals bit 1 of i, and bit i of
///    _MM_TERNLOG_C equals bit 0 of i (for i in 0..7).
///    NOTE(review): presumably callers combine these with bitwise operators
///    to build \a imm for the ternarylogic macros -- confirm against the
///    Intel documentation.
5935 typedef enum {
5936   _MM_TERNLOG_A = 0xF0,
5937   _MM_TERNLOG_B = 0xCC,
5938   _MM_TERNLOG_C = 0xAA
5939 } _MM_TERNLOG_ENUM;
5940 
/// Expands to __builtin_ia32_pternlogd512_mask on \a A, \a B and \a C (each
/// cast to __v16si), with \a imm truncated to unsigned char and an all-ones
/// mask, (__mmask16)-1.
5941 #define _mm512_ternarylogic_epi32(A, B, C, imm)                                \
5942   ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
5943       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
5944       (unsigned char)(imm), (__mmask16)-1))
5945 
/// Expands to __builtin_ia32_pternlogd512_mask on \a A, \a B and \a C (each
/// cast to __v16si), with \a imm truncated to unsigned char and \a U as the
/// __mmask16. Note the parameter order: \a A precedes the mask \a U.
5946 #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm)                        \
5947   ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
5948       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
5949       (unsigned char)(imm), (__mmask16)(U)))
5950 
/// Expands to __builtin_ia32_pternlogd512_maskz (the "maskz" builtin) on
/// \a A, \a B and \a C (each cast to __v16si), with \a imm truncated to
/// unsigned char and \a U as the __mmask16.
5951 #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm)                       \
5952   ((__m512i)__builtin_ia32_pternlogd512_maskz(                                 \
5953       (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
5954       (unsigned char)(imm), (__mmask16)(U)))
5955 
/// Expands to __builtin_ia32_pternlogq512_mask on \a A, \a B and \a C (each
/// cast to __v8di), with \a imm truncated to unsigned char and an all-ones
/// mask, (__mmask8)-1.
5956 #define _mm512_ternarylogic_epi64(A, B, C, imm)                                \
5957   ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
5958       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
5959       (unsigned char)(imm), (__mmask8)-1))
5960 
/// Expands to __builtin_ia32_pternlogq512_mask on \a A, \a B and \a C (each
/// cast to __v8di), with \a imm truncated to unsigned char and \a U as the
/// __mmask8. Note the parameter order: \a A precedes the mask \a U.
5961 #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm)                        \
5962   ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
5963       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
5964       (unsigned char)(imm), (__mmask8)(U)))
5965 
/// Expands to __builtin_ia32_pternlogq512_maskz (the "maskz" builtin) on
/// \a A, \a B and \a C (each cast to __v8di), with \a imm truncated to
/// unsigned char and \a U as the __mmask8.
5966 #define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm)                       \
5967   ((__m512i)__builtin_ia32_pternlogq512_maskz(                                 \
5968       (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
5969       (unsigned char)(imm), (__mmask8)(U)))
5970 
5971 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvtsd2si64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to long long. The
/// expansion is identical to that of _mm_cvt_roundsd_si64.
5972 #define _mm_cvt_roundsd_i64(A, R) \
5973   ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
5974 #endif
5975 
/// Expands to __builtin_ia32_vcvtsd2si32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to int.
5976 #define _mm_cvt_roundsd_si32(A, R) \
5977   ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
5978 
/// Expands to __builtin_ia32_vcvtsd2si32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to int. The expansion is
/// identical to that of _mm_cvt_roundsd_si32.
5979 #define _mm_cvt_roundsd_i32(A, R) \
5980   ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
5981 
/// Expands to __builtin_ia32_vcvtsd2usi32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to unsigned int.
5982 #define _mm_cvt_roundsd_u32(A, R) \
5983   ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
5984 
/// Calls __builtin_ia32_vcvtsd2usi32 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned.
5985 static __inline__ unsigned __DEFAULT_FN_ATTRS128
5986 _mm_cvtsd_u32 (__m128d __A)
5987 {
5988   return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
5989              _MM_FROUND_CUR_DIRECTION);
5990 }
5991 
5992 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvtsd2usi64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to unsigned long long.
5993 #define _mm_cvt_roundsd_u64(A, R) \
5994   ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
5995                                                    (int)(R)))
5996 
/// Calls __builtin_ia32_vcvtsd2usi64 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned long long.
5997 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
5998 _mm_cvtsd_u64 (__m128d __A)
5999 {
6000   return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
6001                  __A,
6002                  _MM_FROUND_CUR_DIRECTION);
6003 }
6004 #endif
6005 
/// Expands to __builtin_ia32_vcvtss2si32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to int.
6006 #define _mm_cvt_roundss_si32(A, R) \
6007   ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
6008 
/// Expands to __builtin_ia32_vcvtss2si32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to int. The expansion is
/// identical to that of _mm_cvt_roundss_si32.
6009 #define _mm_cvt_roundss_i32(A, R) \
6010   ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
6011 
6012 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvtss2si64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to long long.
6013 #define _mm_cvt_roundss_si64(A, R) \
6014   ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
6015 
/// Expands to __builtin_ia32_vcvtss2si64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to long long. The
/// expansion is identical to that of _mm_cvt_roundss_si64.
6016 #define _mm_cvt_roundss_i64(A, R) \
6017   ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
6018 #endif
6019 
/// Expands to __builtin_ia32_vcvtss2usi32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to unsigned int.
6020 #define _mm_cvt_roundss_u32(A, R) \
6021   ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
6022 
/// Calls __builtin_ia32_vcvtss2usi32 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned.
6023 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6024 _mm_cvtss_u32 (__m128 __A)
6025 {
6026   return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
6027              _MM_FROUND_CUR_DIRECTION);
6028 }
6029 
6030 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvtss2usi64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to unsigned long long.
6031 #define _mm_cvt_roundss_u64(A, R) \
6032   ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
6033                                                    (int)(R)))
6034 
/// Calls __builtin_ia32_vcvtss2usi64 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned long long.
6035 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6036 _mm_cvtss_u64 (__m128 __A)
6037 {
6038   return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
6039                  __A,
6040                  _MM_FROUND_CUR_DIRECTION);
6041 }
6042 #endif
6043 
/// Expands to __builtin_ia32_vcvttsd2si32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to int.
6044 #define _mm_cvtt_roundsd_i32(A, R) \
6045   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
6046 
/// Expands to __builtin_ia32_vcvttsd2si32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to int. The expansion is
/// identical to that of _mm_cvtt_roundsd_i32.
6047 #define _mm_cvtt_roundsd_si32(A, R) \
6048   ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
6049 
/// Calls __builtin_ia32_vcvttsd2si32 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as int.
6050 static __inline__ int __DEFAULT_FN_ATTRS128
6051 _mm_cvttsd_i32 (__m128d __A)
6052 {
6053   return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
6054               _MM_FROUND_CUR_DIRECTION);
6055 }
6056 
6057 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvttsd2si64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to long long.
6058 #define _mm_cvtt_roundsd_si64(A, R) \
6059   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
6060 
/// Expands to __builtin_ia32_vcvttsd2si64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to long long. The
/// expansion is identical to that of _mm_cvtt_roundsd_si64.
6061 #define _mm_cvtt_roundsd_i64(A, R) \
6062   ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
6063 
/// Calls __builtin_ia32_vcvttsd2si64 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as long long.
6064 static __inline__ long long __DEFAULT_FN_ATTRS128
6065 _mm_cvttsd_i64 (__m128d __A)
6066 {
6067   return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
6068               _MM_FROUND_CUR_DIRECTION);
6069 }
6070 #endif
6071 
/// Expands to __builtin_ia32_vcvttsd2usi32 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to unsigned int.
6072 #define _mm_cvtt_roundsd_u32(A, R) \
6073   ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
6074 
/// Calls __builtin_ia32_vcvttsd2usi32 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned.
6075 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6076 _mm_cvttsd_u32 (__m128d __A)
6077 {
6078   return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
6079               _MM_FROUND_CUR_DIRECTION);
6080 }
6081 
6082 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvttsd2usi64 on \a A (cast to __v2df) with \a R
/// as the int second argument; the result is cast to unsigned long long.
6083 #define _mm_cvtt_roundsd_u64(A, R) \
6084   ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
6085                                                     (int)(R)))
6086 
/// Calls __builtin_ia32_vcvttsd2usi64 on \a __A (as __v2df) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned long long.
6087 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6088 _mm_cvttsd_u64 (__m128d __A)
6089 {
6090   return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
6091                   __A,
6092                   _MM_FROUND_CUR_DIRECTION);
6093 }
6094 #endif
6095 
/// Expands to __builtin_ia32_vcvttss2si32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to int.
6096 #define _mm_cvtt_roundss_i32(A, R) \
6097   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
6098 
/// Expands to __builtin_ia32_vcvttss2si32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to int. The expansion is
/// identical to that of _mm_cvtt_roundss_i32.
6099 #define _mm_cvtt_roundss_si32(A, R) \
6100   ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
6101 
/// Calls __builtin_ia32_vcvttss2si32 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as int.
6102 static __inline__ int __DEFAULT_FN_ATTRS128
6103 _mm_cvttss_i32 (__m128 __A)
6104 {
6105   return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
6106               _MM_FROUND_CUR_DIRECTION);
6107 }
6108 
6109 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvttss2si64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to long long.
6110 #define _mm_cvtt_roundss_i64(A, R) \
6111   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
6112 
/// Expands to __builtin_ia32_vcvttss2si64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to long long. The
/// expansion is identical to that of _mm_cvtt_roundss_i64.
6113 #define _mm_cvtt_roundss_si64(A, R) \
6114   ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
6115 
/// Calls __builtin_ia32_vcvttss2si64 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as long long.
6116 static __inline__ long long __DEFAULT_FN_ATTRS128
6117 _mm_cvttss_i64 (__m128 __A)
6118 {
6119   return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
6120               _MM_FROUND_CUR_DIRECTION);
6121 }
6122 #endif
6123 
/// Expands to __builtin_ia32_vcvttss2usi32 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to unsigned int.
6124 #define _mm_cvtt_roundss_u32(A, R) \
6125   ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
6126 
/// Calls __builtin_ia32_vcvttss2usi32 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned.
6127 static __inline__ unsigned __DEFAULT_FN_ATTRS128
6128 _mm_cvttss_u32 (__m128 __A)
6129 {
6130   return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
6131               _MM_FROUND_CUR_DIRECTION);
6132 }
6133 
6134 #ifdef __x86_64__
/// Expands to __builtin_ia32_vcvttss2usi64 on \a A (cast to __v4sf) with \a R
/// as the int second argument; the result is cast to unsigned long long.
6135 #define _mm_cvtt_roundss_u64(A, R) \
6136   ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
6137                                                     (int)(R)))
6138 
/// Calls __builtin_ia32_vcvttss2usi64 on \a __A (as __v4sf) with
/// _MM_FROUND_CUR_DIRECTION and returns the result as unsigned long long.
6139 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
6140 _mm_cvttss_u64 (__m128 __A)
6141 {
6142   return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
6143                   __A,
6144                   _MM_FROUND_CUR_DIRECTION);
6145 }
6146 #endif
6147 
/// Expands to __builtin_ia32_vpermilpd512 on \a X (cast to __v8df) with \a C
/// as the int second argument; the result is cast to __m512d.
6148 #define _mm512_permute_pd(X, C) \
6149   ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))
6150 
/// Masked form of _mm512_permute_pd: __builtin_ia32_selectpd_512 chooses,
/// under the __mmask8 \a U, between _mm512_permute_pd(X, C) and \a W.
6151 #define _mm512_mask_permute_pd(W, U, X, C) \
6152   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
6153                                         (__v8df)_mm512_permute_pd((X), (C)), \
6154                                         (__v8df)(__m512d)(W)))
6155 
/// Masked form of _mm512_permute_pd: __builtin_ia32_selectpd_512 chooses,
/// under the __mmask8 \a U, between _mm512_permute_pd(X, C) and
/// _mm512_setzero_pd().
6156 #define _mm512_maskz_permute_pd(U, X, C) \
6157   ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
6158                                         (__v8df)_mm512_permute_pd((X), (C)), \
6159                                         (__v8df)_mm512_setzero_pd()))
6160 
/// Expands to __builtin_ia32_vpermilps512 on \a X (cast to __v16sf) with \a C
/// as the int second argument; the result is cast to __m512.
6161 #define _mm512_permute_ps(X, C) \
6162   ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))
6163 
/// Masked form of _mm512_permute_ps: __builtin_ia32_selectps_512 chooses,
/// under the __mmask16 \a U, between _mm512_permute_ps(X, C) and \a W.
6164 #define _mm512_mask_permute_ps(W, U, X, C) \
6165   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
6166                                        (__v16sf)_mm512_permute_ps((X), (C)), \
6167                                        (__v16sf)(__m512)(W)))
6168 
/// Masked form of _mm512_permute_ps: __builtin_ia32_selectps_512 chooses,
/// under the __mmask16 \a U, between _mm512_permute_ps(X, C) and
/// _mm512_setzero_ps().
6169 #define _mm512_maskz_permute_ps(U, X, C) \
6170   ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
6171                                        (__v16sf)_mm512_permute_ps((X), (C)), \
6172                                        (__v16sf)_mm512_setzero_ps()))
6173 
/// Thin wrapper around __builtin_ia32_vpermilvarpd512: \a __A is passed as
/// __v8df and \a __C as __v8di; the result is returned as __m512d.
6174 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6175 _mm512_permutevar_pd(__m512d __A, __m512i __C)
6176 {
6177   return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
6178 }
6179 
/// Masked form of _mm512_permutevar_pd: __builtin_ia32_selectpd_512 chooses,
/// under mask \a __U, between _mm512_permutevar_pd(__A, __C) and \a __W.
6180 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6181 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
6182 {
6183   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
6184                                          (__v8df)_mm512_permutevar_pd(__A, __C),
6185                                          (__v8df)__W);
6186 }
6187 
/// Masked form of _mm512_permutevar_pd: __builtin_ia32_selectpd_512 chooses,
/// under mask \a __U, between _mm512_permutevar_pd(__A, __C) and
/// _mm512_setzero_pd().
6188 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6189 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
6190 {
6191   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
6192                                          (__v8df)_mm512_permutevar_pd(__A, __C),
6193                                          (__v8df)_mm512_setzero_pd());
6194 }
6195 
/// Thin wrapper around __builtin_ia32_vpermilvarps512: \a __A is passed as
/// __v16sf and \a __C as __v16si; the result is returned as __m512.
6196 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6197 _mm512_permutevar_ps(__m512 __A, __m512i __C)
6198 {
6199   return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
6200 }
6201 
/// Masked form of _mm512_permutevar_ps: __builtin_ia32_selectps_512 chooses,
/// under mask \a __U, between _mm512_permutevar_ps(__A, __C) and \a __W.
6202 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6203 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
6204 {
6205   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
6206                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
6207                                         (__v16sf)__W);
6208 }
6209 
/// Masked form of _mm512_permutevar_ps: __builtin_ia32_selectps_512 chooses,
/// under mask \a __U, between _mm512_permutevar_ps(__A, __C) and
/// _mm512_setzero_ps().
6210 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6211 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
6212 {
6213   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
6214                                         (__v16sf)_mm512_permutevar_ps(__A, __C),
6215                                         (__v16sf)_mm512_setzero_ps());
6216 }
6217 
/// Thin wrapper around __builtin_ia32_vpermi2varpd512: \a __A and \a __B are
/// passed as __v8df (first and third operands) and \a __I as __v8di (second
/// operand); the result is returned as __m512d.
6218 static __inline __m512d __DEFAULT_FN_ATTRS512
6219 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
6220 {
6221   return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
6222                                                  (__v8df)__B);
6223 }
6224 
/// Masked form of _mm512_permutex2var_pd: __builtin_ia32_selectpd_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_pd(__A, __I, __B) and \a __A itself.
6225 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6226 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
6227 {
6228   return (__m512d)__builtin_ia32_selectpd_512(__U,
6229                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6230                                   (__v8df)__A);
6231 }
6232 
/// Masked form of _mm512_permutex2var_pd: __builtin_ia32_selectpd_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_pd(__A, __I, __B) and \a __I cast to __m512d and then
/// to __v8df.
6233 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6234 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
6235                              __m512d __B)
6236 {
6237   return (__m512d)__builtin_ia32_selectpd_512(__U,
6238                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6239                                   (__v8df)(__m512d)__I);
6240 }
6241 
/// Masked form of _mm512_permutex2var_pd: __builtin_ia32_selectpd_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_pd(__A, __I, __B) and _mm512_setzero_pd().
6242 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6243 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
6244                              __m512d __B)
6245 {
6246   return (__m512d)__builtin_ia32_selectpd_512(__U,
6247                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
6248                                   (__v8df)_mm512_setzero_pd());
6249 }
6250 
/// Thin wrapper around __builtin_ia32_vpermi2varps512: \a __A and \a __B are
/// passed as __v16sf (first and third operands) and \a __I as __v16si
/// (second operand); the result is returned as __m512.
6251 static __inline __m512 __DEFAULT_FN_ATTRS512
6252 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
6253 {
6254   return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
6255                                                 (__v16sf) __B);
6256 }
6257 
/// Masked form of _mm512_permutex2var_ps: __builtin_ia32_selectps_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_ps(__A, __I, __B) and \a __A itself.
6258 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6259 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
6260 {
6261   return (__m512)__builtin_ia32_selectps_512(__U,
6262                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6263                                  (__v16sf)__A);
6264 }
6265 
/// Masked form of _mm512_permutex2var_ps: __builtin_ia32_selectps_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_ps(__A, __I, __B) and \a __I cast to __m512 and then
/// to __v16sf.
6266 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6267 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
6268 {
6269   return (__m512)__builtin_ia32_selectps_512(__U,
6270                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6271                                  (__v16sf)(__m512)__I);
6272 }
6273 
/// Masked form of _mm512_permutex2var_ps: __builtin_ia32_selectps_512
/// chooses, under mask \a __U, between
/// _mm512_permutex2var_ps(__A, __I, __B) and _mm512_setzero_ps().
6274 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6275 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
6276 {
6277   return (__m512)__builtin_ia32_selectps_512(__U,
6278                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
6279                                  (__v16sf)_mm512_setzero_ps());
6280 }
6281 
6282 
/// Expands to __builtin_ia32_cvttpd2udq512_mask on \a A (cast to __v8df) with
/// _mm256_undefined_si256() as the second operand, an all-ones mask
/// ((__mmask8)-1), and \a R as the int final argument; the result is cast to
/// __m256i.
6283 #define _mm512_cvtt_roundpd_epu32(A, R) \
6284   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
6285                                               (__v8si)_mm256_undefined_si256(), \
6286                                               (__mmask8)-1, (int)(R)))
6287 
/// Expands to __builtin_ia32_cvttpd2udq512_mask on \a A (cast to __v8df) with
/// \a W (cast to __v8si) as the second operand, \a U as the __mmask8, and
/// \a R as the int final argument; the result is cast to __m256i.
6288 #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
6289   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
6290                                               (__v8si)(__m256i)(W), \
6291                                               (__mmask8)(U), (int)(R)))
6292 
/// Expands to __builtin_ia32_cvttpd2udq512_mask on \a A (cast to __v8df) with
/// _mm256_setzero_si256() as the second operand, \a U as the __mmask8, and
/// \a R as the int final argument; the result is cast to __m256i.
6293 #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
6294   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
6295                                               (__v8si)_mm256_setzero_si256(), \
6296                                               (__mmask8)(U), (int)(R)))
6297 
/// Calls __builtin_ia32_cvttpd2udq512_mask on \a __A (as __v8df) with
/// _mm256_undefined_si256() as the second operand, an all-ones mask
/// ((__mmask8) -1) and _MM_FROUND_CUR_DIRECTION; returns the result as
/// __m256i.
6298 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6299 _mm512_cvttpd_epu32 (__m512d __A)
6300 {
6301   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6302                   (__v8si)
6303                   _mm256_undefined_si256 (),
6304                   (__mmask8) -1,
6305                   _MM_FROUND_CUR_DIRECTION);
6306 }
6307 
/// Calls __builtin_ia32_cvttpd2udq512_mask on \a __A (as __v8df) with \a __W
/// (as __v8si) as the second operand, \a __U as the mask and
/// _MM_FROUND_CUR_DIRECTION; returns the result as __m256i.
6308 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6309 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
6310 {
6311   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6312                   (__v8si) __W,
6313                   (__mmask8) __U,
6314                   _MM_FROUND_CUR_DIRECTION);
6315 }
6316 
/// Calls __builtin_ia32_cvttpd2udq512_mask on \a __A (as __v8df) with
/// _mm256_setzero_si256() as the second operand, \a __U as the mask and
/// _MM_FROUND_CUR_DIRECTION; returns the result as __m256i.
6317 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6318 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
6319 {
6320   return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
6321                   (__v8si)
6322                   _mm256_setzero_si256 (),
6323                   (__mmask8) __U,
6324                   _MM_FROUND_CUR_DIRECTION);
6325 }
6326 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with _mm_setzero_pd() as the third operand, an all-ones
/// mask ((__mmask8)-1), \a imm as an int and \a R as the int final argument.
6327 #define _mm_roundscale_round_sd(A, B, imm, R) \
6328   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6329                                                  (__v2df)(__m128d)(B), \
6330                                                  (__v2df)_mm_setzero_pd(), \
6331                                                  (__mmask8)-1, (int)(imm), \
6332                                                  (int)(R)))
6333 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with _mm_setzero_pd() as the third operand, an all-ones
/// mask ((__mmask8)-1), \a imm as an int and _MM_FROUND_CUR_DIRECTION as the
/// final argument.
6334 #define _mm_roundscale_sd(A, B, imm) \
6335   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6336                                                  (__v2df)(__m128d)(B), \
6337                                                  (__v2df)_mm_setzero_pd(), \
6338                                                  (__mmask8)-1, (int)(imm), \
6339                                                  _MM_FROUND_CUR_DIRECTION))
6340 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with \a W as the third operand, \a U as the __mmask8,
/// \a imm as an int and _MM_FROUND_CUR_DIRECTION as the final argument.
6341 #define _mm_mask_roundscale_sd(W, U, A, B, imm) \
6342   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6343                                                  (__v2df)(__m128d)(B), \
6344                                                  (__v2df)(__m128d)(W), \
6345                                                  (__mmask8)(U), (int)(imm), \
6346                                                  _MM_FROUND_CUR_DIRECTION))
6347 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with \a W as the third operand, \a U as the __mmask8,
/// \a I as an int and \a R as the int final argument.
6348 #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
6349   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6350                                                  (__v2df)(__m128d)(B), \
6351                                                  (__v2df)(__m128d)(W), \
6352                                                  (__mmask8)(U), (int)(I), \
6353                                                  (int)(R)))
6354 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with _mm_setzero_pd() as the third operand, \a U as the
/// __mmask8, \a I as an int and _MM_FROUND_CUR_DIRECTION as the final
/// argument.
6355 #define _mm_maskz_roundscale_sd(U, A, B, I) \
6356   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6357                                                  (__v2df)(__m128d)(B), \
6358                                                  (__v2df)_mm_setzero_pd(), \
6359                                                  (__mmask8)(U), (int)(I), \
6360                                                  _MM_FROUND_CUR_DIRECTION))
6361 
/// Expands to __builtin_ia32_rndscalesd_round_mask on \a A and \a B (both
/// cast to __v2df) with _mm_setzero_pd() as the third operand, \a U as the
/// __mmask8, \a I as an int and \a R as the int final argument.
6362 #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
6363   ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
6364                                                  (__v2df)(__m128d)(B), \
6365                                                  (__v2df)_mm_setzero_pd(), \
6366                                                  (__mmask8)(U), (int)(I), \
6367                                                  (int)(R)))
6368 
/// Expands to __builtin_ia32_rndscaless_round_mask on \a A and \a B (both
/// cast to __v4sf) with _mm_setzero_ps() as the third operand, an all-ones
/// mask ((__mmask8)-1), \a imm as an int and \a R as the int final argument.
6369 #define _mm_roundscale_round_ss(A, B, imm, R) \
6370   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
6371                                                 (__v4sf)(__m128)(B), \
6372                                                 (__v4sf)_mm_setzero_ps(), \
6373                                                 (__mmask8)-1, (int)(imm), \
6374                                                 (int)(R)))
6375 
/// Expands to __builtin_ia32_rndscaless_round_mask on \a A and \a B (both
/// cast to __v4sf) with _mm_setzero_ps() as the third operand, an all-ones
/// mask ((__mmask8)-1), \a imm as an int and _MM_FROUND_CUR_DIRECTION as the
/// final argument.
6376 #define _mm_roundscale_ss(A, B, imm) \
6377   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
6378                                                 (__v4sf)(__m128)(B), \
6379                                                 (__v4sf)_mm_setzero_ps(), \
6380                                                 (__mmask8)-1, (int)(imm), \
6381                                                 _MM_FROUND_CUR_DIRECTION))
6382 
/// Expands to __builtin_ia32_rndscaless_round_mask on \a A and \a B (both
/// cast to __v4sf) with \a W as the third operand, \a U as the __mmask8,
/// \a I as an int and _MM_FROUND_CUR_DIRECTION as the final argument.
6383 #define _mm_mask_roundscale_ss(W, U, A, B, I) \
6384   ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
6385                                                 (__v4sf)(__m128)(B), \
6386                                                 (__v4sf)(__m128)(W), \
6387                                                 (__mmask8)(U), (int)(I), \
6388                                                 _MM_FROUND_CUR_DIRECTION))
6389 
/* Expands to __builtin_ia32_rndscaless_round_mask on A and B with W as the
 * third vector operand, U as the mask, I as the immediate and R as the
 * rounding argument. */
#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))
6396 
/* maskz form of the scalar-float roundscale without an explicit rounding
 * argument: same expansion as _mm_maskz_roundscale_round_ss with R fixed to
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_roundscale_ss(U, A, B, I) \
  _mm_maskz_roundscale_round_ss((U), (A), (B), (I), _MM_FROUND_CUR_DIRECTION)
6403 
/* Expands to __builtin_ia32_rndscaless_round_mask on A and B with
 * _mm_setzero_ps() as the third vector operand, U as the mask, I as the
 * immediate and R as the rounding argument. */
#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(I), (int)(R)))
6410 
/* Expands to __builtin_ia32_scalefpd512_mask on A and B with an all-ones
 * mask, _mm512_undefined_pd() as the third vector operand and R as the
 * rounding argument. */
#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, (int)(R)))
6416 
/* Expands to __builtin_ia32_scalefpd512_mask on A and B with W as the third
 * vector operand, U as the mask and R as the rounding argument. */
#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
      (__v8df)(__m512d)(W), \
      (__mmask8)(U), (int)(R)))
6422 
/* Expands to __builtin_ia32_scalefpd512_mask on A and B with
 * _mm512_setzero_pd() as the third vector operand, U as the mask and R as
 * the rounding argument. */
#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
6428 
6429 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6430 _mm512_scalef_pd (__m512d __A, __m512d __B)
6431 {
6432   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6433                 (__v8df) __B,
6434                 (__v8df)
6435                 _mm512_undefined_pd (),
6436                 (__mmask8) -1,
6437                 _MM_FROUND_CUR_DIRECTION);
6438 }
6439 
6440 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6441 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
6442 {
6443   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6444                 (__v8df) __B,
6445                 (__v8df) __W,
6446                 (__mmask8) __U,
6447                 _MM_FROUND_CUR_DIRECTION);
6448 }
6449 
6450 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6451 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
6452 {
6453   return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
6454                 (__v8df) __B,
6455                 (__v8df)
6456                 _mm512_setzero_pd (),
6457                 (__mmask8) __U,
6458                 _MM_FROUND_CUR_DIRECTION);
6459 }
6460 
/* Expands to __builtin_ia32_scalefps512_mask on A and B with an all-ones
 * mask, _mm512_undefined_ps() as the third vector operand and R as the
 * rounding argument. */
#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, (int)(R)))
6466 
/* Expands to __builtin_ia32_scalefps512_mask on A and B with W as the third
 * vector operand, U as the mask and R as the rounding argument. */
#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
      (__v16sf)(__m512)(W), \
      (__mmask16)(U), (int)(R)))
6472 
/* Expands to __builtin_ia32_scalefps512_mask on A and B with
 * _mm512_setzero_ps() as the third vector operand, U as the mask and R as
 * the rounding argument. */
#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))
6478 
6479 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6480 _mm512_scalef_ps (__m512 __A, __m512 __B)
6481 {
6482   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6483                (__v16sf) __B,
6484                (__v16sf)
6485                _mm512_undefined_ps (),
6486                (__mmask16) -1,
6487                _MM_FROUND_CUR_DIRECTION);
6488 }
6489 
6490 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6491 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
6492 {
6493   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6494                (__v16sf) __B,
6495                (__v16sf) __W,
6496                (__mmask16) __U,
6497                _MM_FROUND_CUR_DIRECTION);
6498 }
6499 
6500 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6501 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
6502 {
6503   return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6504                (__v16sf) __B,
6505                (__v16sf)
6506                _mm512_setzero_ps (),
6507                (__mmask16) __U,
6508                _MM_FROUND_CUR_DIRECTION);
6509 }
6510 
/* Expands to __builtin_ia32_scalefsd_round_mask on A and B with an all-ones
 * mask, _mm_setzero_pd() as the third vector operand and R as the rounding
 * argument. */
#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
6516 
6517 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6518 _mm_scalef_sd (__m128d __A, __m128d __B)
6519 {
6520   return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
6521               (__v2df)( __B), (__v2df) _mm_setzero_pd(),
6522               (__mmask8) -1,
6523               _MM_FROUND_CUR_DIRECTION);
6524 }
6525 
6526 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6527 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6528 {
6529  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6530                  (__v2df) __B,
6531                 (__v2df) __W,
6532                 (__mmask8) __U,
6533                 _MM_FROUND_CUR_DIRECTION);
6534 }
6535 
/* Expands to __builtin_ia32_scalefsd_round_mask on A and B with W as the
 * third vector operand, U as the mask and R as the rounding argument. */
#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
6541 
6542 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6543 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
6544 {
6545  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6546                  (__v2df) __B,
6547                 (__v2df) _mm_setzero_pd (),
6548                 (__mmask8) __U,
6549                 _MM_FROUND_CUR_DIRECTION);
6550 }
6551 
/* Expands to __builtin_ia32_scalefsd_round_mask on A and B with
 * _mm_setzero_pd() as the third vector operand, U as the mask and R as the
 * rounding argument. */
#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
6557 
/* Expands to __builtin_ia32_scalefss_round_mask on A and B with an all-ones
 * mask, _mm_setzero_ps() as the third vector operand and R as the rounding
 * argument. */
#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
6563 
6564 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6565 _mm_scalef_ss (__m128 __A, __m128 __B)
6566 {
6567   return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
6568              (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
6569              (__mmask8) -1,
6570              _MM_FROUND_CUR_DIRECTION);
6571 }
6572 
6573 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6574 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6575 {
6576  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6577                 (__v4sf) __B,
6578                 (__v4sf) __W,
6579                 (__mmask8) __U,
6580                 _MM_FROUND_CUR_DIRECTION);
6581 }
6582 
/* Expands to __builtin_ia32_scalefss_round_mask on A and B with W as the
 * third vector operand, U as the mask and R as the rounding argument. */
#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
6588 
6589 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6590 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
6591 {
6592  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6593                  (__v4sf) __B,
6594                 (__v4sf) _mm_setzero_ps (),
6595                 (__mmask8) __U,
6596                 _MM_FROUND_CUR_DIRECTION);
6597 }
6598 
/* Expands to __builtin_ia32_scalefss_round_mask on A and B with
 * _mm_setzero_ps() as the third vector operand, U as the mask and R as the
 * rounding argument. */
#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
6605 
6606 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6607 _mm512_srai_epi32(__m512i __A, unsigned int __B)
6608 {
6609   return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
6610 }
6611 
6612 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6613 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
6614                        unsigned int __B)
6615 {
6616   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6617                                          (__v16si)_mm512_srai_epi32(__A, __B),
6618                                          (__v16si)__W);
6619 }
6620 
6621 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6622 _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
6623                         unsigned int __B) {
6624   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6625                                          (__v16si)_mm512_srai_epi32(__A, __B),
6626                                          (__v16si)_mm512_setzero_si512());
6627 }
6628 
6629 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6630 _mm512_srai_epi64(__m512i __A, unsigned int __B)
6631 {
6632   return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
6633 }
6634 
6635 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6636 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
6637 {
6638   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6639                                           (__v8di)_mm512_srai_epi64(__A, __B),
6640                                           (__v8di)__W);
6641 }
6642 
6643 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6644 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
6645 {
6646   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6647                                           (__v8di)_mm512_srai_epi64(__A, __B),
6648                                           (__v8di)_mm512_setzero_si512());
6649 }
6650 
/* Expands to __builtin_ia32_shuf_f32x4 on A and B (as __v16sf) with imm
 * converted to int. */
#define _mm512_shuffle_f32x4(A, B, imm) \
  ((__m512)__builtin_ia32_shuf_f32x4( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(imm)))
6654 
/* Combines _mm512_shuffle_f32x4(A, B, imm) with W through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
      (__v16sf)(__m512)(W)))
6659 
/* Combines _mm512_shuffle_f32x4(A, B, imm) with _mm512_setzero_ps() through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
      (__v16sf)_mm512_setzero_ps()))
6664 
/* Expands to __builtin_ia32_shuf_f64x2 on A and B (as __v8df) with imm
 * converted to int. */
#define _mm512_shuffle_f64x2(A, B, imm) \
  ((__m512d)__builtin_ia32_shuf_f64x2( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(imm)))
6668 
/* Combines _mm512_shuffle_f64x2(A, B, imm) with W through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
      (__v8df)(__m512d)(W)))
6673 
/* Combines _mm512_shuffle_f64x2(A, B, imm) with _mm512_setzero_pd() through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
      (__v8df)_mm512_setzero_pd()))
6678 
/* Expands to __builtin_ia32_shuf_i32x4 on A and B (as __v16si) with imm
 * converted to int. */
#define _mm512_shuffle_i32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i32x4( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (int)(imm)))
6682 
/* Combines _mm512_shuffle_i32x4(A, B, imm) with W through
 * __builtin_ia32_selectd_512 under mask U. */
#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
      (__v16si)(__m512i)(W)))
6687 
/* Combines _mm512_shuffle_i32x4(A, B, imm) with _mm512_setzero_si512()
 * through __builtin_ia32_selectd_512 under mask U. */
#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
      (__v16si)_mm512_setzero_si512()))
6692 
/* Expands to __builtin_ia32_shuf_i64x2 on A and B (as __v8di) with imm
 * converted to int. */
#define _mm512_shuffle_i64x2(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i64x2( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(imm)))
6696 
/* Combines _mm512_shuffle_i64x2(A, B, imm) with W through
 * __builtin_ia32_selectq_512 under mask U. */
#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
      (__v8di)(__m512i)(W)))
6701 
/* Combines _mm512_shuffle_i64x2(A, B, imm) with _mm512_setzero_si512()
 * through __builtin_ia32_selectq_512 under mask U. */
#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
      (__v8di)_mm512_setzero_si512()))
6706 
/* Expands to __builtin_ia32_shufpd512 on A and B (as __v8df) with M
 * converted to int. */
#define _mm512_shuffle_pd(A, B, M) \
  ((__m512d)__builtin_ia32_shufpd512( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(M)))
6710 
/* Combines _mm512_shuffle_pd(A, B, M) with W through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
      (__v8df)(__m512d)(W)))
6715 
/* Combines _mm512_shuffle_pd(A, B, M) with _mm512_setzero_pd() through
 * __builtin_ia32_selectpd_512 under mask U. */
#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
      (__v8df)_mm512_setzero_pd()))
6720 
/* Expands to __builtin_ia32_shufps512 on A and B (as __v16sf) with M
 * converted to int. */
#define _mm512_shuffle_ps(A, B, M) \
  ((__m512)__builtin_ia32_shufps512( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(M)))
6724 
/* Combines _mm512_shuffle_ps(A, B, M) with W through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
      (__v16sf)(__m512)(W)))
6729 
/* Combines _mm512_shuffle_ps(A, B, M) with _mm512_setzero_ps() through
 * __builtin_ia32_selectps_512 under mask U. */
#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
      (__v16sf)_mm512_setzero_ps()))
6734 
/* Expands to __builtin_ia32_sqrtsd_round_mask on A and B with an all-ones
 * mask, _mm_setzero_pd() as the third vector operand and R as the rounding
 * argument. */
#define _mm_sqrt_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))
6740 
6741 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6742 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6743 {
6744  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6745                  (__v2df) __B,
6746                 (__v2df) __W,
6747                 (__mmask8) __U,
6748                 _MM_FROUND_CUR_DIRECTION);
6749 }
6750 
/* Expands to __builtin_ia32_sqrtsd_round_mask on A and B with W as the third
 * vector operand, U as the mask and R as the rounding argument. */
#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(R)))
6756 
6757 static __inline__ __m128d __DEFAULT_FN_ATTRS128
6758 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
6759 {
6760  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6761                  (__v2df) __B,
6762                 (__v2df) _mm_setzero_pd (),
6763                 (__mmask8) __U,
6764                 _MM_FROUND_CUR_DIRECTION);
6765 }
6766 
/* Expands to __builtin_ia32_sqrtsd_round_mask on A and B with
 * _mm_setzero_pd() as the third vector operand, U as the mask and R as the
 * rounding argument. */
#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
6772 
/* Expands to __builtin_ia32_sqrtss_round_mask on A and B with an all-ones
 * mask, _mm_setzero_ps() as the third vector operand and R as the rounding
 * argument. */
#define _mm_sqrt_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))
6778 
6779 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6780 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6781 {
6782  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6783                  (__v4sf) __B,
6784                 (__v4sf) __W,
6785                 (__mmask8) __U,
6786                 _MM_FROUND_CUR_DIRECTION);
6787 }
6788 
/* Expands to __builtin_ia32_sqrtss_round_mask on A and B with W as the third
 * vector operand, U as the mask and R as the rounding argument. */
#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))
6794 
6795 static __inline__ __m128 __DEFAULT_FN_ATTRS128
6796 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
6797 {
6798  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6799                  (__v4sf) __B,
6800                 (__v4sf) _mm_setzero_ps (),
6801                 (__mmask8) __U,
6802                 _MM_FROUND_CUR_DIRECTION);
6803 }
6804 
/* Expands to __builtin_ia32_sqrtss_round_mask on A and B with
 * _mm_setzero_ps() as the third vector operand, U as the mask and R as the
 * rounding argument. */
#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
6810 
6811 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6812 _mm512_broadcast_f32x4(__m128 __A)
6813 {
6814   return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
6815                                          0, 1, 2, 3, 0, 1, 2, 3,
6816                                          0, 1, 2, 3, 0, 1, 2, 3);
6817 }
6818 
6819 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6820 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
6821 {
6822   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6823                                            (__v16sf)_mm512_broadcast_f32x4(__A),
6824                                            (__v16sf)__O);
6825 }
6826 
6827 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6828 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
6829 {
6830   return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6831                                            (__v16sf)_mm512_broadcast_f32x4(__A),
6832                                            (__v16sf)_mm512_setzero_ps());
6833 }
6834 
6835 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6836 _mm512_broadcast_f64x4(__m256d __A)
6837 {
6838   return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
6839                                           0, 1, 2, 3, 0, 1, 2, 3);
6840 }
6841 
6842 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6843 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
6844 {
6845   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6846                                             (__v8df)_mm512_broadcast_f64x4(__A),
6847                                             (__v8df)__O);
6848 }
6849 
6850 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6851 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
6852 {
6853   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6854                                             (__v8df)_mm512_broadcast_f64x4(__A),
6855                                             (__v8df)_mm512_setzero_pd());
6856 }
6857 
6858 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6859 _mm512_broadcast_i32x4(__m128i __A)
6860 {
6861   return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
6862                                           0, 1, 2, 3, 0, 1, 2, 3,
6863                                           0, 1, 2, 3, 0, 1, 2, 3);
6864 }
6865 
6866 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6867 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
6868 {
6869   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6870                                            (__v16si)_mm512_broadcast_i32x4(__A),
6871                                            (__v16si)__O);
6872 }
6873 
6874 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6875 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
6876 {
6877   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6878                                            (__v16si)_mm512_broadcast_i32x4(__A),
6879                                            (__v16si)_mm512_setzero_si512());
6880 }
6881 
6882 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6883 _mm512_broadcast_i64x4(__m256i __A)
6884 {
6885   return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
6886                                           0, 1, 2, 3, 0, 1, 2, 3);
6887 }
6888 
6889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6890 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
6891 {
6892   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6893                                             (__v8di)_mm512_broadcast_i64x4(__A),
6894                                             (__v8di)__O);
6895 }
6896 
6897 static __inline__ __m512i __DEFAULT_FN_ATTRS512
6898 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
6899 {
6900   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6901                                             (__v8di)_mm512_broadcast_i64x4(__A),
6902                                             (__v8di)_mm512_setzero_si512());
6903 }
6904 
6905 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6906 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
6907 {
6908   return (__m512d)__builtin_ia32_selectpd_512(__M,
6909                                               (__v8df) _mm512_broadcastsd_pd(__A),
6910                                               (__v8df) __O);
6911 }
6912 
6913 static __inline__ __m512d __DEFAULT_FN_ATTRS512
6914 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
6915 {
6916   return (__m512d)__builtin_ia32_selectpd_512(__M,
6917                                               (__v8df) _mm512_broadcastsd_pd(__A),
6918                                               (__v8df) _mm512_setzero_pd());
6919 }
6920 
6921 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6922 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
6923 {
6924   return (__m512)__builtin_ia32_selectps_512(__M,
6925                                              (__v16sf) _mm512_broadcastss_ps(__A),
6926                                              (__v16sf) __O);
6927 }
6928 
6929 static __inline__ __m512 __DEFAULT_FN_ATTRS512
6930 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
6931 {
6932   return (__m512)__builtin_ia32_selectps_512(__M,
6933                                              (__v16sf) _mm512_broadcastss_ps(__A),
6934                                              (__v16sf) _mm512_setzero_ps());
6935 }
6936 
6937 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6938 _mm512_cvtsepi32_epi8 (__m512i __A)
6939 {
6940   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6941                (__v16qi) _mm_undefined_si128 (),
6942                (__mmask16) -1);
6943 }
6944 
6945 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6946 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
6947 {
6948   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6949                (__v16qi) __O, __M);
6950 }
6951 
6952 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6953 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
6954 {
6955   return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6956                (__v16qi) _mm_setzero_si128 (),
6957                __M);
6958 }
6959 
6960 static __inline__ void __DEFAULT_FN_ATTRS512
6961 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
6962 {
6963   __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
6964 }
6965 
6966 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6967 _mm512_cvtsepi32_epi16 (__m512i __A)
6968 {
6969   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6970                (__v16hi) _mm256_undefined_si256 (),
6971                (__mmask16) -1);
6972 }
6973 
6974 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6975 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
6976 {
6977   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6978                (__v16hi) __O, __M);
6979 }
6980 
6981 static __inline__ __m256i __DEFAULT_FN_ATTRS512
6982 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
6983 {
6984   return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6985                (__v16hi) _mm256_setzero_si256 (),
6986                __M);
6987 }
6988 
6989 static __inline__ void __DEFAULT_FN_ATTRS512
6990 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
6991 {
6992   __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
6993 }
6994 
6995 static __inline__ __m128i __DEFAULT_FN_ATTRS512
6996 _mm512_cvtsepi64_epi8 (__m512i __A)
6997 {
6998   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
6999                (__v16qi) _mm_undefined_si128 (),
7000                (__mmask8) -1);
7001 }
7002 
7003 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7004 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7005 {
7006   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7007                (__v16qi) __O, __M);
7008 }
7009 
7010 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7011 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
7012 {
7013   return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7014                (__v16qi) _mm_setzero_si128 (),
7015                __M);
7016 }
7017 
7018 static __inline__ void __DEFAULT_FN_ATTRS512
7019 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7020 {
7021   __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7022 }
7023 
7024 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7025 _mm512_cvtsepi64_epi32 (__m512i __A)
7026 {
7027   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7028                (__v8si) _mm256_undefined_si256 (),
7029                (__mmask8) -1);
7030 }
7031 
7032 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7033 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7034 {
7035   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7036                (__v8si) __O, __M);
7037 }
7038 
7039 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7040 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
7041 {
7042   return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7043                (__v8si) _mm256_setzero_si256 (),
7044                __M);
7045 }
7046 
7047 static __inline__ void __DEFAULT_FN_ATTRS512
7048 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
7049 {
7050   __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7051 }
7052 
7053 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7054 _mm512_cvtsepi64_epi16 (__m512i __A)
7055 {
7056   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7057                (__v8hi) _mm_undefined_si128 (),
7058                (__mmask8) -1);
7059 }
7060 
7061 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7062 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7063 {
7064   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7065                (__v8hi) __O, __M);
7066 }
7067 
7068 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7069 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
7070 {
7071   return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7072                (__v8hi) _mm_setzero_si128 (),
7073                __M);
7074 }
7075 
7076 static __inline__ void __DEFAULT_FN_ATTRS512
7077 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
7078 {
7079   __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7080 }
7081 
7082 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7083 _mm512_cvtusepi32_epi8 (__m512i __A)
7084 {
7085   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7086                 (__v16qi) _mm_undefined_si128 (),
7087                 (__mmask16) -1);
7088 }
7089 
7090 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7091 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7092 {
7093   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7094                 (__v16qi) __O,
7095                 __M);
7096 }
7097 
7098 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7099 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
7100 {
7101   return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7102                 (__v16qi) _mm_setzero_si128 (),
7103                 __M);
7104 }
7105 
7106 static __inline__ void __DEFAULT_FN_ATTRS512
7107 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7108 {
7109   __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7110 }
7111 
7112 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7113 _mm512_cvtusepi32_epi16 (__m512i __A)
7114 {
7115   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7116                 (__v16hi) _mm256_undefined_si256 (),
7117                 (__mmask16) -1);
7118 }
7119 
7120 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7121 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7122 {
7123   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7124                 (__v16hi) __O,
7125                 __M);
7126 }
7127 
7128 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7129 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
7130 {
7131   return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7132                 (__v16hi) _mm256_setzero_si256 (),
7133                 __M);
7134 }
7135 
7136 static __inline__ void __DEFAULT_FN_ATTRS512
7137 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
7138 {
7139   __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
7140 }
7141 
7142 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7143 _mm512_cvtusepi64_epi8 (__m512i __A)
7144 {
7145   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7146                 (__v16qi) _mm_undefined_si128 (),
7147                 (__mmask8) -1);
7148 }
7149 
7150 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7151 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7152 {
7153   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7154                 (__v16qi) __O,
7155                 __M);
7156 }
7157 
7158 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7159 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
7160 {
7161   return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7162                 (__v16qi) _mm_setzero_si128 (),
7163                 __M);
7164 }
7165 
7166 static __inline__ void __DEFAULT_FN_ATTRS512
7167 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7168 {
7169   __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7170 }
7171 
7172 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7173 _mm512_cvtusepi64_epi32 (__m512i __A)
7174 {
7175   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7176                 (__v8si) _mm256_undefined_si256 (),
7177                 (__mmask8) -1);
7178 }
7179 
7180 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7181 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7182 {
7183   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7184                 (__v8si) __O, __M);
7185 }
7186 
7187 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7188 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
7189 {
7190   return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7191                 (__v8si) _mm256_setzero_si256 (),
7192                 __M);
7193 }
7194 
7195 static __inline__ void __DEFAULT_FN_ATTRS512
7196 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7197 {
7198   __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
7199 }
7200 
7201 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7202 _mm512_cvtusepi64_epi16 (__m512i __A)
7203 {
7204   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7205                 (__v8hi) _mm_undefined_si128 (),
7206                 (__mmask8) -1);
7207 }
7208 
7209 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7210 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7211 {
7212   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7213                 (__v8hi) __O, __M);
7214 }
7215 
7216 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7217 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
7218 {
7219   return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7220                 (__v8hi) _mm_setzero_si128 (),
7221                 __M);
7222 }
7223 
7224 static __inline__ void __DEFAULT_FN_ATTRS512
7225 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7226 {
7227   __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
7228 }
7229 
7230 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7231 _mm512_cvtepi32_epi8 (__m512i __A)
7232 {
7233   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7234               (__v16qi) _mm_undefined_si128 (),
7235               (__mmask16) -1);
7236 }
7237 
7238 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7239 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7240 {
7241   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7242               (__v16qi) __O, __M);
7243 }
7244 
7245 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7246 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
7247 {
7248   return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7249               (__v16qi) _mm_setzero_si128 (),
7250               __M);
7251 }
7252 
7253 static __inline__ void __DEFAULT_FN_ATTRS512
7254 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7255 {
7256   __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7257 }
7258 
7259 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7260 _mm512_cvtepi32_epi16 (__m512i __A)
7261 {
7262   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7263               (__v16hi) _mm256_undefined_si256 (),
7264               (__mmask16) -1);
7265 }
7266 
7267 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7268 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7269 {
7270   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7271               (__v16hi) __O, __M);
7272 }
7273 
7274 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7275 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
7276 {
7277   return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7278               (__v16hi) _mm256_setzero_si256 (),
7279               __M);
7280 }
7281 
7282 static __inline__ void __DEFAULT_FN_ATTRS512
7283 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
7284 {
7285   __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
7286 }
7287 
7288 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7289 _mm512_cvtepi64_epi8 (__m512i __A)
7290 {
7291   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7292               (__v16qi) _mm_undefined_si128 (),
7293               (__mmask8) -1);
7294 }
7295 
7296 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7297 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7298 {
7299   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7300               (__v16qi) __O, __M);
7301 }
7302 
7303 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7304 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
7305 {
7306   return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7307               (__v16qi) _mm_setzero_si128 (),
7308               __M);
7309 }
7310 
7311 static __inline__ void __DEFAULT_FN_ATTRS512
7312 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7313 {
7314   __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7315 }
7316 
7317 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7318 _mm512_cvtepi64_epi32 (__m512i __A)
7319 {
7320   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7321               (__v8si) _mm256_undefined_si256 (),
7322               (__mmask8) -1);
7323 }
7324 
7325 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7326 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7327 {
7328   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7329               (__v8si) __O, __M);
7330 }
7331 
7332 static __inline__ __m256i __DEFAULT_FN_ATTRS512
7333 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
7334 {
7335   return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7336               (__v8si) _mm256_setzero_si256 (),
7337               __M);
7338 }
7339 
7340 static __inline__ void __DEFAULT_FN_ATTRS512
7341 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7342 {
7343   __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7344 }
7345 
7346 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7347 _mm512_cvtepi64_epi16 (__m512i __A)
7348 {
7349   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7350               (__v8hi) _mm_undefined_si128 (),
7351               (__mmask8) -1);
7352 }
7353 
7354 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7355 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7356 {
7357   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7358               (__v8hi) __O, __M);
7359 }
7360 
7361 static __inline__ __m128i __DEFAULT_FN_ATTRS512
7362 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
7363 {
7364   return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7365               (__v8hi) _mm_setzero_si128 (),
7366               __M);
7367 }
7368 
7369 static __inline__ void __DEFAULT_FN_ATTRS512
7370 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7371 {
7372   __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7373 }
7374 
/* 128-bit extraction from a 512-bit integer vector viewed as __v16si.  All
 * three forms expand to __builtin_ia32_extracti32x4_mask(A, imm, passthru,
 * mask) and differ only in the last two operands:
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask
 * imm is cast to int; presumably it must be a compile-time constant that
 * selects the 128-bit chunk - confirm against the builtin's definition. */
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), (int)(imm), \
      (__v4si)_mm_undefined_si128(), (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), (int)(imm), \
      (__v4si)(__m128i)(W), (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask( \
      (__v16si)(__m512i)(A), (int)(imm), \
      (__v4si)_mm_setzero_si128(), (__mmask8)(U)))
7389 
/* 256-bit extraction from a 512-bit integer vector viewed as __v8di.  All
 * three forms expand to __builtin_ia32_extracti64x4_mask(A, imm, passthru,
 * mask) and differ only in the last two operands:
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask
 * imm is cast to int; presumably it must be a compile-time constant that
 * selects the 256-bit half - confirm against the builtin's definition. */
#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), (int)(imm), \
      (__v4di)_mm256_undefined_si256(), (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), (int)(imm), \
      (__v4di)(__m256i)(W), (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask( \
      (__v8di)(__m512i)(A), (int)(imm), \
      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)))
7404 
/* Insert the 256-bit double vector B into the 512-bit double vector A at
 * the position given by imm, via __builtin_ia32_insertf64x4.  The masked
 * forms compute the unmasked insert first and then pick per lane with
 * __builtin_ia32_selectpd_512(U, inserted, fallback), where the fallback is
 * W (mask) or a zero vector (maskz). */
#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4( \
      (__v8df)(__m512d)(A), (__v4df)(__m256d)(B), (int)(imm)))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
      (__v8df)(__m512d)(W)))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
      (__v8df)_mm512_setzero_pd()))
7418 
/* Insert the 256-bit integer vector B into the 512-bit integer vector A at
 * the position given by imm, via __builtin_ia32_inserti64x4.  The masked
 * forms compute the unmasked insert first and then pick per lane with
 * __builtin_ia32_selectq_512(U, inserted, fallback), where the fallback is
 * W (mask) or a zero vector (maskz). */
#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4( \
      (__v8di)(__m512i)(A), (__v4di)(__m256i)(B), (int)(imm)))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
      (__v8di)(__m512i)(W)))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
      (__v8di)_mm512_setzero_si512()))
7432 
/* Insert the 128-bit float vector B into the 512-bit float vector A at the
 * position given by imm, via __builtin_ia32_insertf32x4.  The masked forms
 * compute the unmasked insert first and then pick per lane with
 * __builtin_ia32_selectps_512(U, inserted, fallback), where the fallback is
 * W (mask) or a zero vector (maskz). */
#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4( \
      (__v16sf)(__m512)(A), (__v4sf)(__m128)(B), (int)(imm)))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512( \
      (__mmask16)(U), \
      (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
      (__v16sf)_mm512_setzero_ps()))
7446 
/* Insert the 128-bit integer vector B into the 512-bit integer vector A at
 * the position given by imm, via __builtin_ia32_inserti32x4.  The masked
 * forms compute the unmasked insert first and then pick per lane with
 * __builtin_ia32_selectd_512(U, inserted, fallback), where the fallback is
 * W (mask) or a zero vector (maskz). */
#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4( \
      (__v16si)(__m512i)(A), (__v4si)(__m128i)(B), (int)(imm)))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
      (__v16si)(__m512i)(W)))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
      (__v16si)_mm512_setzero_si512()))
7460 
/* getmant on 8 doubles with an explicit rounding/exception argument R.
 * B and C are folded into one int immediate as ((C) << 2) | (B), i.e. B sits
 * in the low two bits and C above it.  Each form expands to
 * __builtin_ia32_getmantpd512_mask(A, imm, passthru, mask, R):
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask */
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
7478 
/* getmant on 8 doubles using _MM_FROUND_CUR_DIRECTION as the final builtin
 * argument.  B and C are folded into one int immediate as ((C) << 2) | (B).
 * Each form expands to __builtin_ia32_getmantpd512_mask(A, imm, passthru,
 * mask, _MM_FROUND_CUR_DIRECTION):
 *   plain : zero pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask
 * NOTE(review): the plain form passes a zero vector as pass-through, while
 * the plain _round_pd and _ps forms pass an undefined vector; with an
 * all-ones mask this presumably has no visible effect - confirm. */
#define _mm512_getmant_pd(A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)(__m512d)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask( \
      (__v8df)(__m512d)(A), (int)(((C)<<2) | (B)), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
7499 
/* getmant on 16 floats with an explicit rounding/exception argument R.
 * B and C are folded into one int immediate as ((C) << 2) | (B), i.e. B sits
 * in the low two bits and C above it.  Each form expands to
 * __builtin_ia32_getmantps512_mask(A, imm, passthru, mask, R):
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask */
#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2) | (B)), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
7517 
/* getmant on 16 floats using _MM_FROUND_CUR_DIRECTION as the final builtin
 * argument.  B and C are folded into one int immediate as ((C) << 2) | (B).
 * Each form expands to __builtin_ia32_getmantps512_mask(A, imm, passthru,
 * mask, _MM_FROUND_CUR_DIRECTION):
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask */
#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2)|(B)), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2)|(B)), \
      (__v16sf)(__m512)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask( \
      (__v16sf)(__m512)(A), (int)(((C)<<2)|(B)), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))
7538 
/* getexp on 8 doubles with an explicit rounding/exception argument R.  Each
 * form expands to __builtin_ia32_getexppd512_mask(A, passthru, mask, R):
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask */
#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask( \
      (__v8df)(__m512d)(A), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
7553 
7554 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7555 _mm512_getexp_pd (__m512d __A)
7556 {
7557   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7558                 (__v8df) _mm512_undefined_pd (),
7559                 (__mmask8) -1,
7560                 _MM_FROUND_CUR_DIRECTION);
7561 }
7562 
7563 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7564 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
7565 {
7566   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7567                 (__v8df) __W,
7568                 (__mmask8) __U,
7569                 _MM_FROUND_CUR_DIRECTION);
7570 }
7571 
7572 static __inline__ __m512d __DEFAULT_FN_ATTRS512
7573 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
7574 {
7575   return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
7576                 (__v8df) _mm512_setzero_pd (),
7577                 (__mmask8) __U,
7578                 _MM_FROUND_CUR_DIRECTION);
7579 }
7580 
/* getexp on 16 floats with an explicit rounding/exception argument R.  Each
 * form expands to __builtin_ia32_getexpps512_mask(A, passthru, mask, R):
 *   plain : undefined pass-through, all-ones mask
 *   mask  : W as pass-through, U as mask
 *   maskz : zero pass-through, U as mask */
#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask( \
      (__v16sf)(__m512)(A), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
7595 
7596 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7597 _mm512_getexp_ps (__m512 __A)
7598 {
7599   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7600                (__v16sf) _mm512_undefined_ps (),
7601                (__mmask16) -1,
7602                _MM_FROUND_CUR_DIRECTION);
7603 }
7604 
7605 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7606 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
7607 {
7608   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7609                (__v16sf) __W,
7610                (__mmask16) __U,
7611                _MM_FROUND_CUR_DIRECTION);
7612 }
7613 
7614 static __inline__ __m512 __DEFAULT_FN_ATTRS512
7615 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
7616 {
7617   return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
7618                (__v16sf) _mm512_setzero_ps (),
7619                (__mmask16) __U,
7620                _MM_FROUND_CUR_DIRECTION);
7621 }
7622 
/* Gather floats using 64-bit indices.  Both forms expand to
 * __builtin_ia32_gatherdiv16sf(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v8di and scale cast to int.  The
 * plain form passes an undefined 256-bit pass-through and an all-ones mask;
 * the mask form passes v1_old and mask. */
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf( \
      (__v8sf)_mm256_undefined_ps(), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf( \
      (__v8sf)(__m256)(v1_old), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)(mask), (int)(scale)))
7634 
/* Gather 32-bit integers using 64-bit indices.  Both forms expand to
 * __builtin_ia32_gatherdiv16si(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v8di and scale cast to int.  The
 * plain form passes an undefined 256-bit pass-through and an all-ones mask;
 * the mask form passes v1_old and mask. */
#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si( \
      (__v8si)_mm256_undefined_si256(), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si( \
      (__v8si)(__m256i)(v1_old), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)(mask), (int)(scale)))
7646 
/* Gather doubles using 64-bit indices.  Both forms expand to
 * __builtin_ia32_gatherdiv8df(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v8di and scale cast to int.  The
 * plain form passes an undefined 512-bit pass-through and an all-ones mask;
 * the mask form passes v1_old and mask. */
#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df( \
      (__v8df)_mm512_undefined_pd(), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df( \
      (__v8df)(__m512d)(v1_old), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)(mask), (int)(scale)))
7658 
/* Gather 64-bit integers using 64-bit indices.  Both forms expand to
 * __builtin_ia32_gatherdiv8di(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v8di and scale cast to int.  The
 * plain form passes an undefined 512-bit pass-through (obtained from
 * _mm512_undefined_epi32 and viewed as __v8di) and an all-ones mask; the
 * mask form passes v1_old and mask. */
#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di( \
      (__v8di)_mm512_undefined_epi32(), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di( \
      (__v8di)(__m512i)(v1_old), \
      (void const *)(addr), (__v8di)(__m512i)(index), \
      (__mmask8)(mask), (int)(scale)))
7670 
/* Gather floats using 32-bit indices.  Both forms expand to
 * __builtin_ia32_gathersiv16sf(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v16si and scale cast to int.  The
 * plain form passes an undefined 512-bit pass-through and an all-ones mask;
 * the mask form passes v1_old and mask.
 *
 * The index operand is an integer vector, so it is routed through __m512i
 * (not the float type __m512) before being viewed as __v16si, matching the
 * other 32-bit-index gather and scatter macros.  Both are same-size vector
 * casts, so existing callers are unaffected. */
#define _mm512_i32gather_ps(index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf( \
      (__v16sf)_mm512_undefined_ps(), \
      (void const *)(addr), (__v16si)(__m512i)(index), \
      (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf( \
      (__v16sf)(__m512)(v1_old), \
      (void const *)(addr), (__v16si)(__m512i)(index), \
      (__mmask16)(mask), (int)(scale)))
7682 
/* Gather 32-bit integers using 32-bit indices.  Both forms expand to
 * __builtin_ia32_gathersiv16si(passthru, addr, index, mask, scale) with addr
 * cast to void const *, index viewed as __v16si and scale cast to int.  The
 * plain form passes an undefined 512-bit pass-through and an all-ones mask;
 * the mask form passes v1_old and mask. */
#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si( \
      (__v16si)_mm512_undefined_epi32(), \
      (void const *)(addr), (__v16si)(__m512i)(index), \
      (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si( \
      (__v16si)(__m512i)(v1_old), \
      (void const *)(addr), (__v16si)(__m512i)(index), \
      (__mmask16)(mask), (int)(scale)))
7694 
/* Gather doubles using 32-bit indices held in a 256-bit vector.  Both forms
 * expand to __builtin_ia32_gathersiv8df(passthru, addr, index, mask, scale)
 * with addr cast to void const *, index viewed as __v8si and scale cast to
 * int.  The plain form passes an undefined 512-bit pass-through and an
 * all-ones mask; the mask form passes v1_old and mask. */
#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df( \
      (__v8df)_mm512_undefined_pd(), \
      (void const *)(addr), (__v8si)(__m256i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df( \
      (__v8df)(__m512d)(v1_old), \
      (void const *)(addr), (__v8si)(__m256i)(index), \
      (__mmask8)(mask), (int)(scale)))
7706 
/* Gather 64-bit integers using 32-bit indices held in a 256-bit vector.
 * Both forms expand to __builtin_ia32_gathersiv8di(passthru, addr, index,
 * mask, scale) with addr cast to void const *, index viewed as __v8si and
 * scale cast to int.  The plain form passes an undefined 512-bit
 * pass-through (from _mm512_undefined_epi32, viewed as __v8di) and an
 * all-ones mask; the mask form passes v1_old and mask. */
#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di( \
      (__v8di)_mm512_undefined_epi32(), \
      (void const *)(addr), (__v8si)(__m256i)(index), \
      (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di( \
      (__v8di)(__m512i)(v1_old), \
      (void const *)(addr), (__v8si)(__m256i)(index), \
      (__mmask8)(mask), (int)(scale)))
7718 
/* Scatter floats using 64-bit indices.  Both forms expand to
 * __builtin_ia32_scatterdiv16sf(addr, mask, index, v1, scale) with addr cast
 * to void *, index viewed as __v8di, v1 as __v8sf and scale cast to int.
 * The plain form uses an all-ones mask; the mask form uses mask. */
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf( \
      (void *)(addr), (__mmask8)-1, \
      (__v8di)(__m512i)(index), (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf( \
      (void *)(addr), (__mmask8)(mask), \
      (__v8di)(__m512i)(index), (__v8sf)(__m256)(v1), (int)(scale))
7728 
/* Scatter 32-bit integers using 64-bit indices.  Both forms expand to
 * __builtin_ia32_scatterdiv16si(addr, mask, index, v1, scale) with addr cast
 * to void *, index viewed as __v8di, v1 as __v8si and scale cast to int.
 * The plain form uses an all-ones mask; the mask form uses mask. */
#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si( \
      (void *)(addr), (__mmask8)-1, \
      (__v8di)(__m512i)(index), (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si( \
      (void *)(addr), (__mmask8)(mask), \
      (__v8di)(__m512i)(index), (__v8si)(__m256i)(v1), (int)(scale))
7738 
/* Scatter doubles using 64-bit indices.  Both forms expand to
 * __builtin_ia32_scatterdiv8df(addr, mask, index, v1, scale) with addr cast
 * to void *, index viewed as __v8di, v1 as __v8df and scale cast to int.
 * The plain form uses an all-ones mask; the mask form uses mask. */
#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df( \
      (void *)(addr), (__mmask8)-1, \
      (__v8di)(__m512i)(index), (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df( \
      (void *)(addr), (__mmask8)(mask), \
      (__v8di)(__m512i)(index), (__v8df)(__m512d)(v1), (int)(scale))
7748 
/* Scatter 64-bit integers using 64-bit indices.  Both forms expand to
 * __builtin_ia32_scatterdiv8di(addr, mask, index, v1, scale) with addr cast
 * to void *, index and v1 viewed as __v8di and scale cast to int.
 * The plain form uses an all-ones mask; the mask form uses mask. */
#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di( \
      (void *)(addr), (__mmask8)-1, \
      (__v8di)(__m512i)(index), (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di( \
      (void *)(addr), (__mmask8)(mask), \
      (__v8di)(__m512i)(index), (__v8di)(__m512i)(v1), (int)(scale))
7758 
/* Scatter floats using 32-bit indices.  Both forms expand to
 * __builtin_ia32_scattersiv16sf(addr, mask, index, v1, scale) with addr cast
 * to void *, index viewed as __v16si, v1 as __v16sf and scale cast to int.
 * The plain form uses an all-ones mask; the mask form uses mask. */
#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf( \
      (void *)(addr), (__mmask16)-1, \
      (__v16si)(__m512i)(index), (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf( \
      (void *)(addr), (__mmask16)(mask), \
      (__v16si)(__m512i)(index), (__v16sf)(__m512)(v1), (int)(scale))
7768 
/* Unmasked form: expands to __builtin_ia32_scattersiv16si with an all-ones
 * __mmask16; both the index and the value vector are passed as __v16si. */
#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si( \
      (void *)(addr), (__mmask16)-1, (__v16si)(__m512i)(index), \
      (__v16si)(__m512i)(v1), (int)(scale))
7773 
/* Masked form: expands to __builtin_ia32_scattersiv16si with `mask` cast to
 * __mmask16; both the index and the value vector are passed as __v16si. */
#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si( \
      (void *)(addr), (__mmask16)(mask), (__v16si)(__m512i)(index), \
      (__v16si)(__m512i)(v1), (int)(scale))
7778 
/* Unmasked form: expands to __builtin_ia32_scattersiv8df with an all-ones
 * __mmask8. The index is a 256-bit vector (__v8si); the values are __v8df. */
#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df( \
      (void *)(addr), (__mmask8)-1, (__v8si)(__m256i)(index), \
      (__v8df)(__m512d)(v1), (int)(scale))
7783 
/* Masked form: expands to __builtin_ia32_scattersiv8df with `mask` cast to
 * __mmask8. The index is a 256-bit vector (__v8si); the values are __v8df. */
#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df( \
      (void *)(addr), (__mmask8)(mask), (__v8si)(__m256i)(index), \
      (__v8df)(__m512d)(v1), (int)(scale))
7788 
/* Unmasked form: expands to __builtin_ia32_scattersiv8di with an all-ones
 * __mmask8. The index is a 256-bit vector (__v8si); the values are __v8di. */
#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di( \
      (void *)(addr), (__mmask8)-1, (__v8si)(__m256i)(index), \
      (__v8di)(__m512i)(v1), (int)(scale))
7793 
/* Masked form: expands to __builtin_ia32_scattersiv8di with `mask` cast to
 * __mmask8. The index is a 256-bit vector (__v8si); the values are __v8di. */
#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di( \
      (void *)(addr), (__mmask8)(mask), (__v8si)(__m256i)(index), \
      (__v8di)(__m512i)(v1), (int)(scale))
7798 
7799 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7800 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7801 {
7802   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7803                                        (__v4sf)__A,
7804                                        (__v4sf)__B,
7805                                        (__mmask8)__U,
7806                                        _MM_FROUND_CUR_DIRECTION);
7807 }
7808 
/* Expands to __builtin_ia32_vfmaddss3_mask on (A, B, C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7814 
/* Expands to __builtin_ia32_vfmaddss3_mask on (W, A, B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7820 
7821 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7822 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7823 {
7824   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7825                                         (__v4sf)__B,
7826                                         (__v4sf)__C,
7827                                         (__mmask8)__U,
7828                                         _MM_FROUND_CUR_DIRECTION);
7829 }
7830 
/* Expands to __builtin_ia32_vfmaddss3_maskz on (A, B, C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7836 
7837 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7838 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7839 {
7840   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
7841                                         (__v4sf)__X,
7842                                         (__v4sf)__Y,
7843                                         (__mmask8)__U,
7844                                         _MM_FROUND_CUR_DIRECTION);
7845 }
7846 
/* Expands to __builtin_ia32_vfmaddss3_mask3 on (W, X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7852 
7853 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7854 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7855 {
7856   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7857                                        (__v4sf)__A,
7858                                        -(__v4sf)__B,
7859                                        (__mmask8)__U,
7860                                        _MM_FROUND_CUR_DIRECTION);
7861 }
7862 
/* Expands to __builtin_ia32_vfmaddss3_mask on (A, B, -C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7868 
/* Expands to __builtin_ia32_vfmaddss3_mask on (W, A, -B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7874 
7875 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7876 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7877 {
7878   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7879                                         (__v4sf)__B,
7880                                         -(__v4sf)__C,
7881                                         (__mmask8)__U,
7882                                         _MM_FROUND_CUR_DIRECTION);
7883 }
7884 
/* Expands to __builtin_ia32_vfmaddss3_maskz on (A, B, -C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7890 
7891 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7892 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7893 {
7894   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
7895                                         (__v4sf)__X,
7896                                         (__v4sf)__Y,
7897                                         (__mmask8)__U,
7898                                         _MM_FROUND_CUR_DIRECTION);
7899 }
7900 
/* Expands to __builtin_ia32_vfmsubss3_mask3 on (W, X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3( \
      (__v4sf)(__m128)(W), (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7906 
7907 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7908 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7909 {
7910   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7911                                        -(__v4sf)__A,
7912                                        (__v4sf)__B,
7913                                        (__mmask8)__U,
7914                                        _MM_FROUND_CUR_DIRECTION);
7915 }
7916 
/* Expands to __builtin_ia32_vfmaddss3_mask on (A, -B, C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fnmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7922 
/* Expands to __builtin_ia32_vfmaddss3_mask on (W, -A, B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7928 
7929 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7930 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7931 {
7932   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7933                                         -(__v4sf)__B,
7934                                         (__v4sf)__C,
7935                                         (__mmask8)__U,
7936                                         _MM_FROUND_CUR_DIRECTION);
7937 }
7938 
/* Expands to __builtin_ia32_vfmaddss3_maskz on (A, -B, C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), (__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7944 
7945 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7946 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
7947 {
7948   return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
7949                                         -(__v4sf)__X,
7950                                         (__v4sf)__Y,
7951                                         (__mmask8)__U,
7952                                         _MM_FROUND_CUR_DIRECTION);
7953 }
7954 
/* Expands to __builtin_ia32_vfmaddss3_mask3 on (W, -X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
7960 
7961 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7962 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
7963 {
7964   return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
7965                                        -(__v4sf)__A,
7966                                        -(__v4sf)__B,
7967                                        (__mmask8)__U,
7968                                        _MM_FROUND_CUR_DIRECTION);
7969 }
7970 
/* Expands to __builtin_ia32_vfmaddss3_mask on (A, -B, -C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fnmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)-1, (int)(R)))
7976 
/* Expands to __builtin_ia32_vfmaddss3_mask on (W, -A, -B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), \
      (__mmask8)(U), (int)(R)))
7982 
7983 static __inline__ __m128 __DEFAULT_FN_ATTRS128
7984 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
7985 {
7986   return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
7987                                         -(__v4sf)__B,
7988                                         -(__v4sf)__C,
7989                                         (__mmask8)__U,
7990                                         _MM_FROUND_CUR_DIRECTION);
7991 }
7992 
/* Expands to __builtin_ia32_vfmaddss3_maskz on (A, -B, -C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz( \
      (__v4sf)(__m128)(A), -(__v4sf)(__m128)(B), -(__v4sf)(__m128)(C), \
      (__mmask8)(U), (int)(R)))
7998 
7999 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8000 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
8001 {
8002   return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
8003                                         -(__v4sf)__X,
8004                                         (__v4sf)__Y,
8005                                         (__mmask8)__U,
8006                                         _MM_FROUND_CUR_DIRECTION);
8007 }
8008 
/* Expands to __builtin_ia32_vfmsubss3_mask3 on (W, -X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3( \
      (__v4sf)(__m128)(W), -(__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
      (__mmask8)(U), (int)(R)))
8014 
8015 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8016 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8017 {
8018   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8019                                        (__v2df)__A,
8020                                        (__v2df)__B,
8021                                        (__mmask8)__U,
8022                                        _MM_FROUND_CUR_DIRECTION);
8023 }
8024 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (A, B, C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8030 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (W, A, B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8036 
8037 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8038 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8039 {
8040   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8041                                         (__v2df)__B,
8042                                         (__v2df)__C,
8043                                         (__mmask8)__U,
8044                                         _MM_FROUND_CUR_DIRECTION);
8045 }
8046 
/* Expands to __builtin_ia32_vfmaddsd3_maskz on (A, B, C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8052 
8053 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8054 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8055 {
8056   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
8057                                         (__v2df)__X,
8058                                         (__v2df)__Y,
8059                                         (__mmask8)__U,
8060                                         _MM_FROUND_CUR_DIRECTION);
8061 }
8062 
/* Expands to __builtin_ia32_vfmaddsd3_mask3 on (W, X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8068 
8069 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8070 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8071 {
8072   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8073                                        (__v2df)__A,
8074                                        -(__v2df)__B,
8075                                        (__mmask8)__U,
8076                                        _MM_FROUND_CUR_DIRECTION);
8077 }
8078 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (A, B, -C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8084 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (W, A, -B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8090 
8091 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8092 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8093 {
8094   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8095                                         (__v2df)__B,
8096                                         -(__v2df)__C,
8097                                         (__mmask8)__U,
8098                                         _MM_FROUND_CUR_DIRECTION);
8099 }
8100 
/* Expands to __builtin_ia32_vfmaddsd3_maskz on (A, B, -C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8106 
8107 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8108 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8109 {
8110   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
8111                                         (__v2df)__X,
8112                                         (__v2df)__Y,
8113                                         (__mmask8)__U,
8114                                         _MM_FROUND_CUR_DIRECTION);
8115 }
8116 
/* Expands to __builtin_ia32_vfmsubsd3_mask3 on (W, X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
      (__v2df)(__m128d)(W), (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8122 
8123 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8124 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8125 {
8126   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8127                                        -(__v2df)__A,
8128                                        (__v2df)__B,
8129                                        (__mmask8)__U,
8130                                        _MM_FROUND_CUR_DIRECTION);
8131 }
8132 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (A, -B, C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fnmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8138 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (W, -A, B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8144 
8145 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8146 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8147 {
8148   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8149                                         -(__v2df)__B,
8150                                         (__v2df)__C,
8151                                         (__mmask8)__U,
8152                                         _MM_FROUND_CUR_DIRECTION);
8153 }
8154 
/* Expands to __builtin_ia32_vfmaddsd3_maskz on (A, -B, C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), (__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8160 
8161 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8162 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8163 {
8164   return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
8165                                         -(__v2df)__X,
8166                                         (__v2df)__Y,
8167                                         (__mmask8)__U,
8168                                         _MM_FROUND_CUR_DIRECTION);
8169 }
8170 
/* Expands to __builtin_ia32_vfmaddsd3_mask3 on (W, -X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8176 
8177 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8178 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8179 {
8180   return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
8181                                        -(__v2df)__A,
8182                                        -(__v2df)__B,
8183                                        (__mmask8)__U,
8184                                        _MM_FROUND_CUR_DIRECTION);
8185 }
8186 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (A, -B, -C) with an all-ones
 * __mmask8; R is forwarded as the trailing int operand. */
#define _mm_fnmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)-1, (int)(R)))
8192 
/* Expands to __builtin_ia32_vfmaddsd3_mask on (W, -A, -B) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), \
      (__mmask8)(U), (int)(R)))
8198 
8199 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8200 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
8201 {
8202   return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
8203                                         -(__v2df)__B,
8204                                         -(__v2df)__C,
8205                                         (__mmask8)__U,
8206                                         _MM_FROUND_CUR_DIRECTION);
8207 }
8208 
/* Expands to __builtin_ia32_vfmaddsd3_maskz on (A, -B, -C) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz( \
      (__v2df)(__m128d)(A), -(__v2df)(__m128d)(B), -(__v2df)(__m128d)(C), \
      (__mmask8)(U), (int)(R)))
8215 
8216 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8217 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
8218 {
8219   return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
8220                                         -(__v2df)__X,
8221                                         (__v2df)__Y,
8222                                         (__mmask8)__U,
8223                                         _MM_FROUND_CUR_DIRECTION);
8224 }
8225 
/* Expands to __builtin_ia32_vfmsubsd3_mask3 on (W, -X, Y) with mask U; R is
 * forwarded as the trailing int operand. */
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3( \
      (__v2df)(__m128d)(W), -(__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
      (__mmask8)(U), (int)(R)))
8231 
/* Expands to __builtin_ia32_permdf512 on X (as __v8df) with control C cast
 * to int; the result is cast to __m512d. */
#define _mm512_permutex_pd(X, C) \
  ((__m512d)__builtin_ia32_permdf512( \
      (__v8df)(__m512d)(X), \
      (int)(C)))
8234 
/* Selects, under mask U, between _mm512_permutex_pd(X, C) and W using
 * __builtin_ia32_selectpd_512. */
#define _mm512_mask_permutex_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_permutex_pd((X), (C)), \
      (__v8df)(__m512d)(W)))
8239 
/* Selects, under mask U, between _mm512_permutex_pd(X, C) and
 * _mm512_setzero_pd() using __builtin_ia32_selectpd_512. */
#define _mm512_maskz_permutex_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512( \
      (__mmask8)(U), \
      (__v8df)_mm512_permutex_pd((X), (C)), \
      (__v8df)_mm512_setzero_pd()))
8244 
/* Expands to __builtin_ia32_permdi512 on X (as __v8di) with control C cast
 * to int; the result is cast to __m512i. */
#define _mm512_permutex_epi64(X, C) \
  ((__m512i)__builtin_ia32_permdi512( \
      (__v8di)(__m512i)(X), \
      (int)(C)))
8247 
/* Selects, under mask U, between _mm512_permutex_epi64(X, C) and W using
 * __builtin_ia32_selectq_512. */
#define _mm512_mask_permutex_epi64(W, U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_permutex_epi64((X), (C)), \
      (__v8di)(__m512i)(W)))
8252 
/* Selects, under mask U, between _mm512_permutex_epi64(X, C) and
 * _mm512_setzero_si512() using __builtin_ia32_selectq_512. */
#define _mm512_maskz_permutex_epi64(U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512( \
      (__mmask8)(U), \
      (__v8di)_mm512_permutex_epi64((X), (C)), \
      (__v8di)_mm512_setzero_si512()))
8257 
8258 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8259 _mm512_permutexvar_pd (__m512i __X, __m512d __Y)
8260 {
8261   return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
8262 }
8263 
8264 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8265 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
8266 {
8267   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8268                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
8269                                         (__v8df)__W);
8270 }
8271 
8272 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8273 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
8274 {
8275   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8276                                         (__v8df)_mm512_permutexvar_pd(__X, __Y),
8277                                         (__v8df)_mm512_setzero_pd());
8278 }
8279 
8280 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8281 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
8282 {
8283   return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
8284 }
8285 
8286 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8287 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
8288 {
8289   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
8290                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
8291                                      (__v8di)_mm512_setzero_si512());
8292 }
8293 
8294 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8295 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
8296              __m512i __Y)
8297 {
8298   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
8299                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
8300                                      (__v8di)__W);
8301 }
8302 
8303 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8304 _mm512_permutexvar_ps (__m512i __X, __m512 __Y)
8305 {
8306   return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
8307 }
8308 
8309 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8310 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
8311 {
8312   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8313                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
8314                                        (__v16sf)__W);
8315 }
8316 
8317 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8318 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
8319 {
8320   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8321                                        (__v16sf)_mm512_permutexvar_ps(__X, __Y),
8322                                        (__v16sf)_mm512_setzero_ps());
8323 }
8324 
8325 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8326 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
8327 {
8328   return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
8329 }
8330 
/* Alternate spelling: object-like alias for _mm512_permutexvar_epi32. */
#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
8332 
8333 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8334 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
8335 {
8336   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
8337                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
8338                                     (__v16si)_mm512_setzero_si512());
8339 }
8340 
8341 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8342 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
8343              __m512i __Y)
8344 {
8345   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
8346                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
8347                                     (__v16si)__W);
8348 }
8349 
/* Alternate spelling: object-like alias for _mm512_mask_permutexvar_epi32. */
#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
8351 
8352 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8353 _mm512_kand (__mmask16 __A, __mmask16 __B)
8354 {
8355   return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
8356 }
8357 
8358 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8359 _mm512_kandn (__mmask16 __A, __mmask16 __B)
8360 {
8361   return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
8362 }
8363 
8364 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8365 _mm512_kor (__mmask16 __A, __mmask16 __B)
8366 {
8367   return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
8368 }
8369 
8370 static __inline__ int __DEFAULT_FN_ATTRS
8371 _mm512_kortestc (__mmask16 __A, __mmask16 __B)
8372 {
8373   return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
8374 }
8375 
8376 static __inline__ int __DEFAULT_FN_ATTRS
8377 _mm512_kortestz (__mmask16 __A, __mmask16 __B)
8378 {
8379   return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
8380 }
8381 
8382 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8383 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
8384 {
8385   return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8386 }
8387 
8388 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8389 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
8390 {
8391   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8392 }
8393 
8394 static __inline__ unsigned char __DEFAULT_FN_ATTRS
8395 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
8396   *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8397   return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8398 }
8399 
8400 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8401 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
8402 {
8403   return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
8404 }
8405 
8406 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8407 _mm512_kxnor (__mmask16 __A, __mmask16 __B)
8408 {
8409   return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
8410 }
8411 
8412 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8413 _mm512_kxor (__mmask16 __A, __mmask16 __B)
8414 {
8415   return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
8416 }
8417 
/* _k*_mask16 spellings: object-like aliases for the corresponding _mm512_k*
 * functions. _mm512_knot is not defined in this part of the file. */
#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor
8424 
/* Expands to __builtin_ia32_kshiftlihi on A (as __mmask16) with I cast to
 * unsigned int; the result is cast to __mmask16. */
#define _kshiftli_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftlihi( \
      (__mmask16)(A), \
      (unsigned int)(I)))
8427 
/* Expands to __builtin_ia32_kshiftrihi on A (as __mmask16) with I cast to
 * unsigned int; the result is cast to __mmask16. */
#define _kshiftri_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftrihi( \
      (__mmask16)(A), \
      (unsigned int)(I)))
8430 
8431 static __inline__ unsigned int __DEFAULT_FN_ATTRS
8432 _cvtmask16_u32(__mmask16 __A) {
8433   return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
8434 }
8435 
8436 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8437 _cvtu32_mask16(unsigned int __A) {
8438   return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
8439 }
8440 
8441 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
8442 _load_mask16(__mmask16 *__A) {
8443   return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
8444 }
8445 
8446 static __inline__ void __DEFAULT_FN_ATTRS
8447 _store_mask16(__mmask16 *__A, __mmask16 __B) {
8448   *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
8449 }
8450 
8451 static __inline__ void __DEFAULT_FN_ATTRS512
8452 _mm512_stream_si512 (void * __P, __m512i __A)
8453 {
8454   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
8455   __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
8456 }
8457 
8458 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8459 _mm512_stream_load_si512 (void const *__P)
8460 {
8461   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
8462   return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
8463 }
8464 
8465 static __inline__ void __DEFAULT_FN_ATTRS512
8466 _mm512_stream_pd (void *__P, __m512d __A)
8467 {
8468   typedef __v8df __v8df_aligned __attribute__((aligned(64)));
8469   __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
8470 }
8471 
8472 static __inline__ void __DEFAULT_FN_ATTRS512
8473 _mm512_stream_ps (void *__P, __m512 __A)
8474 {
8475   typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
8476   __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
8477 }
8478 
8479 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8480 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
8481 {
8482   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
8483                   (__v8df) __W,
8484                   (__mmask8) __U);
8485 }
8486 
8487 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8488 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
8489 {
8490   return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
8491                   (__v8df)
8492                   _mm512_setzero_pd (),
8493                   (__mmask8) __U);
8494 }
8495 
8496 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8497 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
8498 {
8499   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
8500                   (__v8di) __W,
8501                   (__mmask8) __U);
8502 }
8503 
8504 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8505 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
8506 {
8507   return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
8508                   (__v8di)
8509                   _mm512_setzero_si512 (),
8510                   (__mmask8) __U);
8511 }
8512 
8513 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8514 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
8515 {
8516   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
8517                  (__v16sf) __W,
8518                  (__mmask16) __U);
8519 }
8520 
8521 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8522 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
8523 {
8524   return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
8525                  (__v16sf)
8526                  _mm512_setzero_ps (),
8527                  (__mmask16) __U);
8528 }
8529 
8530 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8531 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
8532 {
8533   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
8534                   (__v16si) __W,
8535                   (__mmask16) __U);
8536 }
8537 
8538 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8539 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
8540 {
8541   return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
8542                   (__v16si)
8543                   _mm512_setzero_si512 (),
8544                   (__mmask16) __U);
8545 }
8546 
/* Scalar single-precision compares, all expanding to
 * __builtin_ia32_cmpss_mask(X, Y, P, mask, rounding):
 *  - the non-"mask_" forms pass (__mmask8)-1 as the mask, the "mask_" forms
 *    pass M;
 *  - the "round" forms pass R, the others _MM_FROUND_CUR_DIRECTION. */
#define _mm_cmp_round_ss_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask( \
      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask( \
      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), \
      (__mmask8)(M), (int)(R)))

#define _mm_cmp_ss_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask( \
      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask( \
      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), \
      (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
8568 
/* Scalar double-precision compares, all expanding to
 * __builtin_ia32_cmpsd_mask(X, Y, P, mask, rounding):
 *  - the non-"mask_" forms pass (__mmask8)-1 as the mask, the "mask_" forms
 *    pass M;
 *  - the "round" forms pass R, the others _MM_FROUND_CUR_DIRECTION. */
#define _mm_cmp_round_sd_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask( \
      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask( \
      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), \
      (__mmask8)(M), (int)(R)))

#define _mm_cmp_sd_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask( \
      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask( \
      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), \
      (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
8590 
8591 /* Bit Test */
8592 
8593 static __inline __mmask16 __DEFAULT_FN_ATTRS512
8594 _mm512_test_epi32_mask (__m512i __A, __m512i __B)
8595 {
8596   return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
8597                                    _mm512_setzero_si512());
8598 }
8599 
8600 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8601 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
8602 {
8603   return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
8604                                         _mm512_setzero_si512());
8605 }
8606 
8607 static __inline __mmask8 __DEFAULT_FN_ATTRS512
8608 _mm512_test_epi64_mask (__m512i __A, __m512i __B)
8609 {
8610   return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
8611                                    _mm512_setzero_si512());
8612 }
8613 
8614 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8615 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
8616 {
8617   return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
8618                                         _mm512_setzero_si512());
8619 }
8620 
8621 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8622 _mm512_testn_epi32_mask (__m512i __A, __m512i __B)
8623 {
8624   return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
8625                                   _mm512_setzero_si512());
8626 }
8627 
8628 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
8629 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
8630 {
8631   return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
8632                                        _mm512_setzero_si512());
8633 }
8634 
8635 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8636 _mm512_testn_epi64_mask (__m512i __A, __m512i __B)
8637 {
8638   return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
8639                                   _mm512_setzero_si512());
8640 }
8641 
8642 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8643 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
8644 {
8645   return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
8646                                        _mm512_setzero_si512());
8647 }
8648 
8649 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8650 _mm512_movehdup_ps (__m512 __A)
8651 {
8652   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
8653                          1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
8654 }
8655 
8656 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8657 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
8658 {
8659   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8660                                              (__v16sf)_mm512_movehdup_ps(__A),
8661                                              (__v16sf)__W);
8662 }
8663 
8664 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8665 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
8666 {
8667   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8668                                              (__v16sf)_mm512_movehdup_ps(__A),
8669                                              (__v16sf)_mm512_setzero_ps());
8670 }
8671 
8672 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8673 _mm512_moveldup_ps (__m512 __A)
8674 {
8675   return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
8676                          0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
8677 }
8678 
8679 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8680 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
8681 {
8682   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8683                                              (__v16sf)_mm512_moveldup_ps(__A),
8684                                              (__v16sf)__W);
8685 }
8686 
8687 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8688 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
8689 {
8690   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
8691                                              (__v16sf)_mm512_moveldup_ps(__A),
8692                                              (__v16sf)_mm512_setzero_ps());
8693 }
8694 
8695 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8696 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
8697 {
8698   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
8699 }
8700 
8701 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8702 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
8703 {
8704   return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
8705                                      _mm_setzero_ps());
8706 }
8707 
8708 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8709 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
8710 {
8711   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
8712 }
8713 
8714 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8715 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
8716 {
8717   return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
8718                                      _mm_setzero_pd());
8719 }
8720 
8721 static __inline__ void __DEFAULT_FN_ATTRS128
8722 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
8723 {
8724   __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
8725 }
8726 
8727 static __inline__ void __DEFAULT_FN_ATTRS128
8728 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
8729 {
8730   __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
8731 }
8732 
8733 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8734 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
8735 {
8736   __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
8737                                                 (__v4sf)_mm_setzero_ps(),
8738                                                 0, 4, 4, 4);
8739 
8740   return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
8741 }
8742 
8743 static __inline__ __m128 __DEFAULT_FN_ATTRS128
8744 _mm_maskz_load_ss (__mmask8 __U, const float* __A)
8745 {
8746   return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
8747                                                 (__v4sf) _mm_setzero_ps(),
8748                                                 __U & 1);
8749 }
8750 
8751 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8752 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
8753 {
8754   __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
8755                                                  (__v2df)_mm_setzero_pd(),
8756                                                  0, 2);
8757 
8758   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
8759 }
8760 
8761 static __inline__ __m128d __DEFAULT_FN_ATTRS128
8762 _mm_maskz_load_sd (__mmask8 __U, const double* __A)
8763 {
8764   return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
8765                                                   (__v2df) _mm_setzero_pd(),
8766                                                   __U & 1);
8767 }
8768 
/* Expands to __builtin_ia32_pshufd512 on A with the control value I. */
#define _mm512_shuffle_epi32(A, I) \
  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))

/* _mm512_shuffle_epi32(A, I) combined with W through
 * __builtin_ia32_selectd_512 under mask U. */
#define _mm512_mask_shuffle_epi32(W, U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_epi32((A), (I)), \
      (__v16si)(__m512i)(W)))

/* _mm512_shuffle_epi32(A, I) combined with an all-zero vector through
 * __builtin_ia32_selectd_512 under mask U. */
#define _mm512_maskz_shuffle_epi32(U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512( \
      (__mmask16)(U), \
      (__v16si)_mm512_shuffle_epi32((A), (I)), \
      (__v16si)_mm512_setzero_si512()))
8781 
8782 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8783 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
8784 {
8785   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8786                 (__v8df) __W,
8787                 (__mmask8) __U);
8788 }
8789 
8790 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8791 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
8792 {
8793   return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8794                 (__v8df) _mm512_setzero_pd (),
8795                 (__mmask8) __U);
8796 }
8797 
8798 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8799 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
8800 {
8801   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8802                 (__v8di) __W,
8803                 (__mmask8) __U);
8804 }
8805 
8806 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8807 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
8808 {
8809   return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8810                 (__v8di) _mm512_setzero_si512 (),
8811                 (__mmask8) __U);
8812 }
8813 
8814 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8815 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
8816 {
8817   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8818               (__v8df) __W,
8819               (__mmask8) __U);
8820 }
8821 
8822 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8823 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
8824 {
8825   return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8826               (__v8df) _mm512_setzero_pd(),
8827               (__mmask8) __U);
8828 }
8829 
8830 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8831 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
8832 {
8833   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8834               (__v8di) __W,
8835               (__mmask8) __U);
8836 }
8837 
8838 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8839 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
8840 {
8841   return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8842               (__v8di) _mm512_setzero_si512(),
8843               (__mmask8) __U);
8844 }
8845 
8846 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8847 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
8848 {
8849   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8850                    (__v16sf) __W,
8851                    (__mmask16) __U);
8852 }
8853 
8854 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8855 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
8856 {
8857   return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8858                    (__v16sf) _mm512_setzero_ps(),
8859                    (__mmask16) __U);
8860 }
8861 
8862 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8863 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
8864 {
8865   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8866               (__v16si) __W,
8867               (__mmask16) __U);
8868 }
8869 
8870 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8871 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
8872 {
8873   return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8874               (__v16si) _mm512_setzero_si512(),
8875               (__mmask16) __U);
8876 }
8877 
8878 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8879 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
8880 {
8881   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8882                (__v16sf) __W,
8883                (__mmask16) __U);
8884 }
8885 
8886 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8887 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
8888 {
8889   return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8890                (__v16sf) _mm512_setzero_ps(),
8891                (__mmask16) __U);
8892 }
8893 
8894 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8895 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
8896 {
8897   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8898                 (__v16si) __W,
8899                 (__mmask16) __U);
8900 }
8901 
8902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
8903 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
8904 {
8905   return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8906                 (__v16si) _mm512_setzero_si512(),
8907                 (__mmask16) __U);
8908 }
8909 
/* All three forms expand to __builtin_ia32_cvtps2pd512_mask(A, src, mask, R):
 *  - plain:  src is _mm512_undefined_pd(), mask is (__mmask8)-1;
 *  - mask_:  src is W, mask is U;
 *  - maskz_: src is _mm512_setzero_pd(), mask is U. */
#define _mm512_cvt_roundps_pd(A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask( \
      (__v8sf)(__m256)(A), (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask( \
      (__v8sf)(__m256)(A), (__v8df)(__m512d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask( \
      (__v8sf)(__m256)(A), (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
8924 
8925 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8926 _mm512_cvtps_pd (__m256 __A)
8927 {
8928   return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
8929 }
8930 
8931 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8932 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
8933 {
8934   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8935                                               (__v8df)_mm512_cvtps_pd(__A),
8936                                               (__v8df)__W);
8937 }
8938 
8939 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8940 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
8941 {
8942   return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
8943                                               (__v8df)_mm512_cvtps_pd(__A),
8944                                               (__v8df)_mm512_setzero_pd());
8945 }
8946 
8947 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8948 _mm512_cvtpslo_pd (__m512 __A)
8949 {
8950   return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
8951 }
8952 
8953 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8954 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
8955 {
8956   return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
8957 }
8958 
8959 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8960 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
8961 {
8962   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
8963               (__v8df) __A,
8964               (__v8df) __W);
8965 }
8966 
8967 static __inline__ __m512d __DEFAULT_FN_ATTRS512
8968 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
8969 {
8970   return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
8971               (__v8df) __A,
8972               (__v8df) _mm512_setzero_pd ());
8973 }
8974 
8975 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8976 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
8977 {
8978   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
8979              (__v16sf) __A,
8980              (__v16sf) __W);
8981 }
8982 
8983 static __inline__ __m512 __DEFAULT_FN_ATTRS512
8984 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
8985 {
8986   return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
8987              (__v16sf) __A,
8988              (__v16sf) _mm512_setzero_ps ());
8989 }
8990 
8991 static __inline__ void __DEFAULT_FN_ATTRS512
8992 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
8993 {
8994   __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
8995             (__mmask8) __U);
8996 }
8997 
8998 static __inline__ void __DEFAULT_FN_ATTRS512
8999 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
9000 {
9001   __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
9002             (__mmask8) __U);
9003 }
9004 
9005 static __inline__ void __DEFAULT_FN_ATTRS512
9006 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
9007 {
9008   __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
9009             (__mmask16) __U);
9010 }
9011 
9012 static __inline__ void __DEFAULT_FN_ATTRS512
9013 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
9014 {
9015   __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
9016             (__mmask16) __U);
9017 }
9018 
/* All three forms expand to
 * __builtin_ia32_cvtsd2ss_round_mask(A, B, src, mask, R):
 *  - plain:  src is _mm_undefined_ps(), mask is (__mmask8)-1;
 *  - mask_:  src is W, mask is U;
 *  - maskz_: src is _mm_setzero_ps(), mask is U. */
#define _mm_cvt_roundsd_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask( \
      (__v4sf)(__m128)(A), (__v2df)(__m128d)(B), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask( \
      (__v4sf)(__m128)(A), (__v2df)(__m128d)(B), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask( \
      (__v4sf)(__m128)(A), (__v2df)(__m128d)(B), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
9036 
9037 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9038 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
9039 {
9040   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
9041                                              (__v2df)__B,
9042                                              (__v4sf)__W,
9043                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9044 }
9045 
9046 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9047 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
9048 {
9049   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
9050                                              (__v2df)__B,
9051                                              (__v4sf)_mm_setzero_ps(),
9052                                              (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9053 }
9054 
/* "i32"/"i64" spellings: plain aliases for the corresponding "si32"/"si64"
 * conversion intrinsics. The 64-bit aliases exist only when __x86_64__ is
 * defined. */
#define _mm_cvtss_i32  _mm_cvtss_si32
#define _mm_cvtsd_i32  _mm_cvtsd_si32
#define _mm_cvti32_sd  _mm_cvtsi32_sd
#define _mm_cvti32_ss  _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64  _mm_cvtss_si64
#define _mm_cvtsd_i64  _mm_cvtsd_si64
#define _mm_cvti64_sd  _mm_cvtsi64_sd
#define _mm_cvti64_ss  _mm_cvtsi64_ss
#endif
9065 
#ifdef __x86_64__
/* Two spellings with the same expansion: __builtin_ia32_cvtsi2sd64 on A, with
 * B cast to long long and R cast to int. */
#define _mm_cvt_roundi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64( \
      (__v2df)(__m128d)(A), (long long)(B), (int)(R)))

#define _mm_cvt_roundsi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64( \
      (__v2df)(__m128d)(A), (long long)(B), (int)(R)))
#endif
9075 
/* Two spellings with the same expansion: __builtin_ia32_cvtsi2ss32 on A, with
 * B and R cast to int. */
#define _mm_cvt_roundsi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32( \
      (__v4sf)(__m128)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32( \
      (__v4sf)(__m128)(A), (int)(B), (int)(R)))
9081 
#ifdef __x86_64__
/* Two spellings with the same expansion: __builtin_ia32_cvtsi2ss64 on A, with
 * B cast to long long and R cast to int. */
#define _mm_cvt_roundsi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64( \
      (__v4sf)(__m128)(A), (long long)(B), (int)(R)))

#define _mm_cvt_roundi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64( \
      (__v4sf)(__m128)(A), (long long)(B), (int)(R)))
#endif
9091 
/* All three forms expand to
 * __builtin_ia32_cvtss2sd_round_mask(A, B, src, mask, R):
 *  - plain:  src is _mm_undefined_pd(), mask is (__mmask8)-1;
 *  - mask_:  src is W, mask is U;
 *  - maskz_: src is _mm_setzero_pd(), mask is U. */
#define _mm_cvt_roundss_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask( \
      (__v2df)(__m128d)(A), (__v4sf)(__m128)(B), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask( \
      (__v2df)(__m128d)(A), (__v4sf)(__m128)(B), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask( \
      (__v2df)(__m128d)(A), (__v4sf)(__m128)(B), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))
9109 
9110 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9111 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
9112 {
9113   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
9114                                             (__v4sf)__B,
9115                                             (__v2df)__W,
9116                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9117 }
9118 
9119 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9120 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
9121 {
9122   return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
9123                                             (__v4sf)__B,
9124                                             (__v2df)_mm_setzero_pd(),
9125                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
9126 }
9127 
9128 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9129 _mm_cvtu32_sd (__m128d __A, unsigned __B)
9130 {
9131   __A[0] = __B;
9132   return __A;
9133 }
9134 
9135 #ifdef __x86_64__
9136 #define _mm_cvt_roundu64_sd(A, B, R) \
9137   ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
9138                                        (unsigned long long)(B), (int)(R)))
9139 
9140 static __inline__ __m128d __DEFAULT_FN_ATTRS128
9141 _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
9142 {
9143   __A[0] = __B;
9144   return __A;
9145 }
9146 #endif
9147 
/* Expands to __builtin_ia32_cvtusi2ss32 on A, with B cast to unsigned int and
 * R cast to int. */
#define _mm_cvt_roundu32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss32( \
      (__v4sf)(__m128)(A), (unsigned int)(B), (int)(R)))
9151 
9152 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9153 _mm_cvtu32_ss (__m128 __A, unsigned __B)
9154 {
9155   __A[0] = __B;
9156   return __A;
9157 }
9158 
9159 #ifdef __x86_64__
9160 #define _mm_cvt_roundu64_ss(A, B, R) \
9161   ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
9162                                       (unsigned long long)(B), (int)(R)))
9163 
9164 static __inline__ __m128 __DEFAULT_FN_ATTRS128
9165 _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
9166 {
9167   __A[0] = __B;
9168   return __A;
9169 }
9170 #endif
9171 
9172 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9173 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
9174 {
9175   return (__m512i) __builtin_ia32_selectd_512(__M,
9176                                               (__v16si) _mm512_set1_epi32(__A),
9177                                               (__v16si) __O);
9178 }
9179 
9180 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9181 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
9182 {
9183   return (__m512i) __builtin_ia32_selectq_512(__M,
9184                                               (__v8di) _mm512_set1_epi64(__A),
9185                                               (__v8di) __O);
9186 }
9187 
9188 static  __inline __m512i __DEFAULT_FN_ATTRS512
9189 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
9190     char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
9191     char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
9192     char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
9193     char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
9194     char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
9195     char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
9196     char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
9197     char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
9198     char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
9199     char __e4, char __e3, char __e2, char __e1, char __e0) {
9200 
9201   return __extension__ (__m512i)(__v64qi)
9202     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
9203      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
9204      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
9205      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
9206      __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
9207      __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
9208      __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
9209      __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
9210 }
9211 
9212 static  __inline __m512i __DEFAULT_FN_ATTRS512
9213 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
9214     short __e27, short __e26, short __e25, short __e24, short __e23,
9215     short __e22, short __e21, short __e20, short __e19, short __e18,
9216     short __e17, short __e16, short __e15, short __e14, short __e13,
9217     short __e12, short __e11, short __e10, short __e9, short __e8,
9218     short __e7, short __e6, short __e5, short __e4, short __e3,
9219     short __e2, short __e1, short __e0) {
9220   return __extension__ (__m512i)(__v32hi)
9221     {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
9222      __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
9223      __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
9224      __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
9225 }
9226 
9227 static __inline __m512i __DEFAULT_FN_ATTRS512
9228 _mm512_set_epi32 (int __A, int __B, int __C, int __D,
9229      int __E, int __F, int __G, int __H,
9230      int __I, int __J, int __K, int __L,
9231      int __M, int __N, int __O, int __P)
9232 {
9233   return __extension__ (__m512i)(__v16si)
9234   { __P, __O, __N, __M, __L, __K, __J, __I,
9235     __H, __G, __F, __E, __D, __C, __B, __A };
9236 }
9237 
/* Reversed-argument form of _mm512_set_epi32: e0 becomes element 0 and e15
 * becomes element 15. */
#define _mm512_setr_epi32(e0, e1, e2, e3, e4, e5, e6, e7,                      \
                          e8, e9, e10, e11, e12, e13, e14, e15)                \
  _mm512_set_epi32((e15), (e14), (e13), (e12), (e11), (e10), (e9), (e8),       \
                   (e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9242 
9243 static __inline__ __m512i __DEFAULT_FN_ATTRS512
9244 _mm512_set_epi64 (long long __A, long long __B, long long __C,
9245      long long __D, long long __E, long long __F,
9246      long long __G, long long __H)
9247 {
9248   return __extension__ (__m512i) (__v8di)
9249   { __H, __G, __F, __E, __D, __C, __B, __A };
9250 }
9251 
/* Reversed-argument form of _mm512_set_epi64: e0 becomes element 0 and e7
 * becomes element 7. */
#define _mm512_setr_epi64(e0, e1, e2, e3, e4, e5, e6, e7)                      \
  _mm512_set_epi64((e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9254 
9255 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9256 _mm512_set_pd (double __A, double __B, double __C, double __D,
9257         double __E, double __F, double __G, double __H)
9258 {
9259   return __extension__ (__m512d)
9260   { __H, __G, __F, __E, __D, __C, __B, __A };
9261 }
9262 
/* Reversed-argument form of _mm512_set_pd: e0 becomes element 0 and e7
 * becomes element 7. */
#define _mm512_setr_pd(e0, e1, e2, e3, e4, e5, e6, e7)                         \
  _mm512_set_pd((e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9265 
9266 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9267 _mm512_set_ps (float __A, float __B, float __C, float __D,
9268         float __E, float __F, float __G, float __H,
9269         float __I, float __J, float __K, float __L,
9270         float __M, float __N, float __O, float __P)
9271 {
9272   return __extension__ (__m512)
9273   { __P, __O, __N, __M, __L, __K, __J, __I,
9274     __H, __G, __F, __E, __D, __C, __B, __A };
9275 }
9276 
/* Reversed-argument form of _mm512_set_ps: e0 becomes element 0 and e15
 * becomes element 15. */
#define _mm512_setr_ps(e0, e1, e2, e3, e4, e5, e6, e7,                         \
                       e8, e9, e10, e11, e12, e13, e14, e15)                   \
  _mm512_set_ps((e15), (e14), (e13), (e12), (e11), (e10), (e9), (e8),          \
                (e7), (e6), (e5), (e4), (e3), (e2), (e1), (e0))
9280 
9281 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9282 _mm512_abs_ps(__m512 __A)
9283 {
9284   return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
9285 }
9286 
9287 static __inline__ __m512 __DEFAULT_FN_ATTRS512
9288 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
9289 {
9290   return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
9291 }
9292 
9293 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9294 _mm512_abs_pd(__m512d __A)
9295 {
9296   return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
9297 }
9298 
9299 static __inline__ __m512d __DEFAULT_FN_ATTRS512
9300 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
9301 {
9302   return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
9303 }
9304 
9305 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
9306  * outputs. This class of vector operation forms the basis of many scientific
9307  * computations. In vector-reduction arithmetic, the evaluation order is
9308  * independent of the order of the input elements of V.
9309 
9310  * For floating-point intrinsics:
9311  * 1. When using fadd/fmul intrinsics, the order of operations within the
9312  * vector is unspecified (associative math).
9313  * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
9314  * produce unspecified results.
9315 
 * The bisection method is used: at each step, the vector produced by the
 * previous step is split in half and the operation is applied to its two
 * halves. This takes log2(n) steps, where n is the number of elements in the
 * vector.
9319  */
9320 
9321 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
9322   return __builtin_reduce_add((__v8di)__W);
9323 }
9324 
9325 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
9326   return __builtin_reduce_mul((__v8di)__W);
9327 }
9328 
9329 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
9330   return __builtin_reduce_and((__v8di)__W);
9331 }
9332 
9333 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
9334   return __builtin_reduce_or((__v8di)__W);
9335 }
9336 
9337 static __inline__ long long __DEFAULT_FN_ATTRS512
9338 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
9339   __W = _mm512_maskz_mov_epi64(__M, __W);
9340   return __builtin_reduce_add((__v8di)__W);
9341 }
9342 
9343 static __inline__ long long __DEFAULT_FN_ATTRS512
9344 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
9345   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
9346   return __builtin_reduce_mul((__v8di)__W);
9347 }
9348 
9349 static __inline__ long long __DEFAULT_FN_ATTRS512
9350 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
9351   __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
9352   return __builtin_reduce_and((__v8di)__W);
9353 }
9354 
9355 static __inline__ long long __DEFAULT_FN_ATTRS512
9356 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
9357   __W = _mm512_maskz_mov_epi64(__M, __W);
9358   return __builtin_reduce_or((__v8di)__W);
9359 }
9360 
9361 // -0.0 is used to ignore the start value since it is the neutral value of
9362 // floating point addition. For more information, please refer to
9363 // https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
9364 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
9365   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
9366 }
9367 
9368 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
9369   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
9370 }
9371 
9372 static __inline__ double __DEFAULT_FN_ATTRS512
9373 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
9374   __W = _mm512_maskz_mov_pd(__M, __W);
9375   return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
9376 }
9377 
9378 static __inline__ double __DEFAULT_FN_ATTRS512
9379 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
9380   __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
9381   return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
9382 }
9383 
9384 static __inline__ int __DEFAULT_FN_ATTRS512
9385 _mm512_reduce_add_epi32(__m512i __W) {
9386   return __builtin_reduce_add((__v16si)__W);
9387 }
9388 
9389 static __inline__ int __DEFAULT_FN_ATTRS512
9390 _mm512_reduce_mul_epi32(__m512i __W) {
9391   return __builtin_reduce_mul((__v16si)__W);
9392 }
9393 
9394 static __inline__ int __DEFAULT_FN_ATTRS512
9395 _mm512_reduce_and_epi32(__m512i __W) {
9396   return __builtin_reduce_and((__v16si)__W);
9397 }
9398 
9399 static __inline__ int __DEFAULT_FN_ATTRS512
9400 _mm512_reduce_or_epi32(__m512i __W) {
9401   return __builtin_reduce_or((__v16si)__W);
9402 }
9403 
9404 static __inline__ int __DEFAULT_FN_ATTRS512
9405 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
9406   __W = _mm512_maskz_mov_epi32(__M, __W);
9407   return __builtin_reduce_add((__v16si)__W);
9408 }
9409 
9410 static __inline__ int __DEFAULT_FN_ATTRS512
9411 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
9412   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
9413   return __builtin_reduce_mul((__v16si)__W);
9414 }
9415 
9416 static __inline__ int __DEFAULT_FN_ATTRS512
9417 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
9418   __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
9419   return __builtin_reduce_and((__v16si)__W);
9420 }
9421 
9422 static __inline__ int __DEFAULT_FN_ATTRS512
9423 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
9424   __W = _mm512_maskz_mov_epi32(__M, __W);
9425   return __builtin_reduce_or((__v16si)__W);
9426 }
9427 
9428 static __inline__ float __DEFAULT_FN_ATTRS512
9429 _mm512_reduce_add_ps(__m512 __W) {
9430   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9431 }
9432 
9433 static __inline__ float __DEFAULT_FN_ATTRS512
9434 _mm512_reduce_mul_ps(__m512 __W) {
9435   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9436 }
9437 
9438 static __inline__ float __DEFAULT_FN_ATTRS512
9439 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
9440   __W = _mm512_maskz_mov_ps(__M, __W);
9441   return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9442 }
9443 
9444 static __inline__ float __DEFAULT_FN_ATTRS512
9445 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
9446   __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
9447   return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9448 }
9449 
9450 static __inline__ long long __DEFAULT_FN_ATTRS512
9451 _mm512_reduce_max_epi64(__m512i __V) {
9452   return __builtin_reduce_max((__v8di)__V);
9453 }
9454 
9455 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9456 _mm512_reduce_max_epu64(__m512i __V) {
9457   return __builtin_reduce_max((__v8du)__V);
9458 }
9459 
9460 static __inline__ long long __DEFAULT_FN_ATTRS512
9461 _mm512_reduce_min_epi64(__m512i __V) {
9462   return __builtin_reduce_min((__v8di)__V);
9463 }
9464 
9465 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9466 _mm512_reduce_min_epu64(__m512i __V) {
9467   return __builtin_reduce_min((__v8du)__V);
9468 }
9469 
9470 static __inline__ long long __DEFAULT_FN_ATTRS512
9471 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
9472   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
9473   return __builtin_reduce_max((__v8di)__V);
9474 }
9475 
9476 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9477 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
9478   __V = _mm512_maskz_mov_epi64(__M, __V);
9479   return __builtin_reduce_max((__v8du)__V);
9480 }
9481 
9482 static __inline__ long long __DEFAULT_FN_ATTRS512
9483 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
9484   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
9485   return __builtin_reduce_min((__v8di)__V);
9486 }
9487 
9488 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9489 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
9490   __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
9491   return __builtin_reduce_min((__v8du)__V);
9492 }
9493 static __inline__ int __DEFAULT_FN_ATTRS512
9494 _mm512_reduce_max_epi32(__m512i __V) {
9495   return __builtin_reduce_max((__v16si)__V);
9496 }
9497 
9498 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9499 _mm512_reduce_max_epu32(__m512i __V) {
9500   return __builtin_reduce_max((__v16su)__V);
9501 }
9502 
9503 static __inline__ int __DEFAULT_FN_ATTRS512
9504 _mm512_reduce_min_epi32(__m512i __V) {
9505   return __builtin_reduce_min((__v16si)__V);
9506 }
9507 
9508 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9509 _mm512_reduce_min_epu32(__m512i __V) {
9510   return __builtin_reduce_min((__v16su)__V);
9511 }
9512 
9513 static __inline__ int __DEFAULT_FN_ATTRS512
9514 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
9515   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
9516   return __builtin_reduce_max((__v16si)__V);
9517 }
9518 
9519 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9520 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
9521   __V = _mm512_maskz_mov_epi32(__M, __V);
9522   return __builtin_reduce_max((__v16su)__V);
9523 }
9524 
9525 static __inline__ int __DEFAULT_FN_ATTRS512
9526 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
9527   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
9528   return __builtin_reduce_min((__v16si)__V);
9529 }
9530 
9531 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9532 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
9533   __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
9534   return __builtin_reduce_min((__v16su)__V);
9535 }
9536 
9537 static __inline__ double __DEFAULT_FN_ATTRS512
9538 _mm512_reduce_max_pd(__m512d __V) {
9539   return __builtin_ia32_reduce_fmax_pd512(__V);
9540 }
9541 
9542 static __inline__ double __DEFAULT_FN_ATTRS512
9543 _mm512_reduce_min_pd(__m512d __V) {
9544   return __builtin_ia32_reduce_fmin_pd512(__V);
9545 }
9546 
9547 static __inline__ double __DEFAULT_FN_ATTRS512
9548 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
9549   __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
9550   return __builtin_ia32_reduce_fmax_pd512(__V);
9551 }
9552 
9553 static __inline__ double __DEFAULT_FN_ATTRS512
9554 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
9555   __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
9556   return __builtin_ia32_reduce_fmin_pd512(__V);
9557 }
9558 
9559 static __inline__ float __DEFAULT_FN_ATTRS512
9560 _mm512_reduce_max_ps(__m512 __V) {
9561   return __builtin_ia32_reduce_fmax_ps512(__V);
9562 }
9563 
9564 static __inline__ float __DEFAULT_FN_ATTRS512
9565 _mm512_reduce_min_ps(__m512 __V) {
9566   return __builtin_ia32_reduce_fmin_ps512(__V);
9567 }
9568 
9569 static __inline__ float __DEFAULT_FN_ATTRS512
9570 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
9571   __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
9572   return __builtin_ia32_reduce_fmax_ps512(__V);
9573 }
9574 
9575 static __inline__ float __DEFAULT_FN_ATTRS512
9576 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
9577   __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
9578   return __builtin_ia32_reduce_fmin_ps512(__V);
9579 }
9580 
9581 /// Moves the least significant 32 bits of a vector of [16 x i32] to a
9582 ///    32-bit signed integer value.
9583 ///
9584 /// \headerfile <x86intrin.h>
9585 ///
9586 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
9587 ///
9588 /// \param __A
9589 ///    A vector of [16 x i32]. The least significant 32 bits are moved to the
9590 ///    destination.
9591 /// \returns A 32-bit signed integer containing the moved value.
9592 static __inline__ int __DEFAULT_FN_ATTRS512
9593 _mm512_cvtsi512_si32(__m512i __A) {
9594   __v16si __b = (__v16si)__A;
9595   return __b[0];
9596 }
9597 
/// Gathers 8 double-precision (64-bit) floating-point elements from memory.
/// Each element is read from \a base_addr plus the corresponding packed 32-bit
/// integer index, taken from the lower half of \a vindex and scaled by
/// \a scale, and the gathered elements are stored in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex),                          \
                      (base_addr), (scale))
9615 
/// Gathers 8 double-precision (64-bit) floating-point elements from memory
/// under writemask \a mask. For each element whose mask bit is set, the value
/// is read from \a base_addr plus the corresponding packed 32-bit integer
/// index, taken from the lower half of \a vindex and scaled by \a scale. For
/// each element whose mask bit is not set, the value is copied from \a src.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
  _mm512_mask_i32gather_pd((src), (mask),                                      \
                           _mm512_castsi512_si256(vindex),                     \
                           (base_addr), (scale))
9640 
/// Gathers 8 64-bit integer elements from memory. Each element is read from
/// \a base_addr plus the corresponding packed 32-bit integer index, taken from
/// the lower half of \a vindex and scaled by \a scale, and the gathered
/// elements are stored in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex),                       \
                         (base_addr), (scale))
9658 
/// Gathers 8 64-bit integer elements from memory under writemask \a mask. For
/// each element whose mask bit is set, the value is read from \a base_addr
/// plus the corresponding packed 32-bit integer index, taken from the lower
/// half of \a vindex and scaled by \a scale. For each element whose mask bit
/// is not set, the value is copied from \a src.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
  _mm512_mask_i32gather_epi64((src), (mask),                                   \
                              _mm512_castsi512_si256(vindex),                  \
                              (base_addr), (scale))
9682 
/// Scatters the 8 packed double-precision (64-bit) floating-point elements of
/// \a v1 to memory. Each element is written to \a base_addr plus the
/// corresponding packed 32-bit integer index, taken from the lower half of
/// \a vindex and scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
  _mm512_i32scatter_pd((base_addr),                                            \
                       _mm512_castsi512_si256(vindex), (v1), (scale))
9699 
/// Scatters the 8 packed double-precision (64-bit) floating-point elements of
/// \a v1 to memory under writemask \a mask. Each element whose mask bit is set
/// is written to \a base_addr plus the corresponding packed 32-bit integer
/// index, taken from the lower half of \a vindex and scaled by \a scale.
/// Elements whose mask bit is not set are not written to memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
  _mm512_mask_i32scatter_pd((base_addr), (mask),                               \
                            _mm512_castsi512_si256(vindex),                    \
                            (v1), (scale))
9721 
/// Scatters the 8 packed 64-bit integer elements of \a v1 to memory. Each
/// element is written to \a base_addr plus the corresponding packed 32-bit
/// integer index, taken from the lower half of \a vindex and scaled by
/// \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
  _mm512_i32scatter_epi64((base_addr), _mm512_castsi512_si256(vindex),         \
                          (v1), (scale))
9739 
/// Scatters the 8 packed 64-bit integer elements of \a v1 to memory under
/// writemask \a mask. Each element whose mask bit is set is written to
/// \a base_addr plus the corresponding packed 32-bit integer index, taken from
/// the lower half of \a vindex and scaled by \a scale. Elements whose mask bit
/// is not set are not written to memory.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
  _mm512_mask_i32scatter_epi64((base_addr), (mask),                            \
                               _mm512_castsi512_si256(vindex),                 \
                               (v1), (scale))
9760 
9761 #undef __DEFAULT_FN_ATTRS512
9762 #undef __DEFAULT_FN_ATTRS128
9763 #undef __DEFAULT_FN_ATTRS
9764 
9765 #endif /* __AVX512FINTRIN_H */
9766