1 /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifndef __AVX512FINTRIN_H 14 #define __AVX512FINTRIN_H 15 16 typedef char __v64qi __attribute__((__vector_size__(64))); 17 typedef short __v32hi __attribute__((__vector_size__(64))); 18 typedef double __v8df __attribute__((__vector_size__(64))); 19 typedef float __v16sf __attribute__((__vector_size__(64))); 20 typedef long long __v8di __attribute__((__vector_size__(64))); 21 typedef int __v16si __attribute__((__vector_size__(64))); 22 23 /* Unsigned types */ 24 typedef unsigned char __v64qu __attribute__((__vector_size__(64))); 25 typedef unsigned short __v32hu __attribute__((__vector_size__(64))); 26 typedef unsigned long long __v8du __attribute__((__vector_size__(64))); 27 typedef unsigned int __v16su __attribute__((__vector_size__(64))); 28 29 /* We need an explicitly signed variant for char. Note that this shouldn't 30 * appear in the interface though. 
*/ 31 typedef signed char __v64qs __attribute__((__vector_size__(64))); 32 33 typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); 34 typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); 35 typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); 36 37 typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); 38 typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); 39 typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); 40 41 typedef unsigned char __mmask8; 42 typedef unsigned short __mmask16; 43 44 /* Rounding mode macros. */ 45 #define _MM_FROUND_TO_NEAREST_INT 0x00 46 #define _MM_FROUND_TO_NEG_INF 0x01 47 #define _MM_FROUND_TO_POS_INF 0x02 48 #define _MM_FROUND_TO_ZERO 0x03 49 #define _MM_FROUND_CUR_DIRECTION 0x04 50 51 /* Constants for integer comparison predicates */ 52 typedef enum { 53 _MM_CMPINT_EQ, /* Equal */ 54 _MM_CMPINT_LT, /* Less than */ 55 _MM_CMPINT_LE, /* Less than or Equal */ 56 _MM_CMPINT_UNUSED, 57 _MM_CMPINT_NE, /* Not Equal */ 58 _MM_CMPINT_NLT, /* Not Less than */ 59 #define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ 60 _MM_CMPINT_NLE /* Not Less than or Equal */ 61 #define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ 62 } _MM_CMPINT_ENUM; 63 64 typedef enum 65 { 66 _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, 67 _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, 68 _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, 69 _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, 70 _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, 71 _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, 72 _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, 73 _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, 74 _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, 75 
_MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, 76 _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, 77 _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, 78 _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, 79 _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, 80 _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, 81 _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, 82 _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, 83 _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, 84 _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, 85 _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, 86 _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, 87 _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, 88 _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, 89 _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, 90 _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, 91 _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, 92 _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, 93 _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, 94 _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, 95 _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, 96 _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, 97 _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, 98 _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, 99 _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, 100 _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, 101 _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, 102 _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, 103 _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 
0x71, 104 _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, 105 _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, 106 _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, 107 _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, 108 _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, 109 _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, 110 _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, 111 _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, 112 _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, 113 _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, 114 _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, 115 _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, 116 _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, 117 _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, 118 _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, 119 _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, 120 _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, 121 _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, 122 _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, 123 _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, 124 _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, 125 _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, 126 _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, 127 _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, 128 _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, 129 _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, 130 _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, 131 _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, 132 _MM_PERM_DABC = 0xC6, 
_MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, 133 _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, 134 _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, 135 _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, 136 _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, 137 _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, 138 _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, 139 _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, 140 _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, 141 _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, 142 _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, 143 _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, 144 _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, 145 _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, 146 _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, 147 _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, 148 _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, 149 _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, 150 _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, 151 _MM_PERM_DDDD = 0xFF 152 } _MM_PERM_ENUM; 153 154 typedef enum 155 { 156 _MM_MANT_NORM_1_2, /* interval [1, 2) */ 157 _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ 158 _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ 159 _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ 160 } _MM_MANTISSA_NORM_ENUM; 161 162 typedef enum 163 { 164 _MM_MANT_SIGN_src, /* sign = sign(SRC) */ 165 _MM_MANT_SIGN_zero, /* sign = 0 */ 166 _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ 167 } _MM_MANTISSA_SIGN_ENUM; 168 169 /* Define the default attributes for the functions in this file. 
*/ 170 #define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) 171 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) 172 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) 173 174 /* Create vectors with repeated elements */ 175 176 static __inline __m512i __DEFAULT_FN_ATTRS512 177 _mm512_setzero_si512(void) 178 { 179 return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; 180 } 181 182 #define _mm512_setzero_epi32 _mm512_setzero_si512 183 184 static __inline__ __m512d __DEFAULT_FN_ATTRS512 185 _mm512_undefined_pd(void) 186 { 187 return (__m512d)__builtin_ia32_undef512(); 188 } 189 190 static __inline__ __m512 __DEFAULT_FN_ATTRS512 191 _mm512_undefined(void) 192 { 193 return (__m512)__builtin_ia32_undef512(); 194 } 195 196 static __inline__ __m512 __DEFAULT_FN_ATTRS512 197 _mm512_undefined_ps(void) 198 { 199 return (__m512)__builtin_ia32_undef512(); 200 } 201 202 static __inline__ __m512i __DEFAULT_FN_ATTRS512 203 _mm512_undefined_epi32(void) 204 { 205 return (__m512i)__builtin_ia32_undef512(); 206 } 207 208 static __inline__ __m512i __DEFAULT_FN_ATTRS512 209 _mm512_broadcastd_epi32 (__m128i __A) 210 { 211 return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, 212 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 213 } 214 215 static __inline__ __m512i __DEFAULT_FN_ATTRS512 216 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) 217 { 218 return (__m512i)__builtin_ia32_selectd_512(__M, 219 (__v16si) _mm512_broadcastd_epi32(__A), 220 (__v16si) __O); 221 } 222 223 static __inline__ __m512i __DEFAULT_FN_ATTRS512 224 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) 225 { 226 return (__m512i)__builtin_ia32_selectd_512(__M, 227 (__v16si) _mm512_broadcastd_epi32(__A), 228 (__v16si) _mm512_setzero_si512()); 229 } 230 231 static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 232 _mm512_broadcastq_epi64 (__m128i __A) 233 { 234 return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, 235 0, 0, 0, 0, 0, 0, 0, 0); 236 } 237 238 static __inline__ __m512i __DEFAULT_FN_ATTRS512 239 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) 240 { 241 return (__m512i)__builtin_ia32_selectq_512(__M, 242 (__v8di) _mm512_broadcastq_epi64(__A), 243 (__v8di) __O); 244 245 } 246 247 static __inline__ __m512i __DEFAULT_FN_ATTRS512 248 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) 249 { 250 return (__m512i)__builtin_ia32_selectq_512(__M, 251 (__v8di) _mm512_broadcastq_epi64(__A), 252 (__v8di) _mm512_setzero_si512()); 253 } 254 255 256 static __inline __m512 __DEFAULT_FN_ATTRS512 257 _mm512_setzero_ps(void) 258 { 259 return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 260 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; 261 } 262 263 #define _mm512_setzero _mm512_setzero_ps 264 265 static __inline __m512d __DEFAULT_FN_ATTRS512 266 _mm512_setzero_pd(void) 267 { 268 return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; 269 } 270 271 static __inline __m512 __DEFAULT_FN_ATTRS512 272 _mm512_set1_ps(float __w) 273 { 274 return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, 275 __w, __w, __w, __w, __w, __w, __w, __w }; 276 } 277 278 static __inline __m512d __DEFAULT_FN_ATTRS512 279 _mm512_set1_pd(double __w) 280 { 281 return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; 282 } 283 284 static __inline __m512i __DEFAULT_FN_ATTRS512 285 _mm512_set1_epi8(char __w) 286 { 287 return __extension__ (__m512i)(__v64qi){ 288 __w, __w, __w, __w, __w, __w, __w, __w, 289 __w, __w, __w, __w, __w, __w, __w, __w, 290 __w, __w, __w, __w, __w, __w, __w, __w, 291 __w, __w, __w, __w, __w, __w, __w, __w, 292 __w, __w, __w, __w, __w, __w, __w, __w, 293 __w, __w, __w, __w, __w, __w, __w, __w, 294 __w, __w, __w, __w, __w, __w, __w, __w, 295 
__w, __w, __w, __w, __w, __w, __w, __w }; 296 } 297 298 static __inline __m512i __DEFAULT_FN_ATTRS512 299 _mm512_set1_epi16(short __w) 300 { 301 return __extension__ (__m512i)(__v32hi){ 302 __w, __w, __w, __w, __w, __w, __w, __w, 303 __w, __w, __w, __w, __w, __w, __w, __w, 304 __w, __w, __w, __w, __w, __w, __w, __w, 305 __w, __w, __w, __w, __w, __w, __w, __w }; 306 } 307 308 static __inline __m512i __DEFAULT_FN_ATTRS512 309 _mm512_set1_epi32(int __s) 310 { 311 return __extension__ (__m512i)(__v16si){ 312 __s, __s, __s, __s, __s, __s, __s, __s, 313 __s, __s, __s, __s, __s, __s, __s, __s }; 314 } 315 316 static __inline __m512i __DEFAULT_FN_ATTRS512 317 _mm512_maskz_set1_epi32(__mmask16 __M, int __A) 318 { 319 return (__m512i)__builtin_ia32_selectd_512(__M, 320 (__v16si)_mm512_set1_epi32(__A), 321 (__v16si)_mm512_setzero_si512()); 322 } 323 324 static __inline __m512i __DEFAULT_FN_ATTRS512 325 _mm512_set1_epi64(long long __d) 326 { 327 return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; 328 } 329 330 static __inline __m512i __DEFAULT_FN_ATTRS512 331 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) 332 { 333 return (__m512i)__builtin_ia32_selectq_512(__M, 334 (__v8di)_mm512_set1_epi64(__A), 335 (__v8di)_mm512_setzero_si512()); 336 } 337 338 static __inline__ __m512 __DEFAULT_FN_ATTRS512 339 _mm512_broadcastss_ps(__m128 __A) 340 { 341 return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, 342 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 343 } 344 345 static __inline __m512i __DEFAULT_FN_ATTRS512 346 _mm512_set4_epi32 (int __A, int __B, int __C, int __D) 347 { 348 return __extension__ (__m512i)(__v16si) 349 { __D, __C, __B, __A, __D, __C, __B, __A, 350 __D, __C, __B, __A, __D, __C, __B, __A }; 351 } 352 353 static __inline __m512i __DEFAULT_FN_ATTRS512 354 _mm512_set4_epi64 (long long __A, long long __B, long long __C, 355 long long __D) 356 { 357 return __extension__ (__m512i) (__v8di) 358 { __D, __C, __B, __A, __D, __C, 
__B, __A }; 359 } 360 361 static __inline __m512d __DEFAULT_FN_ATTRS512 362 _mm512_set4_pd (double __A, double __B, double __C, double __D) 363 { 364 return __extension__ (__m512d) 365 { __D, __C, __B, __A, __D, __C, __B, __A }; 366 } 367 368 static __inline __m512 __DEFAULT_FN_ATTRS512 369 _mm512_set4_ps (float __A, float __B, float __C, float __D) 370 { 371 return __extension__ (__m512) 372 { __D, __C, __B, __A, __D, __C, __B, __A, 373 __D, __C, __B, __A, __D, __C, __B, __A }; 374 } 375 376 #define _mm512_setr4_epi32(e0,e1,e2,e3) \ 377 _mm512_set4_epi32((e3),(e2),(e1),(e0)) 378 379 #define _mm512_setr4_epi64(e0,e1,e2,e3) \ 380 _mm512_set4_epi64((e3),(e2),(e1),(e0)) 381 382 #define _mm512_setr4_pd(e0,e1,e2,e3) \ 383 _mm512_set4_pd((e3),(e2),(e1),(e0)) 384 385 #define _mm512_setr4_ps(e0,e1,e2,e3) \ 386 _mm512_set4_ps((e3),(e2),(e1),(e0)) 387 388 static __inline__ __m512d __DEFAULT_FN_ATTRS512 389 _mm512_broadcastsd_pd(__m128d __A) 390 { 391 return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, 392 0, 0, 0, 0, 0, 0, 0, 0); 393 } 394 395 /* Cast between vector types */ 396 397 static __inline __m512d __DEFAULT_FN_ATTRS512 398 _mm512_castpd256_pd512(__m256d __a) 399 { 400 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); 401 } 402 403 static __inline __m512 __DEFAULT_FN_ATTRS512 404 _mm512_castps256_ps512(__m256 __a) 405 { 406 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 407 -1, -1, -1, -1, -1, -1, -1, -1); 408 } 409 410 static __inline __m128d __DEFAULT_FN_ATTRS512 411 _mm512_castpd512_pd128(__m512d __a) 412 { 413 return __builtin_shufflevector(__a, __a, 0, 1); 414 } 415 416 static __inline __m256d __DEFAULT_FN_ATTRS512 417 _mm512_castpd512_pd256 (__m512d __A) 418 { 419 return __builtin_shufflevector(__A, __A, 0, 1, 2, 3); 420 } 421 422 static __inline __m128 __DEFAULT_FN_ATTRS512 423 _mm512_castps512_ps128(__m512 __a) 424 { 425 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); 426 } 427 428 static __inline 
__m256 __DEFAULT_FN_ATTRS512 429 _mm512_castps512_ps256 (__m512 __A) 430 { 431 return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); 432 } 433 434 static __inline __m512 __DEFAULT_FN_ATTRS512 435 _mm512_castpd_ps (__m512d __A) 436 { 437 return (__m512) (__A); 438 } 439 440 static __inline __m512i __DEFAULT_FN_ATTRS512 441 _mm512_castpd_si512 (__m512d __A) 442 { 443 return (__m512i) (__A); 444 } 445 446 static __inline__ __m512d __DEFAULT_FN_ATTRS512 447 _mm512_castpd128_pd512 (__m128d __A) 448 { 449 return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); 450 } 451 452 static __inline __m512d __DEFAULT_FN_ATTRS512 453 _mm512_castps_pd (__m512 __A) 454 { 455 return (__m512d) (__A); 456 } 457 458 static __inline __m512i __DEFAULT_FN_ATTRS512 459 _mm512_castps_si512 (__m512 __A) 460 { 461 return (__m512i) (__A); 462 } 463 464 static __inline__ __m512 __DEFAULT_FN_ATTRS512 465 _mm512_castps128_ps512 (__m128 __A) 466 { 467 return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 468 } 469 470 static __inline__ __m512i __DEFAULT_FN_ATTRS512 471 _mm512_castsi128_si512 (__m128i __A) 472 { 473 return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); 474 } 475 476 static __inline__ __m512i __DEFAULT_FN_ATTRS512 477 _mm512_castsi256_si512 (__m256i __A) 478 { 479 return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1); 480 } 481 482 static __inline __m512 __DEFAULT_FN_ATTRS512 483 _mm512_castsi512_ps (__m512i __A) 484 { 485 return (__m512) (__A); 486 } 487 488 static __inline __m512d __DEFAULT_FN_ATTRS512 489 _mm512_castsi512_pd (__m512i __A) 490 { 491 return (__m512d) (__A); 492 } 493 494 static __inline __m128i __DEFAULT_FN_ATTRS512 495 _mm512_castsi512_si128 (__m512i __A) 496 { 497 return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); 498 } 499 500 static __inline __m256i __DEFAULT_FN_ATTRS512 501 _mm512_castsi512_si256 (__m512i __A) 502 { 503 return 
(__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); 504 } 505 506 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 507 _mm512_int2mask(int __a) 508 { 509 return (__mmask16)__a; 510 } 511 512 static __inline__ int __DEFAULT_FN_ATTRS 513 _mm512_mask2int(__mmask16 __a) 514 { 515 return (int)__a; 516 } 517 518 /// Constructs a 512-bit floating-point vector of [8 x double] from a 519 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 520 /// contain the value of the source vector. The upper 384 bits are set 521 /// to zero. 522 /// 523 /// \headerfile <x86intrin.h> 524 /// 525 /// This intrinsic has no corresponding instruction. 526 /// 527 /// \param __a 528 /// A 128-bit vector of [2 x double]. 529 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits 530 /// contain the value of the parameter. The upper 384 bits are set to zero. 531 static __inline __m512d __DEFAULT_FN_ATTRS512 532 _mm512_zextpd128_pd512(__m128d __a) 533 { 534 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); 535 } 536 537 /// Constructs a 512-bit floating-point vector of [8 x double] from a 538 /// 256-bit floating-point vector of [4 x double]. The lower 256 bits 539 /// contain the value of the source vector. The upper 256 bits are set 540 /// to zero. 541 /// 542 /// \headerfile <x86intrin.h> 543 /// 544 /// This intrinsic has no corresponding instruction. 545 /// 546 /// \param __a 547 /// A 256-bit vector of [4 x double]. 548 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits 549 /// contain the value of the parameter. The upper 256 bits are set to zero. 
550 static __inline __m512d __DEFAULT_FN_ATTRS512 551 _mm512_zextpd256_pd512(__m256d __a) 552 { 553 return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); 554 } 555 556 /// Constructs a 512-bit floating-point vector of [16 x float] from a 557 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 558 /// the value of the source vector. The upper 384 bits are set to zero. 559 /// 560 /// \headerfile <x86intrin.h> 561 /// 562 /// This intrinsic has no corresponding instruction. 563 /// 564 /// \param __a 565 /// A 128-bit vector of [4 x float]. 566 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits 567 /// contain the value of the parameter. The upper 384 bits are set to zero. 568 static __inline __m512 __DEFAULT_FN_ATTRS512 569 _mm512_zextps128_ps512(__m128 __a) 570 { 571 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); 572 } 573 574 /// Constructs a 512-bit floating-point vector of [16 x float] from a 575 /// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain 576 /// the value of the source vector. The upper 256 bits are set to zero. 577 /// 578 /// \headerfile <x86intrin.h> 579 /// 580 /// This intrinsic has no corresponding instruction. 581 /// 582 /// \param __a 583 /// A 256-bit vector of [8 x float]. 584 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits 585 /// contain the value of the parameter. The upper 256 bits are set to zero. 586 static __inline __m512 __DEFAULT_FN_ATTRS512 587 _mm512_zextps256_ps512(__m256 __a) 588 { 589 return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 590 } 591 592 /// Constructs a 512-bit integer vector from a 128-bit integer vector. 593 /// The lower 128 bits contain the value of the source vector. The upper 594 /// 384 bits are set to zero. 
595 /// 596 /// \headerfile <x86intrin.h> 597 /// 598 /// This intrinsic has no corresponding instruction. 599 /// 600 /// \param __a 601 /// A 128-bit integer vector. 602 /// \returns A 512-bit integer vector. The lower 128 bits contain the value of 603 /// the parameter. The upper 384 bits are set to zero. 604 static __inline __m512i __DEFAULT_FN_ATTRS512 605 _mm512_zextsi128_si512(__m128i __a) 606 { 607 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); 608 } 609 610 /// Constructs a 512-bit integer vector from a 256-bit integer vector. 611 /// The lower 256 bits contain the value of the source vector. The upper 612 /// 256 bits are set to zero. 613 /// 614 /// \headerfile <x86intrin.h> 615 /// 616 /// This intrinsic has no corresponding instruction. 617 /// 618 /// \param __a 619 /// A 256-bit integer vector. 620 /// \returns A 512-bit integer vector. The lower 256 bits contain the value of 621 /// the parameter. The upper 256 bits are set to zero. 
622 static __inline __m512i __DEFAULT_FN_ATTRS512 623 _mm512_zextsi256_si512(__m256i __a) 624 { 625 return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); 626 } 627 628 /* Bitwise operators */ 629 static __inline__ __m512i __DEFAULT_FN_ATTRS512 630 _mm512_and_epi32(__m512i __a, __m512i __b) 631 { 632 return (__m512i)((__v16su)__a & (__v16su)__b); 633 } 634 635 static __inline__ __m512i __DEFAULT_FN_ATTRS512 636 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 637 { 638 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 639 (__v16si) _mm512_and_epi32(__a, __b), 640 (__v16si) __src); 641 } 642 643 static __inline__ __m512i __DEFAULT_FN_ATTRS512 644 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) 645 { 646 return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (), 647 __k, __a, __b); 648 } 649 650 static __inline__ __m512i __DEFAULT_FN_ATTRS512 651 _mm512_and_epi64(__m512i __a, __m512i __b) 652 { 653 return (__m512i)((__v8du)__a & (__v8du)__b); 654 } 655 656 static __inline__ __m512i __DEFAULT_FN_ATTRS512 657 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) 658 { 659 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, 660 (__v8di) _mm512_and_epi64(__a, __b), 661 (__v8di) __src); 662 } 663 664 static __inline__ __m512i __DEFAULT_FN_ATTRS512 665 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) 666 { 667 return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (), 668 __k, __a, __b); 669 } 670 671 static __inline__ __m512i __DEFAULT_FN_ATTRS512 672 _mm512_andnot_si512 (__m512i __A, __m512i __B) 673 { 674 return (__m512i)(~(__v8du)__A & (__v8du)__B); 675 } 676 677 static __inline__ __m512i __DEFAULT_FN_ATTRS512 678 _mm512_andnot_epi32 (__m512i __A, __m512i __B) 679 { 680 return (__m512i)(~(__v16su)__A & (__v16su)__B); 681 } 682 683 static __inline__ __m512i __DEFAULT_FN_ATTRS512 684 _mm512_mask_andnot_epi32(__m512i 
__W, __mmask16 __U, __m512i __A, __m512i __B) 685 { 686 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 687 (__v16si)_mm512_andnot_epi32(__A, __B), 688 (__v16si)__W); 689 } 690 691 static __inline__ __m512i __DEFAULT_FN_ATTRS512 692 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) 693 { 694 return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(), 695 __U, __A, __B); 696 } 697 698 static __inline__ __m512i __DEFAULT_FN_ATTRS512 699 _mm512_andnot_epi64(__m512i __A, __m512i __B) 700 { 701 return (__m512i)(~(__v8du)__A & (__v8du)__B); 702 } 703 704 static __inline__ __m512i __DEFAULT_FN_ATTRS512 705 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 706 { 707 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 708 (__v8di)_mm512_andnot_epi64(__A, __B), 709 (__v8di)__W); 710 } 711 712 static __inline__ __m512i __DEFAULT_FN_ATTRS512 713 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) 714 { 715 return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(), 716 __U, __A, __B); 717 } 718 719 static __inline__ __m512i __DEFAULT_FN_ATTRS512 720 _mm512_or_epi32(__m512i __a, __m512i __b) 721 { 722 return (__m512i)((__v16su)__a | (__v16su)__b); 723 } 724 725 static __inline__ __m512i __DEFAULT_FN_ATTRS512 726 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 727 { 728 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 729 (__v16si)_mm512_or_epi32(__a, __b), 730 (__v16si)__src); 731 } 732 733 static __inline__ __m512i __DEFAULT_FN_ATTRS512 734 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) 735 { 736 return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); 737 } 738 739 static __inline__ __m512i __DEFAULT_FN_ATTRS512 740 _mm512_or_epi64(__m512i __a, __m512i __b) 741 { 742 return (__m512i)((__v8du)__a | (__v8du)__b); 743 } 744 745 static __inline__ __m512i __DEFAULT_FN_ATTRS512 746 _mm512_mask_or_epi64(__m512i __src, 
__mmask8 __k, __m512i __a, __m512i __b) 747 { 748 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, 749 (__v8di)_mm512_or_epi64(__a, __b), 750 (__v8di)__src); 751 } 752 753 static __inline__ __m512i __DEFAULT_FN_ATTRS512 754 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) 755 { 756 return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); 757 } 758 759 static __inline__ __m512i __DEFAULT_FN_ATTRS512 760 _mm512_xor_epi32(__m512i __a, __m512i __b) 761 { 762 return (__m512i)((__v16su)__a ^ (__v16su)__b); 763 } 764 765 static __inline__ __m512i __DEFAULT_FN_ATTRS512 766 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 767 { 768 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 769 (__v16si)_mm512_xor_epi32(__a, __b), 770 (__v16si)__src); 771 } 772 773 static __inline__ __m512i __DEFAULT_FN_ATTRS512 774 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) 775 { 776 return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); 777 } 778 779 static __inline__ __m512i __DEFAULT_FN_ATTRS512 780 _mm512_xor_epi64(__m512i __a, __m512i __b) 781 { 782 return (__m512i)((__v8du)__a ^ (__v8du)__b); 783 } 784 785 static __inline__ __m512i __DEFAULT_FN_ATTRS512 786 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) 787 { 788 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, 789 (__v8di)_mm512_xor_epi64(__a, __b), 790 (__v8di)__src); 791 } 792 793 static __inline__ __m512i __DEFAULT_FN_ATTRS512 794 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) 795 { 796 return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); 797 } 798 799 static __inline__ __m512i __DEFAULT_FN_ATTRS512 800 _mm512_and_si512(__m512i __a, __m512i __b) 801 { 802 return (__m512i)((__v8du)__a & (__v8du)__b); 803 } 804 805 static __inline__ __m512i __DEFAULT_FN_ATTRS512 806 _mm512_or_si512(__m512i __a, __m512i __b) 807 { 808 return 
(__m512i)((__v8du)__a | (__v8du)__b); 809 } 810 811 static __inline__ __m512i __DEFAULT_FN_ATTRS512 812 _mm512_xor_si512(__m512i __a, __m512i __b) 813 { 814 return (__m512i)((__v8du)__a ^ (__v8du)__b); 815 } 816 817 /* Arithmetic */ 818 819 static __inline __m512d __DEFAULT_FN_ATTRS512 820 _mm512_add_pd(__m512d __a, __m512d __b) 821 { 822 return (__m512d)((__v8df)__a + (__v8df)__b); 823 } 824 825 static __inline __m512 __DEFAULT_FN_ATTRS512 826 _mm512_add_ps(__m512 __a, __m512 __b) 827 { 828 return (__m512)((__v16sf)__a + (__v16sf)__b); 829 } 830 831 static __inline __m512d __DEFAULT_FN_ATTRS512 832 _mm512_mul_pd(__m512d __a, __m512d __b) 833 { 834 return (__m512d)((__v8df)__a * (__v8df)__b); 835 } 836 837 static __inline __m512 __DEFAULT_FN_ATTRS512 838 _mm512_mul_ps(__m512 __a, __m512 __b) 839 { 840 return (__m512)((__v16sf)__a * (__v16sf)__b); 841 } 842 843 static __inline __m512d __DEFAULT_FN_ATTRS512 844 _mm512_sub_pd(__m512d __a, __m512d __b) 845 { 846 return (__m512d)((__v8df)__a - (__v8df)__b); 847 } 848 849 static __inline __m512 __DEFAULT_FN_ATTRS512 850 _mm512_sub_ps(__m512 __a, __m512 __b) 851 { 852 return (__m512)((__v16sf)__a - (__v16sf)__b); 853 } 854 855 static __inline__ __m512i __DEFAULT_FN_ATTRS512 856 _mm512_add_epi64 (__m512i __A, __m512i __B) 857 { 858 return (__m512i) ((__v8du) __A + (__v8du) __B); 859 } 860 861 static __inline__ __m512i __DEFAULT_FN_ATTRS512 862 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 863 { 864 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 865 (__v8di)_mm512_add_epi64(__A, __B), 866 (__v8di)__W); 867 } 868 869 static __inline__ __m512i __DEFAULT_FN_ATTRS512 870 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) 871 { 872 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 873 (__v8di)_mm512_add_epi64(__A, __B), 874 (__v8di)_mm512_setzero_si512()); 875 } 876 877 static __inline__ __m512i __DEFAULT_FN_ATTRS512 878 _mm512_sub_epi64 (__m512i __A, __m512i __B) 879 { 
880 return (__m512i) ((__v8du) __A - (__v8du) __B); 881 } 882 883 static __inline__ __m512i __DEFAULT_FN_ATTRS512 884 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 885 { 886 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 887 (__v8di)_mm512_sub_epi64(__A, __B), 888 (__v8di)__W); 889 } 890 891 static __inline__ __m512i __DEFAULT_FN_ATTRS512 892 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) 893 { 894 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 895 (__v8di)_mm512_sub_epi64(__A, __B), 896 (__v8di)_mm512_setzero_si512()); 897 } 898 899 static __inline__ __m512i __DEFAULT_FN_ATTRS512 900 _mm512_add_epi32 (__m512i __A, __m512i __B) 901 { 902 return (__m512i) ((__v16su) __A + (__v16su) __B); 903 } 904 905 static __inline__ __m512i __DEFAULT_FN_ATTRS512 906 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 907 { 908 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 909 (__v16si)_mm512_add_epi32(__A, __B), 910 (__v16si)__W); 911 } 912 913 static __inline__ __m512i __DEFAULT_FN_ATTRS512 914 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 915 { 916 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 917 (__v16si)_mm512_add_epi32(__A, __B), 918 (__v16si)_mm512_setzero_si512()); 919 } 920 921 static __inline__ __m512i __DEFAULT_FN_ATTRS512 922 _mm512_sub_epi32 (__m512i __A, __m512i __B) 923 { 924 return (__m512i) ((__v16su) __A - (__v16su) __B); 925 } 926 927 static __inline__ __m512i __DEFAULT_FN_ATTRS512 928 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 929 { 930 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 931 (__v16si)_mm512_sub_epi32(__A, __B), 932 (__v16si)__W); 933 } 934 935 static __inline__ __m512i __DEFAULT_FN_ATTRS512 936 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) 937 { 938 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 939 (__v16si)_mm512_sub_epi32(__A, __B), 940 
(__v16si)_mm512_setzero_si512()); 941 } 942 943 #define _mm512_max_round_pd(A, B, R) \ 944 ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ 945 (__v8df)(__m512d)(B), (int)(R))) 946 947 #define _mm512_mask_max_round_pd(W, U, A, B, R) \ 948 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 949 (__v8df)_mm512_max_round_pd((A), (B), (R)), \ 950 (__v8df)(W))) 951 952 #define _mm512_maskz_max_round_pd(U, A, B, R) \ 953 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 954 (__v8df)_mm512_max_round_pd((A), (B), (R)), \ 955 (__v8df)_mm512_setzero_pd())) 956 957 static __inline__ __m512d __DEFAULT_FN_ATTRS512 958 _mm512_max_pd(__m512d __A, __m512d __B) 959 { 960 return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B, 961 _MM_FROUND_CUR_DIRECTION); 962 } 963 964 static __inline__ __m512d __DEFAULT_FN_ATTRS512 965 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 966 { 967 return (__m512d)__builtin_ia32_selectpd_512(__U, 968 (__v8df)_mm512_max_pd(__A, __B), 969 (__v8df)__W); 970 } 971 972 static __inline__ __m512d __DEFAULT_FN_ATTRS512 973 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) 974 { 975 return (__m512d)__builtin_ia32_selectpd_512(__U, 976 (__v8df)_mm512_max_pd(__A, __B), 977 (__v8df)_mm512_setzero_pd()); 978 } 979 980 #define _mm512_max_round_ps(A, B, R) \ 981 ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ 982 (__v16sf)(__m512)(B), (int)(R))) 983 984 #define _mm512_mask_max_round_ps(W, U, A, B, R) \ 985 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 986 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ 987 (__v16sf)(W))) 988 989 #define _mm512_maskz_max_round_ps(U, A, B, R) \ 990 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 991 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ 992 (__v16sf)_mm512_setzero_ps())) 993 994 static __inline__ __m512 __DEFAULT_FN_ATTRS512 995 _mm512_max_ps(__m512 __A, __m512 __B) 996 { 997 return (__m512) __builtin_ia32_maxps512((__v16sf) __A, 
(__v16sf) __B, 998 _MM_FROUND_CUR_DIRECTION); 999 } 1000 1001 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1002 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 1003 { 1004 return (__m512)__builtin_ia32_selectps_512(__U, 1005 (__v16sf)_mm512_max_ps(__A, __B), 1006 (__v16sf)__W); 1007 } 1008 1009 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1010 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) 1011 { 1012 return (__m512)__builtin_ia32_selectps_512(__U, 1013 (__v16sf)_mm512_max_ps(__A, __B), 1014 (__v16sf)_mm512_setzero_ps()); 1015 } 1016 1017 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1018 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1019 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, 1020 (__v4sf) __B, 1021 (__v4sf) __W, 1022 (__mmask8) __U, 1023 _MM_FROUND_CUR_DIRECTION); 1024 } 1025 1026 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1027 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1028 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, 1029 (__v4sf) __B, 1030 (__v4sf) _mm_setzero_ps (), 1031 (__mmask8) __U, 1032 _MM_FROUND_CUR_DIRECTION); 1033 } 1034 1035 #define _mm_max_round_ss(A, B, R) \ 1036 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1037 (__v4sf)(__m128)(B), \ 1038 (__v4sf)_mm_setzero_ps(), \ 1039 (__mmask8)-1, (int)(R))) 1040 1041 #define _mm_mask_max_round_ss(W, U, A, B, R) \ 1042 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1043 (__v4sf)(__m128)(B), \ 1044 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1045 (int)(R))) 1046 1047 #define _mm_maskz_max_round_ss(U, A, B, R) \ 1048 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1049 (__v4sf)(__m128)(B), \ 1050 (__v4sf)_mm_setzero_ps(), \ 1051 (__mmask8)(U), (int)(R))) 1052 1053 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1054 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1055 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) 
__A, 1056 (__v2df) __B, 1057 (__v2df) __W, 1058 (__mmask8) __U, 1059 _MM_FROUND_CUR_DIRECTION); 1060 } 1061 1062 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1063 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1064 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, 1065 (__v2df) __B, 1066 (__v2df) _mm_setzero_pd (), 1067 (__mmask8) __U, 1068 _MM_FROUND_CUR_DIRECTION); 1069 } 1070 1071 #define _mm_max_round_sd(A, B, R) \ 1072 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1073 (__v2df)(__m128d)(B), \ 1074 (__v2df)_mm_setzero_pd(), \ 1075 (__mmask8)-1, (int)(R))) 1076 1077 #define _mm_mask_max_round_sd(W, U, A, B, R) \ 1078 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1079 (__v2df)(__m128d)(B), \ 1080 (__v2df)(__m128d)(W), \ 1081 (__mmask8)(U), (int)(R))) 1082 1083 #define _mm_maskz_max_round_sd(U, A, B, R) \ 1084 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1085 (__v2df)(__m128d)(B), \ 1086 (__v2df)_mm_setzero_pd(), \ 1087 (__mmask8)(U), (int)(R))) 1088 1089 static __inline __m512i 1090 __DEFAULT_FN_ATTRS512 1091 _mm512_max_epi32(__m512i __A, __m512i __B) 1092 { 1093 return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B); 1094 } 1095 1096 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1097 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1098 { 1099 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1100 (__v16si)_mm512_max_epi32(__A, __B), 1101 (__v16si)__W); 1102 } 1103 1104 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1105 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1106 { 1107 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1108 (__v16si)_mm512_max_epi32(__A, __B), 1109 (__v16si)_mm512_setzero_si512()); 1110 } 1111 1112 static __inline __m512i __DEFAULT_FN_ATTRS512 1113 _mm512_max_epu32(__m512i __A, __m512i __B) 1114 { 1115 return (__m512i)__builtin_elementwise_max((__v16su)__A, 
(__v16su)__B); 1116 } 1117 1118 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1119 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1120 { 1121 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1122 (__v16si)_mm512_max_epu32(__A, __B), 1123 (__v16si)__W); 1124 } 1125 1126 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1127 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1128 { 1129 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1130 (__v16si)_mm512_max_epu32(__A, __B), 1131 (__v16si)_mm512_setzero_si512()); 1132 } 1133 1134 static __inline __m512i __DEFAULT_FN_ATTRS512 1135 _mm512_max_epi64(__m512i __A, __m512i __B) 1136 { 1137 return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B); 1138 } 1139 1140 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1141 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1142 { 1143 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1144 (__v8di)_mm512_max_epi64(__A, __B), 1145 (__v8di)__W); 1146 } 1147 1148 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1149 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1150 { 1151 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1152 (__v8di)_mm512_max_epi64(__A, __B), 1153 (__v8di)_mm512_setzero_si512()); 1154 } 1155 1156 static __inline __m512i __DEFAULT_FN_ATTRS512 1157 _mm512_max_epu64(__m512i __A, __m512i __B) 1158 { 1159 return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B); 1160 } 1161 1162 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1163 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1164 { 1165 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1166 (__v8di)_mm512_max_epu64(__A, __B), 1167 (__v8di)__W); 1168 } 1169 1170 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1171 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1172 { 1173 return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1174 (__v8di)_mm512_max_epu64(__A, __B), 1175 (__v8di)_mm512_setzero_si512()); 1176 } 1177 1178 #define _mm512_min_round_pd(A, B, R) \ 1179 ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ 1180 (__v8df)(__m512d)(B), (int)(R))) 1181 1182 #define _mm512_mask_min_round_pd(W, U, A, B, R) \ 1183 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1184 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1185 (__v8df)(W))) 1186 1187 #define _mm512_maskz_min_round_pd(U, A, B, R) \ 1188 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1189 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1190 (__v8df)_mm512_setzero_pd())) 1191 1192 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1193 _mm512_min_pd(__m512d __A, __m512d __B) 1194 { 1195 return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, 1196 _MM_FROUND_CUR_DIRECTION); 1197 } 1198 1199 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1200 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 1201 { 1202 return (__m512d)__builtin_ia32_selectpd_512(__U, 1203 (__v8df)_mm512_min_pd(__A, __B), 1204 (__v8df)__W); 1205 } 1206 1207 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1208 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) 1209 { 1210 return (__m512d)__builtin_ia32_selectpd_512(__U, 1211 (__v8df)_mm512_min_pd(__A, __B), 1212 (__v8df)_mm512_setzero_pd()); 1213 } 1214 1215 #define _mm512_min_round_ps(A, B, R) \ 1216 ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ 1217 (__v16sf)(__m512)(B), (int)(R))) 1218 1219 #define _mm512_mask_min_round_ps(W, U, A, B, R) \ 1220 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1221 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1222 (__v16sf)(W))) 1223 1224 #define _mm512_maskz_min_round_ps(U, A, B, R) \ 1225 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1226 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1227 (__v16sf)_mm512_setzero_ps())) 1228 1229 static 
__inline__ __m512 __DEFAULT_FN_ATTRS512 1230 _mm512_min_ps(__m512 __A, __m512 __B) 1231 { 1232 return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, 1233 _MM_FROUND_CUR_DIRECTION); 1234 } 1235 1236 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1237 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 1238 { 1239 return (__m512)__builtin_ia32_selectps_512(__U, 1240 (__v16sf)_mm512_min_ps(__A, __B), 1241 (__v16sf)__W); 1242 } 1243 1244 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1245 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) 1246 { 1247 return (__m512)__builtin_ia32_selectps_512(__U, 1248 (__v16sf)_mm512_min_ps(__A, __B), 1249 (__v16sf)_mm512_setzero_ps()); 1250 } 1251 1252 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1253 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1254 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1255 (__v4sf) __B, 1256 (__v4sf) __W, 1257 (__mmask8) __U, 1258 _MM_FROUND_CUR_DIRECTION); 1259 } 1260 1261 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1262 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1263 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1264 (__v4sf) __B, 1265 (__v4sf) _mm_setzero_ps (), 1266 (__mmask8) __U, 1267 _MM_FROUND_CUR_DIRECTION); 1268 } 1269 1270 #define _mm_min_round_ss(A, B, R) \ 1271 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1272 (__v4sf)(__m128)(B), \ 1273 (__v4sf)_mm_setzero_ps(), \ 1274 (__mmask8)-1, (int)(R))) 1275 1276 #define _mm_mask_min_round_ss(W, U, A, B, R) \ 1277 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1278 (__v4sf)(__m128)(B), \ 1279 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1280 (int)(R))) 1281 1282 #define _mm_maskz_min_round_ss(U, A, B, R) \ 1283 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1284 (__v4sf)(__m128)(B), \ 1285 (__v4sf)_mm_setzero_ps(), \ 1286 (__mmask8)(U), (int)(R))) 1287 1288 static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 1289 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1290 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1291 (__v2df) __B, 1292 (__v2df) __W, 1293 (__mmask8) __U, 1294 _MM_FROUND_CUR_DIRECTION); 1295 } 1296 1297 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1298 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1299 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1300 (__v2df) __B, 1301 (__v2df) _mm_setzero_pd (), 1302 (__mmask8) __U, 1303 _MM_FROUND_CUR_DIRECTION); 1304 } 1305 1306 #define _mm_min_round_sd(A, B, R) \ 1307 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1308 (__v2df)(__m128d)(B), \ 1309 (__v2df)_mm_setzero_pd(), \ 1310 (__mmask8)-1, (int)(R))) 1311 1312 #define _mm_mask_min_round_sd(W, U, A, B, R) \ 1313 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1314 (__v2df)(__m128d)(B), \ 1315 (__v2df)(__m128d)(W), \ 1316 (__mmask8)(U), (int)(R))) 1317 1318 #define _mm_maskz_min_round_sd(U, A, B, R) \ 1319 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1320 (__v2df)(__m128d)(B), \ 1321 (__v2df)_mm_setzero_pd(), \ 1322 (__mmask8)(U), (int)(R))) 1323 1324 static __inline __m512i 1325 __DEFAULT_FN_ATTRS512 1326 _mm512_min_epi32(__m512i __A, __m512i __B) 1327 { 1328 return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B); 1329 } 1330 1331 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1332 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1333 { 1334 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1335 (__v16si)_mm512_min_epi32(__A, __B), 1336 (__v16si)__W); 1337 } 1338 1339 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1340 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1341 { 1342 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1343 (__v16si)_mm512_min_epi32(__A, __B), 1344 (__v16si)_mm512_setzero_si512()); 1345 } 1346 1347 static 
__inline __m512i __DEFAULT_FN_ATTRS512 1348 _mm512_min_epu32(__m512i __A, __m512i __B) 1349 { 1350 return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B); 1351 } 1352 1353 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1354 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1355 { 1356 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1357 (__v16si)_mm512_min_epu32(__A, __B), 1358 (__v16si)__W); 1359 } 1360 1361 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1362 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1363 { 1364 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1365 (__v16si)_mm512_min_epu32(__A, __B), 1366 (__v16si)_mm512_setzero_si512()); 1367 } 1368 1369 static __inline __m512i __DEFAULT_FN_ATTRS512 1370 _mm512_min_epi64(__m512i __A, __m512i __B) 1371 { 1372 return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B); 1373 } 1374 1375 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1376 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1377 { 1378 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1379 (__v8di)_mm512_min_epi64(__A, __B), 1380 (__v8di)__W); 1381 } 1382 1383 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1384 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1385 { 1386 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1387 (__v8di)_mm512_min_epi64(__A, __B), 1388 (__v8di)_mm512_setzero_si512()); 1389 } 1390 1391 static __inline __m512i __DEFAULT_FN_ATTRS512 1392 _mm512_min_epu64(__m512i __A, __m512i __B) 1393 { 1394 return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B); 1395 } 1396 1397 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1398 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1399 { 1400 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1401 (__v8di)_mm512_min_epu64(__A, __B), 1402 (__v8di)__W); 1403 } 1404 1405 static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 1406 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1407 { 1408 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1409 (__v8di)_mm512_min_epu64(__A, __B), 1410 (__v8di)_mm512_setzero_si512()); 1411 } 1412 1413 static __inline __m512i __DEFAULT_FN_ATTRS512 1414 _mm512_mul_epi32(__m512i __X, __m512i __Y) 1415 { 1416 return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); 1417 } 1418 1419 static __inline __m512i __DEFAULT_FN_ATTRS512 1420 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1421 { 1422 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1423 (__v8di)_mm512_mul_epi32(__X, __Y), 1424 (__v8di)__W); 1425 } 1426 1427 static __inline __m512i __DEFAULT_FN_ATTRS512 1428 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) 1429 { 1430 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1431 (__v8di)_mm512_mul_epi32(__X, __Y), 1432 (__v8di)_mm512_setzero_si512 ()); 1433 } 1434 1435 static __inline __m512i __DEFAULT_FN_ATTRS512 1436 _mm512_mul_epu32(__m512i __X, __m512i __Y) 1437 { 1438 return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); 1439 } 1440 1441 static __inline __m512i __DEFAULT_FN_ATTRS512 1442 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1443 { 1444 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1445 (__v8di)_mm512_mul_epu32(__X, __Y), 1446 (__v8di)__W); 1447 } 1448 1449 static __inline __m512i __DEFAULT_FN_ATTRS512 1450 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) 1451 { 1452 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1453 (__v8di)_mm512_mul_epu32(__X, __Y), 1454 (__v8di)_mm512_setzero_si512 ()); 1455 } 1456 1457 static __inline __m512i __DEFAULT_FN_ATTRS512 1458 _mm512_mullo_epi32 (__m512i __A, __m512i __B) 1459 { 1460 return (__m512i) ((__v16su) __A * (__v16su) __B); 1461 } 1462 1463 static __inline __m512i __DEFAULT_FN_ATTRS512 1464 
_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) 1465 { 1466 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1467 (__v16si)_mm512_mullo_epi32(__A, __B), 1468 (__v16si)_mm512_setzero_si512()); 1469 } 1470 1471 static __inline __m512i __DEFAULT_FN_ATTRS512 1472 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1473 { 1474 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1475 (__v16si)_mm512_mullo_epi32(__A, __B), 1476 (__v16si)__W); 1477 } 1478 1479 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1480 _mm512_mullox_epi64 (__m512i __A, __m512i __B) { 1481 return (__m512i) ((__v8du) __A * (__v8du) __B); 1482 } 1483 1484 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1485 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { 1486 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1487 (__v8di)_mm512_mullox_epi64(__A, __B), 1488 (__v8di)__W); 1489 } 1490 1491 #define _mm512_sqrt_round_pd(A, R) \ 1492 ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))) 1493 1494 #define _mm512_mask_sqrt_round_pd(W, U, A, R) \ 1495 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1496 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1497 (__v8df)(__m512d)(W))) 1498 1499 #define _mm512_maskz_sqrt_round_pd(U, A, R) \ 1500 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1501 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1502 (__v8df)_mm512_setzero_pd())) 1503 1504 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1505 _mm512_sqrt_pd(__m512d __A) 1506 { 1507 return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, 1508 _MM_FROUND_CUR_DIRECTION); 1509 } 1510 1511 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1512 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) 1513 { 1514 return (__m512d)__builtin_ia32_selectpd_512(__U, 1515 (__v8df)_mm512_sqrt_pd(__A), 1516 (__v8df)__W); 1517 } 1518 1519 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1520 _mm512_maskz_sqrt_pd 
(__mmask8 __U, __m512d __A) 1521 { 1522 return (__m512d)__builtin_ia32_selectpd_512(__U, 1523 (__v8df)_mm512_sqrt_pd(__A), 1524 (__v8df)_mm512_setzero_pd()); 1525 } 1526 1527 #define _mm512_sqrt_round_ps(A, R) \ 1528 ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))) 1529 1530 #define _mm512_mask_sqrt_round_ps(W, U, A, R) \ 1531 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1532 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1533 (__v16sf)(__m512)(W))) 1534 1535 #define _mm512_maskz_sqrt_round_ps(U, A, R) \ 1536 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1537 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1538 (__v16sf)_mm512_setzero_ps())) 1539 1540 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1541 _mm512_sqrt_ps(__m512 __A) 1542 { 1543 return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, 1544 _MM_FROUND_CUR_DIRECTION); 1545 } 1546 1547 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1548 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) 1549 { 1550 return (__m512)__builtin_ia32_selectps_512(__U, 1551 (__v16sf)_mm512_sqrt_ps(__A), 1552 (__v16sf)__W); 1553 } 1554 1555 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1556 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) 1557 { 1558 return (__m512)__builtin_ia32_selectps_512(__U, 1559 (__v16sf)_mm512_sqrt_ps(__A), 1560 (__v16sf)_mm512_setzero_ps()); 1561 } 1562 1563 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1564 _mm512_rsqrt14_pd(__m512d __A) 1565 { 1566 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1567 (__v8df) 1568 _mm512_setzero_pd (), 1569 (__mmask8) -1);} 1570 1571 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1572 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1573 { 1574 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1575 (__v8df) __W, 1576 (__mmask8) __U); 1577 } 1578 1579 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1580 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) 1581 { 1582 return (__m512d) 
__builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1583 (__v8df) 1584 _mm512_setzero_pd (), 1585 (__mmask8) __U); 1586 } 1587 1588 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1589 _mm512_rsqrt14_ps(__m512 __A) 1590 { 1591 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1592 (__v16sf) 1593 _mm512_setzero_ps (), 1594 (__mmask16) -1); 1595 } 1596 1597 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1598 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1599 { 1600 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1601 (__v16sf) __W, 1602 (__mmask16) __U); 1603 } 1604 1605 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1606 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) 1607 { 1608 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1609 (__v16sf) 1610 _mm512_setzero_ps (), 1611 (__mmask16) __U); 1612 } 1613 1614 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1615 _mm_rsqrt14_ss(__m128 __A, __m128 __B) 1616 { 1617 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1618 (__v4sf) __B, 1619 (__v4sf) 1620 _mm_setzero_ps (), 1621 (__mmask8) -1); 1622 } 1623 1624 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1625 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1626 { 1627 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1628 (__v4sf) __B, 1629 (__v4sf) __W, 1630 (__mmask8) __U); 1631 } 1632 1633 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1634 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1635 { 1636 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1637 (__v4sf) __B, 1638 (__v4sf) _mm_setzero_ps (), 1639 (__mmask8) __U); 1640 } 1641 1642 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1643 _mm_rsqrt14_sd(__m128d __A, __m128d __B) 1644 { 1645 return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, 1646 (__v2df) __B, 1647 (__v2df) 1648 _mm_setzero_pd (), 1649 (__mmask8) -1); 1650 } 1651 1652 static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 1653 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1654 { 1655 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1656 (__v2df) __B, 1657 (__v2df) __W, 1658 (__mmask8) __U); 1659 } 1660 1661 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1662 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1663 { 1664 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1665 (__v2df) __B, 1666 (__v2df) _mm_setzero_pd (), 1667 (__mmask8) __U); 1668 } 1669 1670 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1671 _mm512_rcp14_pd(__m512d __A) 1672 { 1673 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1674 (__v8df) 1675 _mm512_setzero_pd (), 1676 (__mmask8) -1); 1677 } 1678 1679 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1680 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1681 { 1682 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1683 (__v8df) __W, 1684 (__mmask8) __U); 1685 } 1686 1687 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1688 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) 1689 { 1690 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1691 (__v8df) 1692 _mm512_setzero_pd (), 1693 (__mmask8) __U); 1694 } 1695 1696 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1697 _mm512_rcp14_ps(__m512 __A) 1698 { 1699 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1700 (__v16sf) 1701 _mm512_setzero_ps (), 1702 (__mmask16) -1); 1703 } 1704 1705 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1706 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1707 { 1708 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1709 (__v16sf) __W, 1710 (__mmask16) __U); 1711 } 1712 1713 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1714 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) 1715 { 1716 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1717 (__v16sf) 1718 _mm512_setzero_ps (), 1719 (__mmask16) 
__U); 1720 } 1721 1722 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1723 _mm_rcp14_ss(__m128 __A, __m128 __B) 1724 { 1725 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1726 (__v4sf) __B, 1727 (__v4sf) 1728 _mm_setzero_ps (), 1729 (__mmask8) -1); 1730 } 1731 1732 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1733 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1734 { 1735 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1736 (__v4sf) __B, 1737 (__v4sf) __W, 1738 (__mmask8) __U); 1739 } 1740 1741 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1742 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1743 { 1744 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1745 (__v4sf) __B, 1746 (__v4sf) _mm_setzero_ps (), 1747 (__mmask8) __U); 1748 } 1749 1750 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1751 _mm_rcp14_sd(__m128d __A, __m128d __B) 1752 { 1753 return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, 1754 (__v2df) __B, 1755 (__v2df) 1756 _mm_setzero_pd (), 1757 (__mmask8) -1); 1758 } 1759 1760 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1761 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1762 { 1763 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1764 (__v2df) __B, 1765 (__v2df) __W, 1766 (__mmask8) __U); 1767 } 1768 1769 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1770 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1771 { 1772 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1773 (__v2df) __B, 1774 (__v2df) _mm_setzero_pd (), 1775 (__mmask8) __U); 1776 } 1777 1778 static __inline __m512 __DEFAULT_FN_ATTRS512 1779 _mm512_floor_ps(__m512 __A) 1780 { 1781 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1782 _MM_FROUND_FLOOR, 1783 (__v16sf) __A, (unsigned short)-1, 1784 _MM_FROUND_CUR_DIRECTION); 1785 } 1786 1787 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1788 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, 
__m512 __A) 1789 { 1790 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1791 _MM_FROUND_FLOOR, 1792 (__v16sf) __W, __U, 1793 _MM_FROUND_CUR_DIRECTION); 1794 } 1795 1796 static __inline __m512d __DEFAULT_FN_ATTRS512 1797 _mm512_floor_pd(__m512d __A) 1798 { 1799 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1800 _MM_FROUND_FLOOR, 1801 (__v8df) __A, (unsigned char)-1, 1802 _MM_FROUND_CUR_DIRECTION); 1803 } 1804 1805 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1806 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) 1807 { 1808 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1809 _MM_FROUND_FLOOR, 1810 (__v8df) __W, __U, 1811 _MM_FROUND_CUR_DIRECTION); 1812 } 1813 1814 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1815 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) 1816 { 1817 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1818 _MM_FROUND_CEIL, 1819 (__v16sf) __W, __U, 1820 _MM_FROUND_CUR_DIRECTION); 1821 } 1822 1823 static __inline __m512 __DEFAULT_FN_ATTRS512 1824 _mm512_ceil_ps(__m512 __A) 1825 { 1826 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1827 _MM_FROUND_CEIL, 1828 (__v16sf) __A, (unsigned short)-1, 1829 _MM_FROUND_CUR_DIRECTION); 1830 } 1831 1832 static __inline __m512d __DEFAULT_FN_ATTRS512 1833 _mm512_ceil_pd(__m512d __A) 1834 { 1835 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1836 _MM_FROUND_CEIL, 1837 (__v8df) __A, (unsigned char)-1, 1838 _MM_FROUND_CUR_DIRECTION); 1839 } 1840 1841 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1842 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) 1843 { 1844 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1845 _MM_FROUND_CEIL, 1846 (__v8df) __W, __U, 1847 _MM_FROUND_CUR_DIRECTION); 1848 } 1849 1850 static __inline __m512i __DEFAULT_FN_ATTRS512 1851 _mm512_abs_epi64(__m512i __A) 1852 { 1853 return (__m512i)__builtin_elementwise_abs((__v8di)__A); 1854 } 1855 1856 
static __inline__ __m512i __DEFAULT_FN_ATTRS512 1857 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 1858 { 1859 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1860 (__v8di)_mm512_abs_epi64(__A), 1861 (__v8di)__W); 1862 } 1863 1864 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1865 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) 1866 { 1867 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1868 (__v8di)_mm512_abs_epi64(__A), 1869 (__v8di)_mm512_setzero_si512()); 1870 } 1871 1872 static __inline __m512i __DEFAULT_FN_ATTRS512 1873 _mm512_abs_epi32(__m512i __A) 1874 { 1875 return (__m512i)__builtin_elementwise_abs((__v16si) __A); 1876 } 1877 1878 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1879 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 1880 { 1881 return (__m512i)__builtin_ia32_selectd_512(__U, 1882 (__v16si)_mm512_abs_epi32(__A), 1883 (__v16si)__W); 1884 } 1885 1886 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1887 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) 1888 { 1889 return (__m512i)__builtin_ia32_selectd_512(__U, 1890 (__v16si)_mm512_abs_epi32(__A), 1891 (__v16si)_mm512_setzero_si512()); 1892 } 1893 1894 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1895 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1896 __A = _mm_add_ss(__A, __B); 1897 return __builtin_ia32_selectss_128(__U, __A, __W); 1898 } 1899 1900 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1901 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1902 __A = _mm_add_ss(__A, __B); 1903 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 1904 } 1905 1906 #define _mm_add_round_ss(A, B, R) \ 1907 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1908 (__v4sf)(__m128)(B), \ 1909 (__v4sf)_mm_setzero_ps(), \ 1910 (__mmask8)-1, (int)(R))) 1911 1912 #define _mm_mask_add_round_ss(W, U, A, B, R) \ 1913 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1914 (__v4sf)(__m128)(B), 
\ 1915 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1916 (int)(R))) 1917 1918 #define _mm_maskz_add_round_ss(U, A, B, R) \ 1919 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1920 (__v4sf)(__m128)(B), \ 1921 (__v4sf)_mm_setzero_ps(), \ 1922 (__mmask8)(U), (int)(R))) 1923 1924 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1925 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1926 __A = _mm_add_sd(__A, __B); 1927 return __builtin_ia32_selectsd_128(__U, __A, __W); 1928 } 1929 1930 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1931 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1932 __A = _mm_add_sd(__A, __B); 1933 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 1934 } 1935 #define _mm_add_round_sd(A, B, R) \ 1936 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1937 (__v2df)(__m128d)(B), \ 1938 (__v2df)_mm_setzero_pd(), \ 1939 (__mmask8)-1, (int)(R))) 1940 1941 #define _mm_mask_add_round_sd(W, U, A, B, R) \ 1942 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1943 (__v2df)(__m128d)(B), \ 1944 (__v2df)(__m128d)(W), \ 1945 (__mmask8)(U), (int)(R))) 1946 1947 #define _mm_maskz_add_round_sd(U, A, B, R) \ 1948 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1949 (__v2df)(__m128d)(B), \ 1950 (__v2df)_mm_setzero_pd(), \ 1951 (__mmask8)(U), (int)(R))) 1952 1953 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1954 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 1955 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1956 (__v8df)_mm512_add_pd(__A, __B), 1957 (__v8df)__W); 1958 } 1959 1960 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1961 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { 1962 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1963 (__v8df)_mm512_add_pd(__A, __B), 1964 (__v8df)_mm512_setzero_pd()); 1965 } 1966 1967 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1968 _mm512_mask_add_ps(__m512 
__W, __mmask16 __U, __m512 __A, __m512 __B) { 1969 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1970 (__v16sf)_mm512_add_ps(__A, __B), 1971 (__v16sf)__W); 1972 } 1973 1974 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1975 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { 1976 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1977 (__v16sf)_mm512_add_ps(__A, __B), 1978 (__v16sf)_mm512_setzero_ps()); 1979 } 1980 1981 #define _mm512_add_round_pd(A, B, R) \ 1982 ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ 1983 (__v8df)(__m512d)(B), (int)(R))) 1984 1985 #define _mm512_mask_add_round_pd(W, U, A, B, R) \ 1986 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1987 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1988 (__v8df)(__m512d)(W))) 1989 1990 #define _mm512_maskz_add_round_pd(U, A, B, R) \ 1991 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1992 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1993 (__v8df)_mm512_setzero_pd())) 1994 1995 #define _mm512_add_round_ps(A, B, R) \ 1996 ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ 1997 (__v16sf)(__m512)(B), (int)(R))) 1998 1999 #define _mm512_mask_add_round_ps(W, U, A, B, R) \ 2000 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2001 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2002 (__v16sf)(__m512)(W))) 2003 2004 #define _mm512_maskz_add_round_ps(U, A, B, R) \ 2005 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2006 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2007 (__v16sf)_mm512_setzero_ps())) 2008 2009 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2010 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2011 __A = _mm_sub_ss(__A, __B); 2012 return __builtin_ia32_selectss_128(__U, __A, __W); 2013 } 2014 2015 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2016 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2017 __A = _mm_sub_ss(__A, __B); 2018 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 
2019 } 2020 #define _mm_sub_round_ss(A, B, R) \ 2021 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2022 (__v4sf)(__m128)(B), \ 2023 (__v4sf)_mm_setzero_ps(), \ 2024 (__mmask8)-1, (int)(R))) 2025 2026 #define _mm_mask_sub_round_ss(W, U, A, B, R) \ 2027 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2028 (__v4sf)(__m128)(B), \ 2029 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2030 (int)(R))) 2031 2032 #define _mm_maskz_sub_round_ss(U, A, B, R) \ 2033 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2034 (__v4sf)(__m128)(B), \ 2035 (__v4sf)_mm_setzero_ps(), \ 2036 (__mmask8)(U), (int)(R))) 2037 2038 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2039 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2040 __A = _mm_sub_sd(__A, __B); 2041 return __builtin_ia32_selectsd_128(__U, __A, __W); 2042 } 2043 2044 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2045 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2046 __A = _mm_sub_sd(__A, __B); 2047 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2048 } 2049 2050 #define _mm_sub_round_sd(A, B, R) \ 2051 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2052 (__v2df)(__m128d)(B), \ 2053 (__v2df)_mm_setzero_pd(), \ 2054 (__mmask8)-1, (int)(R))) 2055 2056 #define _mm_mask_sub_round_sd(W, U, A, B, R) \ 2057 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2058 (__v2df)(__m128d)(B), \ 2059 (__v2df)(__m128d)(W), \ 2060 (__mmask8)(U), (int)(R))) 2061 2062 #define _mm_maskz_sub_round_sd(U, A, B, R) \ 2063 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2064 (__v2df)(__m128d)(B), \ 2065 (__v2df)_mm_setzero_pd(), \ 2066 (__mmask8)(U), (int)(R))) 2067 2068 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2069 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2070 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2071 (__v8df)_mm512_sub_pd(__A, __B), 2072 (__v8df)__W); 
2073 } 2074 2075 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2076 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2077 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2078 (__v8df)_mm512_sub_pd(__A, __B), 2079 (__v8df)_mm512_setzero_pd()); 2080 } 2081 2082 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2083 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2084 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2085 (__v16sf)_mm512_sub_ps(__A, __B), 2086 (__v16sf)__W); 2087 } 2088 2089 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2090 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2091 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2092 (__v16sf)_mm512_sub_ps(__A, __B), 2093 (__v16sf)_mm512_setzero_ps()); 2094 } 2095 2096 #define _mm512_sub_round_pd(A, B, R) \ 2097 ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ 2098 (__v8df)(__m512d)(B), (int)(R))) 2099 2100 #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ 2101 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2102 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2103 (__v8df)(__m512d)(W))) 2104 2105 #define _mm512_maskz_sub_round_pd(U, A, B, R) \ 2106 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2107 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2108 (__v8df)_mm512_setzero_pd())) 2109 2110 #define _mm512_sub_round_ps(A, B, R) \ 2111 ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ 2112 (__v16sf)(__m512)(B), (int)(R))) 2113 2114 #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ 2115 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2116 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2117 (__v16sf)(__m512)(W))) 2118 2119 #define _mm512_maskz_sub_round_ps(U, A, B, R) \ 2120 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2121 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2122 (__v16sf)_mm512_setzero_ps())) 2123 2124 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2125 
_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2126 __A = _mm_mul_ss(__A, __B); 2127 return __builtin_ia32_selectss_128(__U, __A, __W); 2128 } 2129 2130 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2131 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2132 __A = _mm_mul_ss(__A, __B); 2133 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2134 } 2135 #define _mm_mul_round_ss(A, B, R) \ 2136 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2137 (__v4sf)(__m128)(B), \ 2138 (__v4sf)_mm_setzero_ps(), \ 2139 (__mmask8)-1, (int)(R))) 2140 2141 #define _mm_mask_mul_round_ss(W, U, A, B, R) \ 2142 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2143 (__v4sf)(__m128)(B), \ 2144 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2145 (int)(R))) 2146 2147 #define _mm_maskz_mul_round_ss(U, A, B, R) \ 2148 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2149 (__v4sf)(__m128)(B), \ 2150 (__v4sf)_mm_setzero_ps(), \ 2151 (__mmask8)(U), (int)(R))) 2152 2153 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2154 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2155 __A = _mm_mul_sd(__A, __B); 2156 return __builtin_ia32_selectsd_128(__U, __A, __W); 2157 } 2158 2159 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2160 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2161 __A = _mm_mul_sd(__A, __B); 2162 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2163 } 2164 2165 #define _mm_mul_round_sd(A, B, R) \ 2166 ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2167 (__v2df)(__m128d)(B), \ 2168 (__v2df)_mm_setzero_pd(), \ 2169 (__mmask8)-1, (int)(R))) 2170 2171 #define _mm_mask_mul_round_sd(W, U, A, B, R) \ 2172 ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2173 (__v2df)(__m128d)(B), \ 2174 (__v2df)(__m128d)(W), \ 2175 (__mmask8)(U), (int)(R))) 2176 2177 #define _mm_maskz_mul_round_sd(U, A, B, R) \ 2178 
((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2179 (__v2df)(__m128d)(B), \ 2180 (__v2df)_mm_setzero_pd(), \ 2181 (__mmask8)(U), (int)(R))) 2182 2183 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2184 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2185 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2186 (__v8df)_mm512_mul_pd(__A, __B), 2187 (__v8df)__W); 2188 } 2189 2190 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2191 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2192 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2193 (__v8df)_mm512_mul_pd(__A, __B), 2194 (__v8df)_mm512_setzero_pd()); 2195 } 2196 2197 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2198 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2199 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2200 (__v16sf)_mm512_mul_ps(__A, __B), 2201 (__v16sf)__W); 2202 } 2203 2204 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2205 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2206 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2207 (__v16sf)_mm512_mul_ps(__A, __B), 2208 (__v16sf)_mm512_setzero_ps()); 2209 } 2210 2211 #define _mm512_mul_round_pd(A, B, R) \ 2212 ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ 2213 (__v8df)(__m512d)(B), (int)(R))) 2214 2215 #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ 2216 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2217 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2218 (__v8df)(__m512d)(W))) 2219 2220 #define _mm512_maskz_mul_round_pd(U, A, B, R) \ 2221 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2222 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2223 (__v8df)_mm512_setzero_pd())) 2224 2225 #define _mm512_mul_round_ps(A, B, R) \ 2226 ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ 2227 (__v16sf)(__m512)(B), (int)(R))) 2228 2229 #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ 2230 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2231 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2232 (__v16sf)(__m512)(W))) 2233 2234 #define _mm512_maskz_mul_round_ps(U, A, B, R) \ 2235 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2236 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2237 (__v16sf)_mm512_setzero_ps())) 2238 2239 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2240 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2241 __A = _mm_div_ss(__A, __B); 2242 return __builtin_ia32_selectss_128(__U, __A, __W); 2243 } 2244 2245 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2246 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2247 __A = _mm_div_ss(__A, __B); 2248 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2249 } 2250 2251 #define _mm_div_round_ss(A, B, R) \ 2252 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2253 (__v4sf)(__m128)(B), \ 2254 (__v4sf)_mm_setzero_ps(), \ 2255 (__mmask8)-1, (int)(R))) 2256 2257 #define _mm_mask_div_round_ss(W, U, A, B, R) \ 2258 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2259 (__v4sf)(__m128)(B), \ 2260 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2261 (int)(R))) 2262 2263 #define _mm_maskz_div_round_ss(U, A, B, R) \ 2264 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2265 (__v4sf)(__m128)(B), \ 2266 (__v4sf)_mm_setzero_ps(), \ 2267 (__mmask8)(U), (int)(R))) 2268 2269 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2270 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2271 __A = _mm_div_sd(__A, __B); 2272 return __builtin_ia32_selectsd_128(__U, __A, __W); 2273 } 2274 2275 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2276 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2277 __A = _mm_div_sd(__A, __B); 2278 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2279 } 2280 2281 #define _mm_div_round_sd(A, B, R) \ 2282 
((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2283 (__v2df)(__m128d)(B), \ 2284 (__v2df)_mm_setzero_pd(), \ 2285 (__mmask8)-1, (int)(R))) 2286 2287 #define _mm_mask_div_round_sd(W, U, A, B, R) \ 2288 ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2289 (__v2df)(__m128d)(B), \ 2290 (__v2df)(__m128d)(W), \ 2291 (__mmask8)(U), (int)(R))) 2292 2293 #define _mm_maskz_div_round_sd(U, A, B, R) \ 2294 ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2295 (__v2df)(__m128d)(B), \ 2296 (__v2df)_mm_setzero_pd(), \ 2297 (__mmask8)(U), (int)(R))) 2298 2299 static __inline __m512d __DEFAULT_FN_ATTRS512 2300 _mm512_div_pd(__m512d __a, __m512d __b) 2301 { 2302 return (__m512d)((__v8df)__a/(__v8df)__b); 2303 } 2304 2305 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2306 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2307 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2308 (__v8df)_mm512_div_pd(__A, __B), 2309 (__v8df)__W); 2310 } 2311 2312 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2313 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2314 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2315 (__v8df)_mm512_div_pd(__A, __B), 2316 (__v8df)_mm512_setzero_pd()); 2317 } 2318 2319 static __inline __m512 __DEFAULT_FN_ATTRS512 2320 _mm512_div_ps(__m512 __a, __m512 __b) 2321 { 2322 return (__m512)((__v16sf)__a/(__v16sf)__b); 2323 } 2324 2325 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2326 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2327 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2328 (__v16sf)_mm512_div_ps(__A, __B), 2329 (__v16sf)__W); 2330 } 2331 2332 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2333 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2334 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2335 (__v16sf)_mm512_div_ps(__A, __B), 2336 (__v16sf)_mm512_setzero_ps()); 2337 } 2338 2339 
#define _mm512_div_round_pd(A, B, R) \ 2340 ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ 2341 (__v8df)(__m512d)(B), (int)(R))) 2342 2343 #define _mm512_mask_div_round_pd(W, U, A, B, R) \ 2344 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2345 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2346 (__v8df)(__m512d)(W))) 2347 2348 #define _mm512_maskz_div_round_pd(U, A, B, R) \ 2349 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2350 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2351 (__v8df)_mm512_setzero_pd())) 2352 2353 #define _mm512_div_round_ps(A, B, R) \ 2354 ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ 2355 (__v16sf)(__m512)(B), (int)(R))) 2356 2357 #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 2358 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2359 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2360 (__v16sf)(__m512)(W))) 2361 2362 #define _mm512_maskz_div_round_ps(U, A, B, R) \ 2363 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2364 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2365 (__v16sf)_mm512_setzero_ps())) 2366 2367 #define _mm512_roundscale_ps(A, B) \ 2368 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ 2369 (__v16sf)_mm512_undefined_ps(), \ 2370 (__mmask16)-1, \ 2371 _MM_FROUND_CUR_DIRECTION)) 2372 2373 #define _mm512_mask_roundscale_ps(A, B, C, imm) \ 2374 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2375 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2376 _MM_FROUND_CUR_DIRECTION)) 2377 2378 #define _mm512_maskz_roundscale_ps(A, B, imm) \ 2379 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2380 (__v16sf)_mm512_setzero_ps(), \ 2381 (__mmask16)(A), \ 2382 _MM_FROUND_CUR_DIRECTION)) 2383 2384 #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ 2385 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2386 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2387 (int)(R))) 2388 2389 #define 
_mm512_maskz_roundscale_round_ps(A, B, imm, R) \ 2390 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2391 (__v16sf)_mm512_setzero_ps(), \ 2392 (__mmask16)(A), (int)(R))) 2393 2394 #define _mm512_roundscale_round_ps(A, imm, R) \ 2395 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ 2396 (__v16sf)_mm512_undefined_ps(), \ 2397 (__mmask16)-1, (int)(R))) 2398 2399 #define _mm512_roundscale_pd(A, B) \ 2400 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ 2401 (__v8df)_mm512_undefined_pd(), \ 2402 (__mmask8)-1, \ 2403 _MM_FROUND_CUR_DIRECTION)) 2404 2405 #define _mm512_mask_roundscale_pd(A, B, C, imm) \ 2406 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2407 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2408 _MM_FROUND_CUR_DIRECTION)) 2409 2410 #define _mm512_maskz_roundscale_pd(A, B, imm) \ 2411 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2412 (__v8df)_mm512_setzero_pd(), \ 2413 (__mmask8)(A), \ 2414 _MM_FROUND_CUR_DIRECTION)) 2415 2416 #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ 2417 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2418 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2419 (int)(R))) 2420 2421 #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ 2422 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2423 (__v8df)_mm512_setzero_pd(), \ 2424 (__mmask8)(A), (int)(R))) 2425 2426 #define _mm512_roundscale_round_pd(A, imm, R) \ 2427 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ 2428 (__v8df)_mm512_undefined_pd(), \ 2429 (__mmask8)-1, (int)(R))) 2430 2431 #define _mm512_fmadd_round_pd(A, B, C, R) \ 2432 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2433 (__v8df)(__m512d)(B), \ 2434 (__v8df)(__m512d)(C), \ 2435 (__mmask8)-1, (int)(R))) 2436 2437 2438 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ 2439 
((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2440 (__v8df)(__m512d)(B), \ 2441 (__v8df)(__m512d)(C), \ 2442 (__mmask8)(U), (int)(R))) 2443 2444 2445 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ 2446 ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ 2447 (__v8df)(__m512d)(B), \ 2448 (__v8df)(__m512d)(C), \ 2449 (__mmask8)(U), (int)(R))) 2450 2451 2452 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ 2453 ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2454 (__v8df)(__m512d)(B), \ 2455 (__v8df)(__m512d)(C), \ 2456 (__mmask8)(U), (int)(R))) 2457 2458 2459 #define _mm512_fmsub_round_pd(A, B, C, R) \ 2460 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2461 (__v8df)(__m512d)(B), \ 2462 -(__v8df)(__m512d)(C), \ 2463 (__mmask8)-1, (int)(R))) 2464 2465 2466 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ 2467 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2468 (__v8df)(__m512d)(B), \ 2469 -(__v8df)(__m512d)(C), \ 2470 (__mmask8)(U), (int)(R))) 2471 2472 2473 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ 2474 ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2475 (__v8df)(__m512d)(B), \ 2476 -(__v8df)(__m512d)(C), \ 2477 (__mmask8)(U), (int)(R))) 2478 2479 2480 #define _mm512_fnmadd_round_pd(A, B, C, R) \ 2481 ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2482 (__v8df)(__m512d)(B), \ 2483 (__v8df)(__m512d)(C), \ 2484 (__mmask8)-1, (int)(R))) 2485 2486 2487 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ 2488 ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ 2489 (__v8df)(__m512d)(B), \ 2490 (__v8df)(__m512d)(C), \ 2491 (__mmask8)(U), (int)(R))) 2492 2493 2494 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ 2495 ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2496 (__v8df)(__m512d)(B), \ 2497 (__v8df)(__m512d)(C), \ 2498 (__mmask8)(U), (int)(R))) 2499 2500 2501 #define 
_mm512_fnmsub_round_pd(A, B, C, R) \ 2502 ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2503 (__v8df)(__m512d)(B), \ 2504 -(__v8df)(__m512d)(C), \ 2505 (__mmask8)-1, (int)(R))) 2506 2507 2508 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ 2509 ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2510 (__v8df)(__m512d)(B), \ 2511 -(__v8df)(__m512d)(C), \ 2512 (__mmask8)(U), (int)(R))) 2513 2514 2515 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2516 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2517 { 2518 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2519 (__v8df) __B, 2520 (__v8df) __C, 2521 (__mmask8) -1, 2522 _MM_FROUND_CUR_DIRECTION); 2523 } 2524 2525 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2526 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2527 { 2528 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2529 (__v8df) __B, 2530 (__v8df) __C, 2531 (__mmask8) __U, 2532 _MM_FROUND_CUR_DIRECTION); 2533 } 2534 2535 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2536 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2537 { 2538 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, 2539 (__v8df) __B, 2540 (__v8df) __C, 2541 (__mmask8) __U, 2542 _MM_FROUND_CUR_DIRECTION); 2543 } 2544 2545 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2546 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2547 { 2548 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2549 (__v8df) __B, 2550 (__v8df) __C, 2551 (__mmask8) __U, 2552 _MM_FROUND_CUR_DIRECTION); 2553 } 2554 2555 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2556 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2557 { 2558 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2559 (__v8df) __B, 2560 -(__v8df) __C, 2561 (__mmask8) -1, 2562 _MM_FROUND_CUR_DIRECTION); 2563 } 2564 2565 static __inline__ __m512d 
__DEFAULT_FN_ATTRS512 2566 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2567 { 2568 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2569 (__v8df) __B, 2570 -(__v8df) __C, 2571 (__mmask8) __U, 2572 _MM_FROUND_CUR_DIRECTION); 2573 } 2574 2575 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2576 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2577 { 2578 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2579 (__v8df) __B, 2580 -(__v8df) __C, 2581 (__mmask8) __U, 2582 _MM_FROUND_CUR_DIRECTION); 2583 } 2584 2585 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2586 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2587 { 2588 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2589 -(__v8df) __B, 2590 (__v8df) __C, 2591 (__mmask8) -1, 2592 _MM_FROUND_CUR_DIRECTION); 2593 } 2594 2595 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2596 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2597 { 2598 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, 2599 (__v8df) __B, 2600 (__v8df) __C, 2601 (__mmask8) __U, 2602 _MM_FROUND_CUR_DIRECTION); 2603 } 2604 2605 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2606 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2607 { 2608 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2609 (__v8df) __B, 2610 (__v8df) __C, 2611 (__mmask8) __U, 2612 _MM_FROUND_CUR_DIRECTION); 2613 } 2614 2615 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2616 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2617 { 2618 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2619 -(__v8df) __B, 2620 -(__v8df) __C, 2621 (__mmask8) -1, 2622 _MM_FROUND_CUR_DIRECTION); 2623 } 2624 2625 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2626 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2627 { 2628 return (__m512d) 
__builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2629 (__v8df) __B, 2630 -(__v8df) __C, 2631 (__mmask8) __U, 2632 _MM_FROUND_CUR_DIRECTION); 2633 } 2634 2635 #define _mm512_fmadd_round_ps(A, B, C, R) \ 2636 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2637 (__v16sf)(__m512)(B), \ 2638 (__v16sf)(__m512)(C), \ 2639 (__mmask16)-1, (int)(R))) 2640 2641 2642 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ 2643 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2644 (__v16sf)(__m512)(B), \ 2645 (__v16sf)(__m512)(C), \ 2646 (__mmask16)(U), (int)(R))) 2647 2648 2649 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ 2650 ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ 2651 (__v16sf)(__m512)(B), \ 2652 (__v16sf)(__m512)(C), \ 2653 (__mmask16)(U), (int)(R))) 2654 2655 2656 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ 2657 ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2658 (__v16sf)(__m512)(B), \ 2659 (__v16sf)(__m512)(C), \ 2660 (__mmask16)(U), (int)(R))) 2661 2662 2663 #define _mm512_fmsub_round_ps(A, B, C, R) \ 2664 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2665 (__v16sf)(__m512)(B), \ 2666 -(__v16sf)(__m512)(C), \ 2667 (__mmask16)-1, (int)(R))) 2668 2669 2670 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ 2671 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2672 (__v16sf)(__m512)(B), \ 2673 -(__v16sf)(__m512)(C), \ 2674 (__mmask16)(U), (int)(R))) 2675 2676 2677 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ 2678 ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2679 (__v16sf)(__m512)(B), \ 2680 -(__v16sf)(__m512)(C), \ 2681 (__mmask16)(U), (int)(R))) 2682 2683 2684 #define _mm512_fnmadd_round_ps(A, B, C, R) \ 2685 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2686 -(__v16sf)(__m512)(B), \ 2687 (__v16sf)(__m512)(C), \ 2688 (__mmask16)-1, (int)(R))) 2689 2690 2691 #define _mm512_mask3_fnmadd_round_ps(A, 
B, C, U, R) \ 2692 ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ 2693 (__v16sf)(__m512)(B), \ 2694 (__v16sf)(__m512)(C), \ 2695 (__mmask16)(U), (int)(R))) 2696 2697 2698 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ 2699 ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2700 (__v16sf)(__m512)(B), \ 2701 (__v16sf)(__m512)(C), \ 2702 (__mmask16)(U), (int)(R))) 2703 2704 2705 #define _mm512_fnmsub_round_ps(A, B, C, R) \ 2706 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2707 -(__v16sf)(__m512)(B), \ 2708 -(__v16sf)(__m512)(C), \ 2709 (__mmask16)-1, (int)(R))) 2710 2711 2712 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ 2713 ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2714 (__v16sf)(__m512)(B), \ 2715 -(__v16sf)(__m512)(C), \ 2716 (__mmask16)(U), (int)(R))) 2717 2718 2719 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2720 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2721 { 2722 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2723 (__v16sf) __B, 2724 (__v16sf) __C, 2725 (__mmask16) -1, 2726 _MM_FROUND_CUR_DIRECTION); 2727 } 2728 2729 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2730 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2731 { 2732 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2733 (__v16sf) __B, 2734 (__v16sf) __C, 2735 (__mmask16) __U, 2736 _MM_FROUND_CUR_DIRECTION); 2737 } 2738 2739 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2740 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2741 { 2742 return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, 2743 (__v16sf) __B, 2744 (__v16sf) __C, 2745 (__mmask16) __U, 2746 _MM_FROUND_CUR_DIRECTION); 2747 } 2748 2749 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2750 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2751 { 2752 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2753 
(__v16sf) __B, 2754 (__v16sf) __C, 2755 (__mmask16) __U, 2756 _MM_FROUND_CUR_DIRECTION); 2757 } 2758 2759 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2760 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2761 { 2762 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2763 (__v16sf) __B, 2764 -(__v16sf) __C, 2765 (__mmask16) -1, 2766 _MM_FROUND_CUR_DIRECTION); 2767 } 2768 2769 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2770 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2771 { 2772 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2773 (__v16sf) __B, 2774 -(__v16sf) __C, 2775 (__mmask16) __U, 2776 _MM_FROUND_CUR_DIRECTION); 2777 } 2778 2779 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2780 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2781 { 2782 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2783 (__v16sf) __B, 2784 -(__v16sf) __C, 2785 (__mmask16) __U, 2786 _MM_FROUND_CUR_DIRECTION); 2787 } 2788 2789 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2790 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2791 { 2792 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2793 -(__v16sf) __B, 2794 (__v16sf) __C, 2795 (__mmask16) -1, 2796 _MM_FROUND_CUR_DIRECTION); 2797 } 2798 2799 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2800 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2801 { 2802 return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, 2803 (__v16sf) __B, 2804 (__v16sf) __C, 2805 (__mmask16) __U, 2806 _MM_FROUND_CUR_DIRECTION); 2807 } 2808 2809 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2810 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2811 { 2812 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2813 (__v16sf) __B, 2814 (__v16sf) __C, 2815 (__mmask16) __U, 2816 _MM_FROUND_CUR_DIRECTION); 2817 } 2818 2819 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2820 
_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2821 { 2822 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2823 -(__v16sf) __B, 2824 -(__v16sf) __C, 2825 (__mmask16) -1, 2826 _MM_FROUND_CUR_DIRECTION); 2827 } 2828 2829 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2830 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2831 { 2832 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2833 (__v16sf) __B, 2834 -(__v16sf) __C, 2835 (__mmask16) __U, 2836 _MM_FROUND_CUR_DIRECTION); 2837 } 2838 2839 #define _mm512_fmaddsub_round_pd(A, B, C, R) \ 2840 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2841 (__v8df)(__m512d)(B), \ 2842 (__v8df)(__m512d)(C), \ 2843 (__mmask8)-1, (int)(R))) 2844 2845 2846 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ 2847 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2848 (__v8df)(__m512d)(B), \ 2849 (__v8df)(__m512d)(C), \ 2850 (__mmask8)(U), (int)(R))) 2851 2852 2853 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ 2854 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ 2855 (__v8df)(__m512d)(B), \ 2856 (__v8df)(__m512d)(C), \ 2857 (__mmask8)(U), (int)(R))) 2858 2859 2860 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ 2861 ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2862 (__v8df)(__m512d)(B), \ 2863 (__v8df)(__m512d)(C), \ 2864 (__mmask8)(U), (int)(R))) 2865 2866 2867 #define _mm512_fmsubadd_round_pd(A, B, C, R) \ 2868 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2869 (__v8df)(__m512d)(B), \ 2870 -(__v8df)(__m512d)(C), \ 2871 (__mmask8)-1, (int)(R))) 2872 2873 2874 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ 2875 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2876 (__v8df)(__m512d)(B), \ 2877 -(__v8df)(__m512d)(C), \ 2878 (__mmask8)(U), (int)(R))) 2879 2880 2881 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ 
2882 ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2883 (__v8df)(__m512d)(B), \ 2884 -(__v8df)(__m512d)(C), \ 2885 (__mmask8)(U), (int)(R))) 2886 2887 2888 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2889 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) 2890 { 2891 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2892 (__v8df) __B, 2893 (__v8df) __C, 2894 (__mmask8) -1, 2895 _MM_FROUND_CUR_DIRECTION); 2896 } 2897 2898 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2899 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2900 { 2901 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2902 (__v8df) __B, 2903 (__v8df) __C, 2904 (__mmask8) __U, 2905 _MM_FROUND_CUR_DIRECTION); 2906 } 2907 2908 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2909 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2910 { 2911 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, 2912 (__v8df) __B, 2913 (__v8df) __C, 2914 (__mmask8) __U, 2915 _MM_FROUND_CUR_DIRECTION); 2916 } 2917 2918 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2919 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2920 { 2921 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2922 (__v8df) __B, 2923 (__v8df) __C, 2924 (__mmask8) __U, 2925 _MM_FROUND_CUR_DIRECTION); 2926 } 2927 2928 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2929 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) 2930 { 2931 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2932 (__v8df) __B, 2933 -(__v8df) __C, 2934 (__mmask8) -1, 2935 _MM_FROUND_CUR_DIRECTION); 2936 } 2937 2938 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2939 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2940 { 2941 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2942 (__v8df) __B, 2943 -(__v8df) __C, 2944 (__mmask8) __U, 
2945 _MM_FROUND_CUR_DIRECTION); 2946 } 2947 2948 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2949 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2950 { 2951 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2952 (__v8df) __B, 2953 -(__v8df) __C, 2954 (__mmask8) __U, 2955 _MM_FROUND_CUR_DIRECTION); 2956 } 2957 2958 #define _mm512_fmaddsub_round_ps(A, B, C, R) \ 2959 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2960 (__v16sf)(__m512)(B), \ 2961 (__v16sf)(__m512)(C), \ 2962 (__mmask16)-1, (int)(R))) 2963 2964 2965 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ 2966 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2967 (__v16sf)(__m512)(B), \ 2968 (__v16sf)(__m512)(C), \ 2969 (__mmask16)(U), (int)(R))) 2970 2971 2972 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ 2973 ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ 2974 (__v16sf)(__m512)(B), \ 2975 (__v16sf)(__m512)(C), \ 2976 (__mmask16)(U), (int)(R))) 2977 2978 2979 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ 2980 ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2981 (__v16sf)(__m512)(B), \ 2982 (__v16sf)(__m512)(C), \ 2983 (__mmask16)(U), (int)(R))) 2984 2985 2986 #define _mm512_fmsubadd_round_ps(A, B, C, R) \ 2987 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2988 (__v16sf)(__m512)(B), \ 2989 -(__v16sf)(__m512)(C), \ 2990 (__mmask16)-1, (int)(R))) 2991 2992 2993 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ 2994 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2995 (__v16sf)(__m512)(B), \ 2996 -(__v16sf)(__m512)(C), \ 2997 (__mmask16)(U), (int)(R))) 2998 2999 3000 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ 3001 ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 3002 (__v16sf)(__m512)(B), \ 3003 -(__v16sf)(__m512)(C), \ 3004 (__mmask16)(U), (int)(R))) 3005 3006 3007 static 
__inline__ __m512 __DEFAULT_FN_ATTRS512 3008 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) 3009 { 3010 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3011 (__v16sf) __B, 3012 (__v16sf) __C, 3013 (__mmask16) -1, 3014 _MM_FROUND_CUR_DIRECTION); 3015 } 3016 3017 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3018 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3019 { 3020 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3021 (__v16sf) __B, 3022 (__v16sf) __C, 3023 (__mmask16) __U, 3024 _MM_FROUND_CUR_DIRECTION); 3025 } 3026 3027 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3028 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3029 { 3030 return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, 3031 (__v16sf) __B, 3032 (__v16sf) __C, 3033 (__mmask16) __U, 3034 _MM_FROUND_CUR_DIRECTION); 3035 } 3036 3037 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3038 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3039 { 3040 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3041 (__v16sf) __B, 3042 (__v16sf) __C, 3043 (__mmask16) __U, 3044 _MM_FROUND_CUR_DIRECTION); 3045 } 3046 3047 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3048 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) 3049 { 3050 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3051 (__v16sf) __B, 3052 -(__v16sf) __C, 3053 (__mmask16) -1, 3054 _MM_FROUND_CUR_DIRECTION); 3055 } 3056 3057 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3058 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3059 { 3060 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3061 (__v16sf) __B, 3062 -(__v16sf) __C, 3063 (__mmask16) __U, 3064 _MM_FROUND_CUR_DIRECTION); 3065 } 3066 3067 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3068 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3069 { 3070 
return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3071 (__v16sf) __B, 3072 -(__v16sf) __C, 3073 (__mmask16) __U, 3074 _MM_FROUND_CUR_DIRECTION); 3075 } 3076 3077 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ 3078 ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ 3079 (__v8df)(__m512d)(B), \ 3080 (__v8df)(__m512d)(C), \ 3081 (__mmask8)(U), (int)(R))) 3082 3083 3084 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3085 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3086 { 3087 return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, 3088 (__v8df) __B, 3089 (__v8df) __C, 3090 (__mmask8) __U, 3091 _MM_FROUND_CUR_DIRECTION); 3092 } 3093 3094 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ 3095 ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ 3096 (__v16sf)(__m512)(B), \ 3097 (__v16sf)(__m512)(C), \ 3098 (__mmask16)(U), (int)(R))) 3099 3100 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3101 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3102 { 3103 return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, 3104 (__v16sf) __B, 3105 (__v16sf) __C, 3106 (__mmask16) __U, 3107 _MM_FROUND_CUR_DIRECTION); 3108 } 3109 3110 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ 3111 ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ 3112 (__v8df)(__m512d)(B), \ 3113 (__v8df)(__m512d)(C), \ 3114 (__mmask8)(U), (int)(R))) 3115 3116 3117 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3118 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3119 { 3120 return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, 3121 (__v8df) __B, 3122 (__v8df) __C, 3123 (__mmask8) __U, 3124 _MM_FROUND_CUR_DIRECTION); 3125 } 3126 3127 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ 3128 ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ 3129 (__v16sf)(__m512)(B), \ 3130 (__v16sf)(__m512)(C), \ 
3131 (__mmask16)(U), (int)(R))) 3132 3133 3134 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3135 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3136 { 3137 return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, 3138 (__v16sf) __B, 3139 (__v16sf) __C, 3140 (__mmask16) __U, 3141 _MM_FROUND_CUR_DIRECTION); 3142 } 3143 3144 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ 3145 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3146 -(__v8df)(__m512d)(B), \ 3147 (__v8df)(__m512d)(C), \ 3148 (__mmask8)(U), (int)(R))) 3149 3150 3151 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3152 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3153 { 3154 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3155 -(__v8df) __B, 3156 (__v8df) __C, 3157 (__mmask8) __U, 3158 _MM_FROUND_CUR_DIRECTION); 3159 } 3160 3161 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ 3162 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3163 -(__v16sf)(__m512)(B), \ 3164 (__v16sf)(__m512)(C), \ 3165 (__mmask16)(U), (int)(R))) 3166 3167 3168 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3169 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3170 { 3171 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3172 -(__v16sf) __B, 3173 (__v16sf) __C, 3174 (__mmask16) __U, 3175 _MM_FROUND_CUR_DIRECTION); 3176 } 3177 3178 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ 3179 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3180 -(__v8df)(__m512d)(B), \ 3181 -(__v8df)(__m512d)(C), \ 3182 (__mmask8)(U), (int)(R))) 3183 3184 3185 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ 3186 ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ 3187 (__v8df)(__m512d)(B), \ 3188 (__v8df)(__m512d)(C), \ 3189 (__mmask8)(U), (int)(R))) 3190 3191 3192 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3193 _mm512_mask_fnmsub_pd(__m512d __A, 
__mmask8 __U, __m512d __B, __m512d __C) 3194 { 3195 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3196 -(__v8df) __B, 3197 -(__v8df) __C, 3198 (__mmask8) __U, 3199 _MM_FROUND_CUR_DIRECTION); 3200 } 3201 3202 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3203 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3204 { 3205 return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, 3206 (__v8df) __B, 3207 (__v8df) __C, 3208 (__mmask8) __U, 3209 _MM_FROUND_CUR_DIRECTION); 3210 } 3211 3212 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ 3213 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3214 -(__v16sf)(__m512)(B), \ 3215 -(__v16sf)(__m512)(C), \ 3216 (__mmask16)(U), (int)(R))) 3217 3218 3219 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ 3220 ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ 3221 (__v16sf)(__m512)(B), \ 3222 (__v16sf)(__m512)(C), \ 3223 (__mmask16)(U), (int)(R))) 3224 3225 3226 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3227 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3228 { 3229 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3230 -(__v16sf) __B, 3231 -(__v16sf) __C, 3232 (__mmask16) __U, 3233 _MM_FROUND_CUR_DIRECTION); 3234 } 3235 3236 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3237 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3238 { 3239 return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, 3240 (__v16sf) __B, 3241 (__v16sf) __C, 3242 (__mmask16) __U, 3243 _MM_FROUND_CUR_DIRECTION); 3244 } 3245 3246 3247 3248 /* Vector permutations */ 3249 3250 static __inline __m512i __DEFAULT_FN_ATTRS512 3251 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) 3252 { 3253 return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, 3254 (__v16si) __B); 3255 } 3256 3257 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3258 
_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, 3259 __m512i __B) 3260 { 3261 return (__m512i)__builtin_ia32_selectd_512(__U, 3262 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3263 (__v16si)__A); 3264 } 3265 3266 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3267 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, 3268 __m512i __B) 3269 { 3270 return (__m512i)__builtin_ia32_selectd_512(__U, 3271 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3272 (__v16si)__I); 3273 } 3274 3275 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3276 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, 3277 __m512i __B) 3278 { 3279 return (__m512i)__builtin_ia32_selectd_512(__U, 3280 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3281 (__v16si)_mm512_setzero_si512()); 3282 } 3283 3284 static __inline __m512i __DEFAULT_FN_ATTRS512 3285 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) 3286 { 3287 return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, 3288 (__v8di) __B); 3289 } 3290 3291 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3292 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, 3293 __m512i __B) 3294 { 3295 return (__m512i)__builtin_ia32_selectq_512(__U, 3296 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3297 (__v8di)__A); 3298 } 3299 3300 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3301 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, 3302 __m512i __B) 3303 { 3304 return (__m512i)__builtin_ia32_selectq_512(__U, 3305 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3306 (__v8di)__I); 3307 } 3308 3309 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3310 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, 3311 __m512i __B) 3312 { 3313 return (__m512i)__builtin_ia32_selectq_512(__U, 3314 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3315 (__v8di)_mm512_setzero_si512()); 3316 } 3317 3318 
#define _mm512_alignr_epi64(A, B, I) \ 3319 ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ 3320 (__v8di)(__m512i)(B), (int)(I))) 3321 3322 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ 3323 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3324 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3325 (__v8di)(__m512i)(W))) 3326 3327 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ 3328 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3329 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3330 (__v8di)_mm512_setzero_si512())) 3331 3332 #define _mm512_alignr_epi32(A, B, I) \ 3333 ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ 3334 (__v16si)(__m512i)(B), (int)(I))) 3335 3336 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ 3337 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3338 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3339 (__v16si)(__m512i)(W))) 3340 3341 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ 3342 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3343 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3344 (__v16si)_mm512_setzero_si512())) 3345 /* Vector Extract */ 3346 3347 #define _mm512_extractf64x4_pd(A, I) \ 3348 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ 3349 (__v4df)_mm256_undefined_pd(), \ 3350 (__mmask8)-1)) 3351 3352 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ 3353 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3354 (__v4df)(__m256d)(W), \ 3355 (__mmask8)(U))) 3356 3357 #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ 3358 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3359 (__v4df)_mm256_setzero_pd(), \ 3360 (__mmask8)(U))) 3361 3362 #define _mm512_extractf32x4_ps(A, I) \ 3363 ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ 3364 (__v4sf)_mm_undefined_ps(), \ 3365 (__mmask8)-1)) 3366 3367 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ 3368 
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3369 (__v4sf)(__m128)(W), \ 3370 (__mmask8)(U))) 3371 3372 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ 3373 ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3374 (__v4sf)_mm_setzero_ps(), \ 3375 (__mmask8)(U))) 3376 3377 /* Vector Blend */ 3378 3379 static __inline __m512d __DEFAULT_FN_ATTRS512 3380 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) 3381 { 3382 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 3383 (__v8df) __W, 3384 (__v8df) __A); 3385 } 3386 3387 static __inline __m512 __DEFAULT_FN_ATTRS512 3388 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) 3389 { 3390 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 3391 (__v16sf) __W, 3392 (__v16sf) __A); 3393 } 3394 3395 static __inline __m512i __DEFAULT_FN_ATTRS512 3396 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) 3397 { 3398 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 3399 (__v8di) __W, 3400 (__v8di) __A); 3401 } 3402 3403 static __inline __m512i __DEFAULT_FN_ATTRS512 3404 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) 3405 { 3406 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 3407 (__v16si) __W, 3408 (__v16si) __A); 3409 } 3410 3411 /* Compare */ 3412 3413 #define _mm512_cmp_round_ps_mask(A, B, P, R) \ 3414 ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3415 (__v16sf)(__m512)(B), (int)(P), \ 3416 (__mmask16)-1, (int)(R))) 3417 3418 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ 3419 ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3420 (__v16sf)(__m512)(B), (int)(P), \ 3421 (__mmask16)(U), (int)(R))) 3422 3423 #define _mm512_cmp_ps_mask(A, B, P) \ 3424 _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3425 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ 3426 _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), 
_MM_FROUND_CUR_DIRECTION) 3427 3428 #define _mm512_cmpeq_ps_mask(A, B) \ 3429 _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) 3430 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \ 3431 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) 3432 3433 #define _mm512_cmplt_ps_mask(A, B) \ 3434 _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) 3435 #define _mm512_mask_cmplt_ps_mask(k, A, B) \ 3436 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) 3437 3438 #define _mm512_cmple_ps_mask(A, B) \ 3439 _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) 3440 #define _mm512_mask_cmple_ps_mask(k, A, B) \ 3441 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) 3442 3443 #define _mm512_cmpunord_ps_mask(A, B) \ 3444 _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) 3445 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \ 3446 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) 3447 3448 #define _mm512_cmpneq_ps_mask(A, B) \ 3449 _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) 3450 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \ 3451 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) 3452 3453 #define _mm512_cmpnlt_ps_mask(A, B) \ 3454 _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) 3455 #define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ 3456 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) 3457 3458 #define _mm512_cmpnle_ps_mask(A, B) \ 3459 _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) 3460 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \ 3461 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) 3462 3463 #define _mm512_cmpord_ps_mask(A, B) \ 3464 _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) 3465 #define _mm512_mask_cmpord_ps_mask(k, A, B) \ 3466 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) 3467 3468 #define _mm512_cmp_round_pd_mask(A, B, P, R) \ 3469 ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3470 (__v8df)(__m512d)(B), (int)(P), \ 3471 (__mmask8)-1, (int)(R))) 3472 3473 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ 3474 ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3475 (__v8df)(__m512d)(B), 
(int)(P), \ 3476 (__mmask8)(U), (int)(R))) 3477 3478 #define _mm512_cmp_pd_mask(A, B, P) \ 3479 _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3480 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ 3481 _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3482 3483 #define _mm512_cmpeq_pd_mask(A, B) \ 3484 _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) 3485 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \ 3486 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) 3487 3488 #define _mm512_cmplt_pd_mask(A, B) \ 3489 _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) 3490 #define _mm512_mask_cmplt_pd_mask(k, A, B) \ 3491 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) 3492 3493 #define _mm512_cmple_pd_mask(A, B) \ 3494 _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) 3495 #define _mm512_mask_cmple_pd_mask(k, A, B) \ 3496 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) 3497 3498 #define _mm512_cmpunord_pd_mask(A, B) \ 3499 _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) 3500 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \ 3501 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) 3502 3503 #define _mm512_cmpneq_pd_mask(A, B) \ 3504 _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) 3505 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \ 3506 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) 3507 3508 #define _mm512_cmpnlt_pd_mask(A, B) \ 3509 _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) 3510 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ 3511 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) 3512 3513 #define _mm512_cmpnle_pd_mask(A, B) \ 3514 _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) 3515 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \ 3516 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) 3517 3518 #define _mm512_cmpord_pd_mask(A, B) \ 3519 _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) 3520 #define _mm512_mask_cmpord_pd_mask(k, A, B) \ 3521 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) 3522 3523 /* Conversion */ 3524 3525 #define _mm512_cvtt_roundps_epu32(A, R) \ 3526 
((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3527 (__v16si)_mm512_undefined_epi32(), \ 3528 (__mmask16)-1, (int)(R))) 3529 3530 #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ 3531 ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3532 (__v16si)(__m512i)(W), \ 3533 (__mmask16)(U), (int)(R))) 3534 3535 #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ 3536 ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3537 (__v16si)_mm512_setzero_si512(), \ 3538 (__mmask16)(U), (int)(R))) 3539 3540 3541 static __inline __m512i __DEFAULT_FN_ATTRS512 3542 _mm512_cvttps_epu32(__m512 __A) 3543 { 3544 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3545 (__v16si) 3546 _mm512_setzero_si512 (), 3547 (__mmask16) -1, 3548 _MM_FROUND_CUR_DIRECTION); 3549 } 3550 3551 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3552 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 3553 { 3554 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3555 (__v16si) __W, 3556 (__mmask16) __U, 3557 _MM_FROUND_CUR_DIRECTION); 3558 } 3559 3560 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3561 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) 3562 { 3563 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3564 (__v16si) _mm512_setzero_si512 (), 3565 (__mmask16) __U, 3566 _MM_FROUND_CUR_DIRECTION); 3567 } 3568 3569 #define _mm512_cvt_roundepi32_ps(A, R) \ 3570 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3571 (__v16sf)_mm512_setzero_ps(), \ 3572 (__mmask16)-1, (int)(R))) 3573 3574 #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ 3575 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3576 (__v16sf)(__m512)(W), \ 3577 (__mmask16)(U), (int)(R))) 3578 3579 #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ 3580 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3581 (__v16sf)_mm512_setzero_ps(), \ 3582 (__mmask16)(U), (int)(R))) 
3583 3584 #define _mm512_cvt_roundepu32_ps(A, R) \ 3585 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3586 (__v16sf)_mm512_setzero_ps(), \ 3587 (__mmask16)-1, (int)(R))) 3588 3589 #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ 3590 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3591 (__v16sf)(__m512)(W), \ 3592 (__mmask16)(U), (int)(R))) 3593 3594 #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ 3595 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3596 (__v16sf)_mm512_setzero_ps(), \ 3597 (__mmask16)(U), (int)(R))) 3598 3599 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3600 _mm512_cvtepu32_ps (__m512i __A) 3601 { 3602 return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); 3603 } 3604 3605 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3606 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3607 { 3608 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3609 (__v16sf)_mm512_cvtepu32_ps(__A), 3610 (__v16sf)__W); 3611 } 3612 3613 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3614 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) 3615 { 3616 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3617 (__v16sf)_mm512_cvtepu32_ps(__A), 3618 (__v16sf)_mm512_setzero_ps()); 3619 } 3620 3621 static __inline __m512d __DEFAULT_FN_ATTRS512 3622 _mm512_cvtepi32_pd(__m256i __A) 3623 { 3624 return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); 3625 } 3626 3627 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3628 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3629 { 3630 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3631 (__v8df)_mm512_cvtepi32_pd(__A), 3632 (__v8df)__W); 3633 } 3634 3635 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3636 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) 3637 { 3638 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3639 (__v8df)_mm512_cvtepi32_pd(__A), 3640 (__v8df)_mm512_setzero_pd()); 
3641 } 3642 3643 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3644 _mm512_cvtepi32lo_pd(__m512i __A) 3645 { 3646 return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); 3647 } 3648 3649 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3650 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3651 { 3652 return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3653 } 3654 3655 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3656 _mm512_cvtepi32_ps (__m512i __A) 3657 { 3658 return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); 3659 } 3660 3661 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3662 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3663 { 3664 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3665 (__v16sf)_mm512_cvtepi32_ps(__A), 3666 (__v16sf)__W); 3667 } 3668 3669 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3670 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) 3671 { 3672 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3673 (__v16sf)_mm512_cvtepi32_ps(__A), 3674 (__v16sf)_mm512_setzero_ps()); 3675 } 3676 3677 static __inline __m512d __DEFAULT_FN_ATTRS512 3678 _mm512_cvtepu32_pd(__m256i __A) 3679 { 3680 return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); 3681 } 3682 3683 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3684 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3685 { 3686 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3687 (__v8df)_mm512_cvtepu32_pd(__A), 3688 (__v8df)__W); 3689 } 3690 3691 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3692 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) 3693 { 3694 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3695 (__v8df)_mm512_cvtepu32_pd(__A), 3696 (__v8df)_mm512_setzero_pd()); 3697 } 3698 3699 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3700 _mm512_cvtepu32lo_pd(__m512i __A) 3701 { 3702 return (__m512d) 
_mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); 3703 } 3704 3705 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3706 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3707 { 3708 return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3709 } 3710 3711 #define _mm512_cvt_roundpd_ps(A, R) \ 3712 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3713 (__v8sf)_mm256_setzero_ps(), \ 3714 (__mmask8)-1, (int)(R))) 3715 3716 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ 3717 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3718 (__v8sf)(__m256)(W), (__mmask8)(U), \ 3719 (int)(R))) 3720 3721 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ 3722 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3723 (__v8sf)_mm256_setzero_ps(), \ 3724 (__mmask8)(U), (int)(R))) 3725 3726 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3727 _mm512_cvtpd_ps (__m512d __A) 3728 { 3729 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3730 (__v8sf) _mm256_undefined_ps (), 3731 (__mmask8) -1, 3732 _MM_FROUND_CUR_DIRECTION); 3733 } 3734 3735 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3736 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) 3737 { 3738 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3739 (__v8sf) __W, 3740 (__mmask8) __U, 3741 _MM_FROUND_CUR_DIRECTION); 3742 } 3743 3744 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3745 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) 3746 { 3747 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3748 (__v8sf) _mm256_setzero_ps (), 3749 (__mmask8) __U, 3750 _MM_FROUND_CUR_DIRECTION); 3751 } 3752 3753 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3754 _mm512_cvtpd_pslo (__m512d __A) 3755 { 3756 return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), 3757 (__v8sf) _mm256_setzero_ps (), 3758 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3759 } 3760 3761 static __inline__ __m512 
__DEFAULT_FN_ATTRS512 3762 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) 3763 { 3764 return (__m512) __builtin_shufflevector ( 3765 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), 3766 __U, __A), 3767 (__v8sf) _mm256_setzero_ps (), 3768 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3769 } 3770 3771 #define _mm512_cvt_roundps_ph(A, I) \ 3772 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3773 (__v16hi)_mm256_undefined_si256(), \ 3774 (__mmask16)-1)) 3775 3776 #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ 3777 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3778 (__v16hi)(__m256i)(U), \ 3779 (__mmask16)(W))) 3780 3781 #define _mm512_maskz_cvt_roundps_ph(W, A, I) \ 3782 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3783 (__v16hi)_mm256_setzero_si256(), \ 3784 (__mmask16)(W))) 3785 3786 #define _mm512_cvtps_ph _mm512_cvt_roundps_ph 3787 #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph 3788 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph 3789 3790 #define _mm512_cvt_roundph_ps(A, R) \ 3791 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3792 (__v16sf)_mm512_undefined_ps(), \ 3793 (__mmask16)-1, (int)(R))) 3794 3795 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ 3796 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3797 (__v16sf)(__m512)(W), \ 3798 (__mmask16)(U), (int)(R))) 3799 3800 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ 3801 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3802 (__v16sf)_mm512_setzero_ps(), \ 3803 (__mmask16)(U), (int)(R))) 3804 3805 3806 static __inline __m512 __DEFAULT_FN_ATTRS512 3807 _mm512_cvtph_ps(__m256i __A) 3808 { 3809 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3810 (__v16sf) 3811 _mm512_setzero_ps (), 3812 (__mmask16) -1, 3813 _MM_FROUND_CUR_DIRECTION); 3814 } 3815 3816 static __inline__ __m512 
__DEFAULT_FN_ATTRS512 3817 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) 3818 { 3819 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3820 (__v16sf) __W, 3821 (__mmask16) __U, 3822 _MM_FROUND_CUR_DIRECTION); 3823 } 3824 3825 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3826 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) 3827 { 3828 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3829 (__v16sf) _mm512_setzero_ps (), 3830 (__mmask16) __U, 3831 _MM_FROUND_CUR_DIRECTION); 3832 } 3833 3834 #define _mm512_cvtt_roundpd_epi32(A, R) \ 3835 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3836 (__v8si)_mm256_setzero_si256(), \ 3837 (__mmask8)-1, (int)(R))) 3838 3839 #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ 3840 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3841 (__v8si)(__m256i)(W), \ 3842 (__mmask8)(U), (int)(R))) 3843 3844 #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ 3845 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3846 (__v8si)_mm256_setzero_si256(), \ 3847 (__mmask8)(U), (int)(R))) 3848 3849 static __inline __m256i __DEFAULT_FN_ATTRS512 3850 _mm512_cvttpd_epi32(__m512d __a) 3851 { 3852 return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, 3853 (__v8si)_mm256_setzero_si256(), 3854 (__mmask8) -1, 3855 _MM_FROUND_CUR_DIRECTION); 3856 } 3857 3858 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3859 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3860 { 3861 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3862 (__v8si) __W, 3863 (__mmask8) __U, 3864 _MM_FROUND_CUR_DIRECTION); 3865 } 3866 3867 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3868 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) 3869 { 3870 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3871 (__v8si) _mm256_setzero_si256 (), 3872 (__mmask8) __U, 3873 _MM_FROUND_CUR_DIRECTION); 3874 } 3875 3876 #define 
_mm512_cvtt_roundps_epi32(A, R) \ 3877 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3878 (__v16si)_mm512_setzero_si512(), \ 3879 (__mmask16)-1, (int)(R))) 3880 3881 #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ 3882 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3883 (__v16si)(__m512i)(W), \ 3884 (__mmask16)(U), (int)(R))) 3885 3886 #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ 3887 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3888 (__v16si)_mm512_setzero_si512(), \ 3889 (__mmask16)(U), (int)(R))) 3890 3891 static __inline __m512i __DEFAULT_FN_ATTRS512 3892 _mm512_cvttps_epi32(__m512 __a) 3893 { 3894 return (__m512i) 3895 __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, 3896 (__v16si) _mm512_setzero_si512 (), 3897 (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); 3898 } 3899 3900 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3901 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3902 { 3903 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3904 (__v16si) __W, 3905 (__mmask16) __U, 3906 _MM_FROUND_CUR_DIRECTION); 3907 } 3908 3909 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3910 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) 3911 { 3912 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3913 (__v16si) _mm512_setzero_si512 (), 3914 (__mmask16) __U, 3915 _MM_FROUND_CUR_DIRECTION); 3916 } 3917 3918 #define _mm512_cvt_roundps_epi32(A, R) \ 3919 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3920 (__v16si)_mm512_setzero_si512(), \ 3921 (__mmask16)-1, (int)(R))) 3922 3923 #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ 3924 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3925 (__v16si)(__m512i)(W), \ 3926 (__mmask16)(U), (int)(R))) 3927 3928 #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ 3929 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3930 (__v16si)_mm512_setzero_si512(), \ 3931 
(__mmask16)(U), (int)(R))) 3932 3933 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3934 _mm512_cvtps_epi32 (__m512 __A) 3935 { 3936 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3937 (__v16si) _mm512_undefined_epi32 (), 3938 (__mmask16) -1, 3939 _MM_FROUND_CUR_DIRECTION); 3940 } 3941 3942 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3943 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3944 { 3945 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3946 (__v16si) __W, 3947 (__mmask16) __U, 3948 _MM_FROUND_CUR_DIRECTION); 3949 } 3950 3951 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3952 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) 3953 { 3954 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3955 (__v16si) 3956 _mm512_setzero_si512 (), 3957 (__mmask16) __U, 3958 _MM_FROUND_CUR_DIRECTION); 3959 } 3960 3961 #define _mm512_cvt_roundpd_epi32(A, R) \ 3962 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3963 (__v8si)_mm256_setzero_si256(), \ 3964 (__mmask8)-1, (int)(R))) 3965 3966 #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ 3967 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3968 (__v8si)(__m256i)(W), \ 3969 (__mmask8)(U), (int)(R))) 3970 3971 #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ 3972 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3973 (__v8si)_mm256_setzero_si256(), \ 3974 (__mmask8)(U), (int)(R))) 3975 3976 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3977 _mm512_cvtpd_epi32 (__m512d __A) 3978 { 3979 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3980 (__v8si) 3981 _mm256_undefined_si256 (), 3982 (__mmask8) -1, 3983 _MM_FROUND_CUR_DIRECTION); 3984 } 3985 3986 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3987 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3988 { 3989 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3990 (__v8si) __W, 3991 (__mmask8) __U, 3992 
_MM_FROUND_CUR_DIRECTION); 3993 } 3994 3995 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3996 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) 3997 { 3998 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3999 (__v8si) 4000 _mm256_setzero_si256 (), 4001 (__mmask8) __U, 4002 _MM_FROUND_CUR_DIRECTION); 4003 } 4004 4005 #define _mm512_cvt_roundps_epu32(A, R) \ 4006 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4007 (__v16si)_mm512_setzero_si512(), \ 4008 (__mmask16)-1, (int)(R))) 4009 4010 #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ 4011 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4012 (__v16si)(__m512i)(W), \ 4013 (__mmask16)(U), (int)(R))) 4014 4015 #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ 4016 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4017 (__v16si)_mm512_setzero_si512(), \ 4018 (__mmask16)(U), (int)(R))) 4019 4020 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4021 _mm512_cvtps_epu32 ( __m512 __A) 4022 { 4023 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\ 4024 (__v16si)\ 4025 _mm512_undefined_epi32 (), 4026 (__mmask16) -1,\ 4027 _MM_FROUND_CUR_DIRECTION); 4028 } 4029 4030 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4031 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 4032 { 4033 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4034 (__v16si) __W, 4035 (__mmask16) __U, 4036 _MM_FROUND_CUR_DIRECTION); 4037 } 4038 4039 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4040 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) 4041 { 4042 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4043 (__v16si) 4044 _mm512_setzero_si512 (), 4045 (__mmask16) __U , 4046 _MM_FROUND_CUR_DIRECTION); 4047 } 4048 4049 #define _mm512_cvt_roundpd_epu32(A, R) \ 4050 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4051 (__v8si)_mm256_setzero_si256(), \ 4052 (__mmask8)-1, (int)(R))) 4053 4054 
#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ 4055 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4056 (__v8si)(__m256i)(W), \ 4057 (__mmask8)(U), (int)(R))) 4058 4059 #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ 4060 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4061 (__v8si)_mm256_setzero_si256(), \ 4062 (__mmask8)(U), (int)(R))) 4063 4064 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4065 _mm512_cvtpd_epu32 (__m512d __A) 4066 { 4067 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4068 (__v8si) 4069 _mm256_undefined_si256 (), 4070 (__mmask8) -1, 4071 _MM_FROUND_CUR_DIRECTION); 4072 } 4073 4074 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4075 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) 4076 { 4077 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4078 (__v8si) __W, 4079 (__mmask8) __U, 4080 _MM_FROUND_CUR_DIRECTION); 4081 } 4082 4083 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4084 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) 4085 { 4086 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4087 (__v8si) 4088 _mm256_setzero_si256 (), 4089 (__mmask8) __U, 4090 _MM_FROUND_CUR_DIRECTION); 4091 } 4092 4093 static __inline__ double __DEFAULT_FN_ATTRS512 4094 _mm512_cvtsd_f64(__m512d __a) 4095 { 4096 return __a[0]; 4097 } 4098 4099 static __inline__ float __DEFAULT_FN_ATTRS512 4100 _mm512_cvtss_f32(__m512 __a) 4101 { 4102 return __a[0]; 4103 } 4104 4105 /* Unpack and Interleave */ 4106 4107 static __inline __m512d __DEFAULT_FN_ATTRS512 4108 _mm512_unpackhi_pd(__m512d __a, __m512d __b) 4109 { 4110 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4111 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4112 } 4113 4114 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4115 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4116 { 4117 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4118 
(__v8df)_mm512_unpackhi_pd(__A, __B), 4119 (__v8df)__W); 4120 } 4121 4122 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4123 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) 4124 { 4125 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4126 (__v8df)_mm512_unpackhi_pd(__A, __B), 4127 (__v8df)_mm512_setzero_pd()); 4128 } 4129 4130 static __inline __m512d __DEFAULT_FN_ATTRS512 4131 _mm512_unpacklo_pd(__m512d __a, __m512d __b) 4132 { 4133 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4134 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4135 } 4136 4137 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4138 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4139 { 4140 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4141 (__v8df)_mm512_unpacklo_pd(__A, __B), 4142 (__v8df)__W); 4143 } 4144 4145 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4146 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) 4147 { 4148 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4149 (__v8df)_mm512_unpacklo_pd(__A, __B), 4150 (__v8df)_mm512_setzero_pd()); 4151 } 4152 4153 static __inline __m512 __DEFAULT_FN_ATTRS512 4154 _mm512_unpackhi_ps(__m512 __a, __m512 __b) 4155 { 4156 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4157 2, 18, 3, 19, 4158 2+4, 18+4, 3+4, 19+4, 4159 2+8, 18+8, 3+8, 19+8, 4160 2+12, 18+12, 3+12, 19+12); 4161 } 4162 4163 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4164 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4165 { 4166 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4167 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4168 (__v16sf)__W); 4169 } 4170 4171 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4172 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) 4173 { 4174 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4175 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4176 
(__v16sf)_mm512_setzero_ps()); 4177 } 4178 4179 static __inline __m512 __DEFAULT_FN_ATTRS512 4180 _mm512_unpacklo_ps(__m512 __a, __m512 __b) 4181 { 4182 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4183 0, 16, 1, 17, 4184 0+4, 16+4, 1+4, 17+4, 4185 0+8, 16+8, 1+8, 17+8, 4186 0+12, 16+12, 1+12, 17+12); 4187 } 4188 4189 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4190 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4191 { 4192 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4193 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4194 (__v16sf)__W); 4195 } 4196 4197 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4198 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) 4199 { 4200 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4201 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4202 (__v16sf)_mm512_setzero_ps()); 4203 } 4204 4205 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4206 _mm512_unpackhi_epi32(__m512i __A, __m512i __B) 4207 { 4208 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4209 2, 18, 3, 19, 4210 2+4, 18+4, 3+4, 19+4, 4211 2+8, 18+8, 3+8, 19+8, 4212 2+12, 18+12, 3+12, 19+12); 4213 } 4214 4215 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4216 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4217 { 4218 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4219 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4220 (__v16si)__W); 4221 } 4222 4223 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4224 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4225 { 4226 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4227 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4228 (__v16si)_mm512_setzero_si512()); 4229 } 4230 4231 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4232 _mm512_unpacklo_epi32(__m512i __A, __m512i __B) 4233 { 4234 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4235 0, 16, 1, 17, 
4236 0+4, 16+4, 1+4, 17+4, 4237 0+8, 16+8, 1+8, 17+8, 4238 0+12, 16+12, 1+12, 17+12); 4239 } 4240 4241 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4242 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4243 { 4244 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4245 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4246 (__v16si)__W); 4247 } 4248 4249 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4250 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4251 { 4252 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4253 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4254 (__v16si)_mm512_setzero_si512()); 4255 } 4256 4257 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4258 _mm512_unpackhi_epi64(__m512i __A, __m512i __B) 4259 { 4260 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4261 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4262 } 4263 4264 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4265 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4266 { 4267 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4268 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4269 (__v8di)__W); 4270 } 4271 4272 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4273 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) 4274 { 4275 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4276 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4277 (__v8di)_mm512_setzero_si512()); 4278 } 4279 4280 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4281 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B) 4282 { 4283 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4284 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4285 } 4286 4287 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4288 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4289 { 4290 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4291 (__v8di)_mm512_unpacklo_epi64(__A, __B), 
4292 (__v8di)__W); 4293 } 4294 4295 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4296 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4297 { 4298 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4299 (__v8di)_mm512_unpacklo_epi64(__A, __B), 4300 (__v8di)_mm512_setzero_si512()); 4301 } 4302 4303 4304 /* SIMD load ops */ 4305 4306 static __inline __m512i __DEFAULT_FN_ATTRS512 4307 _mm512_loadu_si512 (void const *__P) 4308 { 4309 struct __loadu_si512 { 4310 __m512i_u __v; 4311 } __attribute__((__packed__, __may_alias__)); 4312 return ((const struct __loadu_si512*)__P)->__v; 4313 } 4314 4315 static __inline __m512i __DEFAULT_FN_ATTRS512 4316 _mm512_loadu_epi32 (void const *__P) 4317 { 4318 struct __loadu_epi32 { 4319 __m512i_u __v; 4320 } __attribute__((__packed__, __may_alias__)); 4321 return ((const struct __loadu_epi32*)__P)->__v; 4322 } 4323 4324 static __inline __m512i __DEFAULT_FN_ATTRS512 4325 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) 4326 { 4327 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, 4328 (__v16si) __W, 4329 (__mmask16) __U); 4330 } 4331 4332 4333 static __inline __m512i __DEFAULT_FN_ATTRS512 4334 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) 4335 { 4336 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, 4337 (__v16si) 4338 _mm512_setzero_si512 (), 4339 (__mmask16) __U); 4340 } 4341 4342 static __inline __m512i __DEFAULT_FN_ATTRS512 4343 _mm512_loadu_epi64 (void const *__P) 4344 { 4345 struct __loadu_epi64 { 4346 __m512i_u __v; 4347 } __attribute__((__packed__, __may_alias__)); 4348 return ((const struct __loadu_epi64*)__P)->__v; 4349 } 4350 4351 static __inline __m512i __DEFAULT_FN_ATTRS512 4352 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) 4353 { 4354 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, 4355 (__v8di) __W, 4356 (__mmask8) __U); 4357 } 4358 4359 static __inline __m512i 
__DEFAULT_FN_ATTRS512 4360 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) 4361 { 4362 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, 4363 (__v8di) 4364 _mm512_setzero_si512 (), 4365 (__mmask8) __U); 4366 } 4367 4368 static __inline __m512 __DEFAULT_FN_ATTRS512 4369 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) 4370 { 4371 return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, 4372 (__v16sf) __W, 4373 (__mmask16) __U); 4374 } 4375 4376 static __inline __m512 __DEFAULT_FN_ATTRS512 4377 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) 4378 { 4379 return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, 4380 (__v16sf) 4381 _mm512_setzero_ps (), 4382 (__mmask16) __U); 4383 } 4384 4385 static __inline __m512d __DEFAULT_FN_ATTRS512 4386 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) 4387 { 4388 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, 4389 (__v8df) __W, 4390 (__mmask8) __U); 4391 } 4392 4393 static __inline __m512d __DEFAULT_FN_ATTRS512 4394 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) 4395 { 4396 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P, 4397 (__v8df) 4398 _mm512_setzero_pd (), 4399 (__mmask8) __U); 4400 } 4401 4402 static __inline __m512d __DEFAULT_FN_ATTRS512 4403 _mm512_loadu_pd(void const *__p) 4404 { 4405 struct __loadu_pd { 4406 __m512d_u __v; 4407 } __attribute__((__packed__, __may_alias__)); 4408 return ((const struct __loadu_pd*)__p)->__v; 4409 } 4410 4411 static __inline __m512 __DEFAULT_FN_ATTRS512 4412 _mm512_loadu_ps(void const *__p) 4413 { 4414 struct __loadu_ps { 4415 __m512_u __v; 4416 } __attribute__((__packed__, __may_alias__)); 4417 return ((const struct __loadu_ps*)__p)->__v; 4418 } 4419 4420 static __inline __m512 __DEFAULT_FN_ATTRS512 4421 _mm512_load_ps(void const *__p) 4422 { 4423 return *(const __m512*)__p; 4424 } 4425 4426 static __inline __m512 __DEFAULT_FN_ATTRS512 4427 
_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) 4428 { 4429 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, 4430 (__v16sf) __W, 4431 (__mmask16) __U); 4432 } 4433 4434 static __inline __m512 __DEFAULT_FN_ATTRS512 4435 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) 4436 { 4437 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, 4438 (__v16sf) 4439 _mm512_setzero_ps (), 4440 (__mmask16) __U); 4441 } 4442 4443 static __inline __m512d __DEFAULT_FN_ATTRS512 4444 _mm512_load_pd(void const *__p) 4445 { 4446 return *(const __m512d*)__p; 4447 } 4448 4449 static __inline __m512d __DEFAULT_FN_ATTRS512 4450 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) 4451 { 4452 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, 4453 (__v8df) __W, 4454 (__mmask8) __U); 4455 } 4456 4457 static __inline __m512d __DEFAULT_FN_ATTRS512 4458 _mm512_maskz_load_pd(__mmask8 __U, void const *__P) 4459 { 4460 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, 4461 (__v8df) 4462 _mm512_setzero_pd (), 4463 (__mmask8) __U); 4464 } 4465 4466 static __inline __m512i __DEFAULT_FN_ATTRS512 4467 _mm512_load_si512 (void const *__P) 4468 { 4469 return *(const __m512i *) __P; 4470 } 4471 4472 static __inline __m512i __DEFAULT_FN_ATTRS512 4473 _mm512_load_epi32 (void const *__P) 4474 { 4475 return *(const __m512i *) __P; 4476 } 4477 4478 static __inline __m512i __DEFAULT_FN_ATTRS512 4479 _mm512_load_epi64 (void const *__P) 4480 { 4481 return *(const __m512i *) __P; 4482 } 4483 4484 /* SIMD store ops */ 4485 4486 static __inline void __DEFAULT_FN_ATTRS512 4487 _mm512_storeu_epi64 (void *__P, __m512i __A) 4488 { 4489 struct __storeu_epi64 { 4490 __m512i_u __v; 4491 } __attribute__((__packed__, __may_alias__)); 4492 ((struct __storeu_epi64*)__P)->__v = __A; 4493 } 4494 4495 static __inline void __DEFAULT_FN_ATTRS512 4496 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) 4497 { 4498 
__builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, 4499 (__mmask8) __U); 4500 } 4501 4502 static __inline void __DEFAULT_FN_ATTRS512 4503 _mm512_storeu_si512 (void *__P, __m512i __A) 4504 { 4505 struct __storeu_si512 { 4506 __m512i_u __v; 4507 } __attribute__((__packed__, __may_alias__)); 4508 ((struct __storeu_si512*)__P)->__v = __A; 4509 } 4510 4511 static __inline void __DEFAULT_FN_ATTRS512 4512 _mm512_storeu_epi32 (void *__P, __m512i __A) 4513 { 4514 struct __storeu_epi32 { 4515 __m512i_u __v; 4516 } __attribute__((__packed__, __may_alias__)); 4517 ((struct __storeu_epi32*)__P)->__v = __A; 4518 } 4519 4520 static __inline void __DEFAULT_FN_ATTRS512 4521 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) 4522 { 4523 __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, 4524 (__mmask16) __U); 4525 } 4526 4527 static __inline void __DEFAULT_FN_ATTRS512 4528 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) 4529 { 4530 __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); 4531 } 4532 4533 static __inline void __DEFAULT_FN_ATTRS512 4534 _mm512_storeu_pd(void *__P, __m512d __A) 4535 { 4536 struct __storeu_pd { 4537 __m512d_u __v; 4538 } __attribute__((__packed__, __may_alias__)); 4539 ((struct __storeu_pd*)__P)->__v = __A; 4540 } 4541 4542 static __inline void __DEFAULT_FN_ATTRS512 4543 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) 4544 { 4545 __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, 4546 (__mmask16) __U); 4547 } 4548 4549 static __inline void __DEFAULT_FN_ATTRS512 4550 _mm512_storeu_ps(void *__P, __m512 __A) 4551 { 4552 struct __storeu_ps { 4553 __m512_u __v; 4554 } __attribute__((__packed__, __may_alias__)); 4555 ((struct __storeu_ps*)__P)->__v = __A; 4556 } 4557 4558 static __inline void __DEFAULT_FN_ATTRS512 4559 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) 4560 { 4561 __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) 
__U); 4562 } 4563 4564 static __inline void __DEFAULT_FN_ATTRS512 4565 _mm512_store_pd(void *__P, __m512d __A) 4566 { 4567 *(__m512d*)__P = __A; 4568 } 4569 4570 static __inline void __DEFAULT_FN_ATTRS512 4571 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) 4572 { 4573 __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, 4574 (__mmask16) __U); 4575 } 4576 4577 static __inline void __DEFAULT_FN_ATTRS512 4578 _mm512_store_ps(void *__P, __m512 __A) 4579 { 4580 *(__m512*)__P = __A; 4581 } 4582 4583 static __inline void __DEFAULT_FN_ATTRS512 4584 _mm512_store_si512 (void *__P, __m512i __A) 4585 { 4586 *(__m512i *) __P = __A; 4587 } 4588 4589 static __inline void __DEFAULT_FN_ATTRS512 4590 _mm512_store_epi32 (void *__P, __m512i __A) 4591 { 4592 *(__m512i *) __P = __A; 4593 } 4594 4595 static __inline void __DEFAULT_FN_ATTRS512 4596 _mm512_store_epi64 (void *__P, __m512i __A) 4597 { 4598 *(__m512i *) __P = __A; 4599 } 4600 4601 /* Mask ops */ 4602 4603 static __inline __mmask16 __DEFAULT_FN_ATTRS 4604 _mm512_knot(__mmask16 __M) 4605 { 4606 return __builtin_ia32_knothi(__M); 4607 } 4608 4609 /* Integer compare */ 4610 4611 #define _mm512_cmpeq_epi32_mask(A, B) \ 4612 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) 4613 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ 4614 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) 4615 #define _mm512_cmpge_epi32_mask(A, B) \ 4616 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) 4617 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \ 4618 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) 4619 #define _mm512_cmpgt_epi32_mask(A, B) \ 4620 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) 4621 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \ 4622 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) 4623 #define _mm512_cmple_epi32_mask(A, B) \ 4624 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) 4625 #define _mm512_mask_cmple_epi32_mask(k, A, B) \ 4626 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) 
4627 #define _mm512_cmplt_epi32_mask(A, B) \ 4628 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) 4629 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \ 4630 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) 4631 #define _mm512_cmpneq_epi32_mask(A, B) \ 4632 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) 4633 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \ 4634 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) 4635 4636 #define _mm512_cmpeq_epu32_mask(A, B) \ 4637 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) 4638 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \ 4639 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) 4640 #define _mm512_cmpge_epu32_mask(A, B) \ 4641 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) 4642 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \ 4643 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) 4644 #define _mm512_cmpgt_epu32_mask(A, B) \ 4645 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) 4646 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \ 4647 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) 4648 #define _mm512_cmple_epu32_mask(A, B) \ 4649 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) 4650 #define _mm512_mask_cmple_epu32_mask(k, A, B) \ 4651 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) 4652 #define _mm512_cmplt_epu32_mask(A, B) \ 4653 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) 4654 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \ 4655 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) 4656 #define _mm512_cmpneq_epu32_mask(A, B) \ 4657 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) 4658 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \ 4659 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) 4660 4661 #define _mm512_cmpeq_epi64_mask(A, B) \ 4662 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) 4663 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \ 4664 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) 4665 #define _mm512_cmpge_epi64_mask(A, B) \ 4666 _mm512_cmp_epi64_mask((A), 
(B), _MM_CMPINT_GE) 4667 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \ 4668 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) 4669 #define _mm512_cmpgt_epi64_mask(A, B) \ 4670 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) 4671 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \ 4672 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) 4673 #define _mm512_cmple_epi64_mask(A, B) \ 4674 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) 4675 #define _mm512_mask_cmple_epi64_mask(k, A, B) \ 4676 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) 4677 #define _mm512_cmplt_epi64_mask(A, B) \ 4678 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) 4679 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \ 4680 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) 4681 #define _mm512_cmpneq_epi64_mask(A, B) \ 4682 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) 4683 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \ 4684 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) 4685 4686 #define _mm512_cmpeq_epu64_mask(A, B) \ 4687 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) 4688 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \ 4689 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) 4690 #define _mm512_cmpge_epu64_mask(A, B) \ 4691 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) 4692 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \ 4693 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) 4694 #define _mm512_cmpgt_epu64_mask(A, B) \ 4695 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) 4696 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \ 4697 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) 4698 #define _mm512_cmple_epu64_mask(A, B) \ 4699 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) 4700 #define _mm512_mask_cmple_epu64_mask(k, A, B) \ 4701 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) 4702 #define _mm512_cmplt_epu64_mask(A, B) \ 4703 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) 4704 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \ 4705 
_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) 4706 #define _mm512_cmpneq_epu64_mask(A, B) \ 4707 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) 4708 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ 4709 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) 4710 4711 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4712 _mm512_cvtepi8_epi32(__m128i __A) 4713 { 4714 /* This function always performs a signed extension, but __v16qi is a char 4715 which may be signed or unsigned, so use __v16qs. */ 4716 return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); 4717 } 4718 4719 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4720 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4721 { 4722 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4723 (__v16si)_mm512_cvtepi8_epi32(__A), 4724 (__v16si)__W); 4725 } 4726 4727 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4728 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) 4729 { 4730 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4731 (__v16si)_mm512_cvtepi8_epi32(__A), 4732 (__v16si)_mm512_setzero_si512()); 4733 } 4734 4735 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4736 _mm512_cvtepi8_epi64(__m128i __A) 4737 { 4738 /* This function always performs a signed extension, but __v16qi is a char 4739 which may be signed or unsigned, so use __v16qs. 
*/ 4740 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4741 } 4742 4743 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4744 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4745 { 4746 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4747 (__v8di)_mm512_cvtepi8_epi64(__A), 4748 (__v8di)__W); 4749 } 4750 4751 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4752 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) 4753 { 4754 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4755 (__v8di)_mm512_cvtepi8_epi64(__A), 4756 (__v8di)_mm512_setzero_si512 ()); 4757 } 4758 4759 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4760 _mm512_cvtepi32_epi64(__m256i __X) 4761 { 4762 return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); 4763 } 4764 4765 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4766 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4767 { 4768 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4769 (__v8di)_mm512_cvtepi32_epi64(__X), 4770 (__v8di)__W); 4771 } 4772 4773 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4774 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) 4775 { 4776 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4777 (__v8di)_mm512_cvtepi32_epi64(__X), 4778 (__v8di)_mm512_setzero_si512()); 4779 } 4780 4781 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4782 _mm512_cvtepi16_epi32(__m256i __A) 4783 { 4784 return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); 4785 } 4786 4787 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4788 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4789 { 4790 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4791 (__v16si)_mm512_cvtepi16_epi32(__A), 4792 (__v16si)__W); 4793 } 4794 4795 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4796 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) 4797 { 4798 return 
(__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4799 (__v16si)_mm512_cvtepi16_epi32(__A), 4800 (__v16si)_mm512_setzero_si512 ()); 4801 } 4802 4803 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4804 _mm512_cvtepi16_epi64(__m128i __A) 4805 { 4806 return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); 4807 } 4808 4809 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4810 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4811 { 4812 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4813 (__v8di)_mm512_cvtepi16_epi64(__A), 4814 (__v8di)__W); 4815 } 4816 4817 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4818 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) 4819 { 4820 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4821 (__v8di)_mm512_cvtepi16_epi64(__A), 4822 (__v8di)_mm512_setzero_si512()); 4823 } 4824 4825 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4826 _mm512_cvtepu8_epi32(__m128i __A) 4827 { 4828 return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); 4829 } 4830 4831 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4832 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4833 { 4834 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4835 (__v16si)_mm512_cvtepu8_epi32(__A), 4836 (__v16si)__W); 4837 } 4838 4839 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4840 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) 4841 { 4842 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4843 (__v16si)_mm512_cvtepu8_epi32(__A), 4844 (__v16si)_mm512_setzero_si512()); 4845 } 4846 4847 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4848 _mm512_cvtepu8_epi64(__m128i __A) 4849 { 4850 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4851 } 4852 4853 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4854 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4855 { 4856 return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4857 (__v8di)_mm512_cvtepu8_epi64(__A), 4858 (__v8di)__W); 4859 } 4860 4861 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4862 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) 4863 { 4864 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4865 (__v8di)_mm512_cvtepu8_epi64(__A), 4866 (__v8di)_mm512_setzero_si512()); 4867 } 4868 4869 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4870 _mm512_cvtepu32_epi64(__m256i __X) 4871 { 4872 return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); 4873 } 4874 4875 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4876 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4877 { 4878 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4879 (__v8di)_mm512_cvtepu32_epi64(__X), 4880 (__v8di)__W); 4881 } 4882 4883 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4884 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) 4885 { 4886 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4887 (__v8di)_mm512_cvtepu32_epi64(__X), 4888 (__v8di)_mm512_setzero_si512()); 4889 } 4890 4891 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4892 _mm512_cvtepu16_epi32(__m256i __A) 4893 { 4894 return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); 4895 } 4896 4897 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4898 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4899 { 4900 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4901 (__v16si)_mm512_cvtepu16_epi32(__A), 4902 (__v16si)__W); 4903 } 4904 4905 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4906 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) 4907 { 4908 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4909 (__v16si)_mm512_cvtepu16_epi32(__A), 4910 (__v16si)_mm512_setzero_si512()); 4911 } 4912 4913 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4914 _mm512_cvtepu16_epi64(__m128i __A) 4915 { 4916 return 
(__m512i)__builtin_convertvector((__v8hu)__A, __v8di); 4917 } 4918 4919 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4920 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4921 { 4922 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4923 (__v8di)_mm512_cvtepu16_epi64(__A), 4924 (__v8di)__W); 4925 } 4926 4927 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4928 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) 4929 { 4930 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4931 (__v8di)_mm512_cvtepu16_epi64(__A), 4932 (__v8di)_mm512_setzero_si512()); 4933 } 4934 4935 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4936 _mm512_rorv_epi32 (__m512i __A, __m512i __B) 4937 { 4938 return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); 4939 } 4940 4941 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4942 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4943 { 4944 return (__m512i)__builtin_ia32_selectd_512(__U, 4945 (__v16si)_mm512_rorv_epi32(__A, __B), 4946 (__v16si)__W); 4947 } 4948 4949 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4950 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 4951 { 4952 return (__m512i)__builtin_ia32_selectd_512(__U, 4953 (__v16si)_mm512_rorv_epi32(__A, __B), 4954 (__v16si)_mm512_setzero_si512()); 4955 } 4956 4957 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4958 _mm512_rorv_epi64 (__m512i __A, __m512i __B) 4959 { 4960 return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); 4961 } 4962 4963 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4964 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4965 { 4966 return (__m512i)__builtin_ia32_selectq_512(__U, 4967 (__v8di)_mm512_rorv_epi64(__A, __B), 4968 (__v8di)__W); 4969 } 4970 4971 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4972 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4973 { 4974 return (__m512i)__builtin_ia32_selectq_512(__U, 
4975 (__v8di)_mm512_rorv_epi64(__A, __B), 4976 (__v8di)_mm512_setzero_si512()); 4977 } 4978 4979 4980 4981 #define _mm512_cmp_epi32_mask(a, b, p) \ 4982 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4983 (__v16si)(__m512i)(b), (int)(p), \ 4984 (__mmask16)-1)) 4985 4986 #define _mm512_cmp_epu32_mask(a, b, p) \ 4987 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 4988 (__v16si)(__m512i)(b), (int)(p), \ 4989 (__mmask16)-1)) 4990 4991 #define _mm512_cmp_epi64_mask(a, b, p) \ 4992 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 4993 (__v8di)(__m512i)(b), (int)(p), \ 4994 (__mmask8)-1)) 4995 4996 #define _mm512_cmp_epu64_mask(a, b, p) \ 4997 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 4998 (__v8di)(__m512i)(b), (int)(p), \ 4999 (__mmask8)-1)) 5000 5001 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ 5002 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 5003 (__v16si)(__m512i)(b), (int)(p), \ 5004 (__mmask16)(m))) 5005 5006 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ 5007 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 5008 (__v16si)(__m512i)(b), (int)(p), \ 5009 (__mmask16)(m))) 5010 5011 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ 5012 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 5013 (__v8di)(__m512i)(b), (int)(p), \ 5014 (__mmask8)(m))) 5015 5016 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ 5017 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 5018 (__v8di)(__m512i)(b), (int)(p), \ 5019 (__mmask8)(m))) 5020 5021 #define _mm512_rol_epi32(a, b) \ 5022 ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) 5023 5024 #define _mm512_mask_rol_epi32(W, U, a, b) \ 5025 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5026 (__v16si)_mm512_rol_epi32((a), (b)), \ 5027 (__v16si)(__m512i)(W))) 5028 5029 #define _mm512_maskz_rol_epi32(U, a, b) \ 5030 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5031 
(__v16si)_mm512_rol_epi32((a), (b)), \ 5032 (__v16si)_mm512_setzero_si512())) 5033 5034 #define _mm512_rol_epi64(a, b) \ 5035 ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) 5036 5037 #define _mm512_mask_rol_epi64(W, U, a, b) \ 5038 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5039 (__v8di)_mm512_rol_epi64((a), (b)), \ 5040 (__v8di)(__m512i)(W))) 5041 5042 #define _mm512_maskz_rol_epi64(U, a, b) \ 5043 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5044 (__v8di)_mm512_rol_epi64((a), (b)), \ 5045 (__v8di)_mm512_setzero_si512())) 5046 5047 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5048 _mm512_rolv_epi32 (__m512i __A, __m512i __B) 5049 { 5050 return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); 5051 } 5052 5053 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5054 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 5055 { 5056 return (__m512i)__builtin_ia32_selectd_512(__U, 5057 (__v16si)_mm512_rolv_epi32(__A, __B), 5058 (__v16si)__W); 5059 } 5060 5061 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5062 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 5063 { 5064 return (__m512i)__builtin_ia32_selectd_512(__U, 5065 (__v16si)_mm512_rolv_epi32(__A, __B), 5066 (__v16si)_mm512_setzero_si512()); 5067 } 5068 5069 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5070 _mm512_rolv_epi64 (__m512i __A, __m512i __B) 5071 { 5072 return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); 5073 } 5074 5075 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5076 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 5077 { 5078 return (__m512i)__builtin_ia32_selectq_512(__U, 5079 (__v8di)_mm512_rolv_epi64(__A, __B), 5080 (__v8di)__W); 5081 } 5082 5083 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5084 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 5085 { 5086 return (__m512i)__builtin_ia32_selectq_512(__U, 5087 
(__v8di)_mm512_rolv_epi64(__A, __B), 5088 (__v8di)_mm512_setzero_si512()); 5089 } 5090 5091 #define _mm512_ror_epi32(A, B) \ 5092 ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) 5093 5094 #define _mm512_mask_ror_epi32(W, U, A, B) \ 5095 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5096 (__v16si)_mm512_ror_epi32((A), (B)), \ 5097 (__v16si)(__m512i)(W))) 5098 5099 #define _mm512_maskz_ror_epi32(U, A, B) \ 5100 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5101 (__v16si)_mm512_ror_epi32((A), (B)), \ 5102 (__v16si)_mm512_setzero_si512())) 5103 5104 #define _mm512_ror_epi64(A, B) \ 5105 ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) 5106 5107 #define _mm512_mask_ror_epi64(W, U, A, B) \ 5108 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5109 (__v8di)_mm512_ror_epi64((A), (B)), \ 5110 (__v8di)(__m512i)(W))) 5111 5112 #define _mm512_maskz_ror_epi64(U, A, B) \ 5113 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5114 (__v8di)_mm512_ror_epi64((A), (B)), \ 5115 (__v8di)_mm512_setzero_si512())) 5116 5117 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5118 _mm512_slli_epi32(__m512i __A, unsigned int __B) 5119 { 5120 return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B); 5121 } 5122 5123 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5124 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, 5125 unsigned int __B) 5126 { 5127 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5128 (__v16si)_mm512_slli_epi32(__A, __B), 5129 (__v16si)__W); 5130 } 5131 5132 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5133 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { 5134 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5135 (__v16si)_mm512_slli_epi32(__A, __B), 5136 (__v16si)_mm512_setzero_si512()); 5137 } 5138 5139 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5140 _mm512_slli_epi64(__m512i __A, unsigned int __B) 5141 { 5142 return 
(__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B); 5143 } 5144 5145 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5146 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) 5147 { 5148 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5149 (__v8di)_mm512_slli_epi64(__A, __B), 5150 (__v8di)__W); 5151 } 5152 5153 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5154 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) 5155 { 5156 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5157 (__v8di)_mm512_slli_epi64(__A, __B), 5158 (__v8di)_mm512_setzero_si512()); 5159 } 5160 5161 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5162 _mm512_srli_epi32(__m512i __A, unsigned int __B) 5163 { 5164 return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B); 5165 } 5166 5167 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5168 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, 5169 unsigned int __B) 5170 { 5171 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5172 (__v16si)_mm512_srli_epi32(__A, __B), 5173 (__v16si)__W); 5174 } 5175 5176 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5177 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { 5178 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5179 (__v16si)_mm512_srli_epi32(__A, __B), 5180 (__v16si)_mm512_setzero_si512()); 5181 } 5182 5183 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5184 _mm512_srli_epi64(__m512i __A, unsigned int __B) 5185 { 5186 return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B); 5187 } 5188 5189 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5190 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, 5191 unsigned int __B) 5192 { 5193 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5194 (__v8di)_mm512_srli_epi64(__A, __B), 5195 (__v8di)__W); 5196 } 5197 5198 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5199 _mm512_maskz_srli_epi64(__mmask8 __U, 
__m512i __A, 5200 unsigned int __B) 5201 { 5202 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5203 (__v8di)_mm512_srli_epi64(__A, __B), 5204 (__v8di)_mm512_setzero_si512()); 5205 } 5206 5207 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5208 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) 5209 { 5210 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, 5211 (__v16si) __W, 5212 (__mmask16) __U); 5213 } 5214 5215 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5216 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) 5217 { 5218 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, 5219 (__v16si) 5220 _mm512_setzero_si512 (), 5221 (__mmask16) __U); 5222 } 5223 5224 static __inline__ void __DEFAULT_FN_ATTRS512 5225 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) 5226 { 5227 __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, 5228 (__mmask16) __U); 5229 } 5230 5231 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5232 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 5233 { 5234 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 5235 (__v16si) __A, 5236 (__v16si) __W); 5237 } 5238 5239 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5240 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) 5241 { 5242 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 5243 (__v16si) __A, 5244 (__v16si) _mm512_setzero_si512 ()); 5245 } 5246 5247 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5248 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 5249 { 5250 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 5251 (__v8di) __A, 5252 (__v8di) __W); 5253 } 5254 5255 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5256 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) 5257 { 5258 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 5259 (__v8di) __A, 5260 (__v8di) _mm512_setzero_si512 ()); 5261 } 5262 5263 
static __inline__ __m512i __DEFAULT_FN_ATTRS512 5264 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) 5265 { 5266 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, 5267 (__v8di) __W, 5268 (__mmask8) __U); 5269 } 5270 5271 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5272 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) 5273 { 5274 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, 5275 (__v8di) 5276 _mm512_setzero_si512 (), 5277 (__mmask8) __U); 5278 } 5279 5280 static __inline__ void __DEFAULT_FN_ATTRS512 5281 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) 5282 { 5283 __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, 5284 (__mmask8) __U); 5285 } 5286 5287 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5288 _mm512_movedup_pd (__m512d __A) 5289 { 5290 return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, 5291 0, 0, 2, 2, 4, 4, 6, 6); 5292 } 5293 5294 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5295 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) 5296 { 5297 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 5298 (__v8df)_mm512_movedup_pd(__A), 5299 (__v8df)__W); 5300 } 5301 5302 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5303 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) 5304 { 5305 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 5306 (__v8df)_mm512_movedup_pd(__A), 5307 (__v8df)_mm512_setzero_pd()); 5308 } 5309 5310 #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ 5311 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5312 (__v8df)(__m512d)(B), \ 5313 (__v8di)(__m512i)(C), (int)(imm), \ 5314 (__mmask8)-1, (int)(R))) 5315 5316 #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ 5317 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5318 (__v8df)(__m512d)(B), \ 5319 (__v8di)(__m512i)(C), (int)(imm), \ 5320 (__mmask8)(U), (int)(R))) 5321 5322 
/* Masked packed-double fixupimm using the current rounding direction.
 * A and B are passed as the two double vectors, C as the 64-bit integer
 * vector, imm as the immediate and U as the 8-bit lane mask. */
#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(imm), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

/* Unmasked form: the masked variant with every mask bit set. */
#define _mm512_fixupimm_pd(A, B, C, imm) \
  _mm512_mask_fixupimm_pd((A), (__mmask8)-1, (B), (C), (imm))

/* Packed-double fixupimm through the maskz builtin, with an explicit
 * rounding/SAE operand R and U as the 8-bit lane mask. */
#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

/* maskz form using the current rounding direction. */
#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  _mm512_maskz_fixupimm_round_pd((U), (A), (B), (C), (imm), \
                                 _MM_FROUND_CUR_DIRECTION)

/* Masked packed-float fixupimm with an explicit rounding/SAE operand R.
 * A and B are passed as the two float vectors, C as the 32-bit integer
 * vector, imm as the immediate and U as the 16-bit lane mask. */
#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), \
      (int)(imm), (__mmask16)(U), (int)(R)))

/* Unmasked form with explicit rounding: every mask bit set. */
#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  _mm512_mask_fixupimm_round_ps((A), (__mmask16)-1, (B), (C), (imm), (R))

/* Unmasked form using the current rounding direction. */
#define _mm512_fixupimm_ps(A, B, C, imm) \
  _mm512_mask_fixupimm_round_ps((A), (__mmask16)-1, (B), (C), (imm), \
                                _MM_FROUND_CUR_DIRECTION)

/* Masked form using the current rounding direction. */
#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  _mm512_mask_fixupimm_round_ps((A), (U), (B), (C), (imm), \
                                _MM_FROUND_CUR_DIRECTION)
5375 5376 #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ 5377 ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ 5378 (__v16sf)(__m512)(B), \ 5379 (__v16si)(__m512i)(C), \ 5380 (int)(imm), (__mmask16)(U), \ 5381 (int)(R))) 5382 5383 #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ 5384 ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ 5385 (__v16sf)(__m512)(B), \ 5386 (__v16si)(__m512i)(C), \ 5387 (int)(imm), (__mmask16)(U), \ 5388 _MM_FROUND_CUR_DIRECTION)) 5389 5390 #define _mm_fixupimm_round_sd(A, B, C, imm, R) \ 5391 ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5392 (__v2df)(__m128d)(B), \ 5393 (__v2di)(__m128i)(C), (int)(imm), \ 5394 (__mmask8)-1, (int)(R))) 5395 5396 #define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ 5397 ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5398 (__v2df)(__m128d)(B), \ 5399 (__v2di)(__m128i)(C), (int)(imm), \ 5400 (__mmask8)(U), (int)(R))) 5401 5402 #define _mm_fixupimm_sd(A, B, C, imm) \ 5403 ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5404 (__v2df)(__m128d)(B), \ 5405 (__v2di)(__m128i)(C), (int)(imm), \ 5406 (__mmask8)-1, \ 5407 _MM_FROUND_CUR_DIRECTION)) 5408 5409 #define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ 5410 ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5411 (__v2df)(__m128d)(B), \ 5412 (__v2di)(__m128i)(C), (int)(imm), \ 5413 (__mmask8)(U), \ 5414 _MM_FROUND_CUR_DIRECTION)) 5415 5416 #define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ 5417 ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ 5418 (__v2df)(__m128d)(B), \ 5419 (__v2di)(__m128i)(C), (int)(imm), \ 5420 (__mmask8)(U), (int)(R))) 5421 5422 #define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ 5423 ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ 5424 (__v2df)(__m128d)(B), \ 5425 (__v2di)(__m128i)(C), (int)(imm), \ 5426 (__mmask8)(U), \ 5427 _MM_FROUND_CUR_DIRECTION)) 5428 5429 #define 
_mm_fixupimm_round_ss(A, B, C, imm, R) \ 5430 ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5431 (__v4sf)(__m128)(B), \ 5432 (__v4si)(__m128i)(C), (int)(imm), \ 5433 (__mmask8)-1, (int)(R))) 5434 5435 #define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ 5436 ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5437 (__v4sf)(__m128)(B), \ 5438 (__v4si)(__m128i)(C), (int)(imm), \ 5439 (__mmask8)(U), (int)(R))) 5440 5441 #define _mm_fixupimm_ss(A, B, C, imm) \ 5442 ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5443 (__v4sf)(__m128)(B), \ 5444 (__v4si)(__m128i)(C), (int)(imm), \ 5445 (__mmask8)-1, \ 5446 _MM_FROUND_CUR_DIRECTION)) 5447 5448 #define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ 5449 ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5450 (__v4sf)(__m128)(B), \ 5451 (__v4si)(__m128i)(C), (int)(imm), \ 5452 (__mmask8)(U), \ 5453 _MM_FROUND_CUR_DIRECTION)) 5454 5455 #define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ 5456 ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ 5457 (__v4sf)(__m128)(B), \ 5458 (__v4si)(__m128i)(C), (int)(imm), \ 5459 (__mmask8)(U), (int)(R))) 5460 5461 #define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ 5462 ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ 5463 (__v4sf)(__m128)(B), \ 5464 (__v4si)(__m128i)(C), (int)(imm), \ 5465 (__mmask8)(U), \ 5466 _MM_FROUND_CUR_DIRECTION)) 5467 5468 #define _mm_getexp_round_sd(A, B, R) \ 5469 ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5470 (__v2df)(__m128d)(B), \ 5471 (__v2df)_mm_setzero_pd(), \ 5472 (__mmask8)-1, (int)(R))) 5473 5474 5475 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5476 _mm_getexp_sd (__m128d __A, __m128d __B) 5477 { 5478 return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, 5479 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); 5480 } 5481 5482 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5483 
_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 5484 { 5485 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, 5486 (__v2df) __B, 5487 (__v2df) __W, 5488 (__mmask8) __U, 5489 _MM_FROUND_CUR_DIRECTION); 5490 } 5491 5492 #define _mm_mask_getexp_round_sd(W, U, A, B, R) \ 5493 ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5494 (__v2df)(__m128d)(B), \ 5495 (__v2df)(__m128d)(W), \ 5496 (__mmask8)(U), (int)(R))) 5497 5498 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5499 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) 5500 { 5501 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, 5502 (__v2df) __B, 5503 (__v2df) _mm_setzero_pd (), 5504 (__mmask8) __U, 5505 _MM_FROUND_CUR_DIRECTION); 5506 } 5507 5508 #define _mm_maskz_getexp_round_sd(U, A, B, R) \ 5509 ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5510 (__v2df)(__m128d)(B), \ 5511 (__v2df)_mm_setzero_pd(), \ 5512 (__mmask8)(U), (int)(R))) 5513 5514 #define _mm_getexp_round_ss(A, B, R) \ 5515 ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5516 (__v4sf)(__m128)(B), \ 5517 (__v4sf)_mm_setzero_ps(), \ 5518 (__mmask8)-1, (int)(R))) 5519 5520 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5521 _mm_getexp_ss (__m128 __A, __m128 __B) 5522 { 5523 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5524 (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); 5525 } 5526 5527 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5528 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 5529 { 5530 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5531 (__v4sf) __B, 5532 (__v4sf) __W, 5533 (__mmask8) __U, 5534 _MM_FROUND_CUR_DIRECTION); 5535 } 5536 5537 #define _mm_mask_getexp_round_ss(W, U, A, B, R) \ 5538 ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5539 (__v4sf)(__m128)(B), \ 5540 
(__v4sf)(__m128)(W), \ 5541 (__mmask8)(U), (int)(R))) 5542 5543 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5544 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) 5545 { 5546 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5547 (__v4sf) __B, 5548 (__v4sf) _mm_setzero_ps (), 5549 (__mmask8) __U, 5550 _MM_FROUND_CUR_DIRECTION); 5551 } 5552 5553 #define _mm_maskz_getexp_round_ss(U, A, B, R) \ 5554 ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5555 (__v4sf)(__m128)(B), \ 5556 (__v4sf)_mm_setzero_ps(), \ 5557 (__mmask8)(U), (int)(R))) 5558 5559 #define _mm_getmant_round_sd(A, B, C, D, R) \ 5560 ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5561 (__v2df)(__m128d)(B), \ 5562 (int)(((D)<<2) | (C)), \ 5563 (__v2df)_mm_setzero_pd(), \ 5564 (__mmask8)-1, (int)(R))) 5565 5566 #define _mm_getmant_sd(A, B, C, D) \ 5567 ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5568 (__v2df)(__m128d)(B), \ 5569 (int)(((D)<<2) | (C)), \ 5570 (__v2df)_mm_setzero_pd(), \ 5571 (__mmask8)-1, \ 5572 _MM_FROUND_CUR_DIRECTION)) 5573 5574 #define _mm_mask_getmant_sd(W, U, A, B, C, D) \ 5575 ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5576 (__v2df)(__m128d)(B), \ 5577 (int)(((D)<<2) | (C)), \ 5578 (__v2df)(__m128d)(W), \ 5579 (__mmask8)(U), \ 5580 _MM_FROUND_CUR_DIRECTION)) 5581 5582 #define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ 5583 ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5584 (__v2df)(__m128d)(B), \ 5585 (int)(((D)<<2) | (C)), \ 5586 (__v2df)(__m128d)(W), \ 5587 (__mmask8)(U), (int)(R))) 5588 5589 #define _mm_maskz_getmant_sd(U, A, B, C, D) \ 5590 ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5591 (__v2df)(__m128d)(B), \ 5592 (int)(((D)<<2) | (C)), \ 5593 (__v2df)_mm_setzero_pd(), \ 5594 (__mmask8)(U), \ 5595 _MM_FROUND_CUR_DIRECTION)) 5596 5597 #define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ 5598 
((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5599 (__v2df)(__m128d)(B), \ 5600 (int)(((D)<<2) | (C)), \ 5601 (__v2df)_mm_setzero_pd(), \ 5602 (__mmask8)(U), (int)(R))) 5603 5604 #define _mm_getmant_round_ss(A, B, C, D, R) \ 5605 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5606 (__v4sf)(__m128)(B), \ 5607 (int)(((D)<<2) | (C)), \ 5608 (__v4sf)_mm_setzero_ps(), \ 5609 (__mmask8)-1, (int)(R))) 5610 5611 #define _mm_getmant_ss(A, B, C, D) \ 5612 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5613 (__v4sf)(__m128)(B), \ 5614 (int)(((D)<<2) | (C)), \ 5615 (__v4sf)_mm_setzero_ps(), \ 5616 (__mmask8)-1, \ 5617 _MM_FROUND_CUR_DIRECTION)) 5618 5619 #define _mm_mask_getmant_ss(W, U, A, B, C, D) \ 5620 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5621 (__v4sf)(__m128)(B), \ 5622 (int)(((D)<<2) | (C)), \ 5623 (__v4sf)(__m128)(W), \ 5624 (__mmask8)(U), \ 5625 _MM_FROUND_CUR_DIRECTION)) 5626 5627 #define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ 5628 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5629 (__v4sf)(__m128)(B), \ 5630 (int)(((D)<<2) | (C)), \ 5631 (__v4sf)(__m128)(W), \ 5632 (__mmask8)(U), (int)(R))) 5633 5634 #define _mm_maskz_getmant_ss(U, A, B, C, D) \ 5635 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5636 (__v4sf)(__m128)(B), \ 5637 (int)(((D)<<2) | (C)), \ 5638 (__v4sf)_mm_setzero_ps(), \ 5639 (__mmask8)(U), \ 5640 _MM_FROUND_CUR_DIRECTION)) 5641 5642 #define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ 5643 ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5644 (__v4sf)(__m128)(B), \ 5645 (int)(((D)<<2) | (C)), \ 5646 (__v4sf)_mm_setzero_ps(), \ 5647 (__mmask8)(U), (int)(R))) 5648 5649 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 5650 _mm512_kmov (__mmask16 __A) 5651 { 5652 return __A; 5653 } 5654 5655 #define _mm_comi_round_sd(A, B, P, R) \ 5656 ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), 
(__v2df)(__m128d)(B), \ 5657 (int)(P), (int)(R))) 5658 5659 #define _mm_comi_round_ss(A, B, P, R) \ 5660 ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ 5661 (int)(P), (int)(R))) 5662 5663 #ifdef __x86_64__ 5664 #define _mm_cvt_roundsd_si64(A, R) \ 5665 ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) 5666 #endif 5667 5668 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5669 _mm512_sll_epi32(__m512i __A, __m128i __B) 5670 { 5671 return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); 5672 } 5673 5674 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5675 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5676 { 5677 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5678 (__v16si)_mm512_sll_epi32(__A, __B), 5679 (__v16si)__W); 5680 } 5681 5682 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5683 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5684 { 5685 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5686 (__v16si)_mm512_sll_epi32(__A, __B), 5687 (__v16si)_mm512_setzero_si512()); 5688 } 5689 5690 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5691 _mm512_sll_epi64(__m512i __A, __m128i __B) 5692 { 5693 return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); 5694 } 5695 5696 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5697 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5698 { 5699 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5700 (__v8di)_mm512_sll_epi64(__A, __B), 5701 (__v8di)__W); 5702 } 5703 5704 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5705 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5706 { 5707 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5708 (__v8di)_mm512_sll_epi64(__A, __B), 5709 (__v8di)_mm512_setzero_si512()); 5710 } 5711 5712 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5713 _mm512_sllv_epi32(__m512i __X, __m512i __Y) 5714 { 5715 return 
(__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); 5716 } 5717 5718 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5719 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5720 { 5721 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5722 (__v16si)_mm512_sllv_epi32(__X, __Y), 5723 (__v16si)__W); 5724 } 5725 5726 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5727 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5728 { 5729 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5730 (__v16si)_mm512_sllv_epi32(__X, __Y), 5731 (__v16si)_mm512_setzero_si512()); 5732 } 5733 5734 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5735 _mm512_sllv_epi64(__m512i __X, __m512i __Y) 5736 { 5737 return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); 5738 } 5739 5740 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5741 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 5742 { 5743 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5744 (__v8di)_mm512_sllv_epi64(__X, __Y), 5745 (__v8di)__W); 5746 } 5747 5748 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5749 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5750 { 5751 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5752 (__v8di)_mm512_sllv_epi64(__X, __Y), 5753 (__v8di)_mm512_setzero_si512()); 5754 } 5755 5756 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5757 _mm512_sra_epi32(__m512i __A, __m128i __B) 5758 { 5759 return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); 5760 } 5761 5762 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5763 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5764 { 5765 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5766 (__v16si)_mm512_sra_epi32(__A, __B), 5767 (__v16si)__W); 5768 } 5769 5770 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5771 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5772 { 
5773 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5774 (__v16si)_mm512_sra_epi32(__A, __B), 5775 (__v16si)_mm512_setzero_si512()); 5776 } 5777 5778 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5779 _mm512_sra_epi64(__m512i __A, __m128i __B) 5780 { 5781 return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); 5782 } 5783 5784 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5785 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5786 { 5787 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5788 (__v8di)_mm512_sra_epi64(__A, __B), 5789 (__v8di)__W); 5790 } 5791 5792 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5793 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5794 { 5795 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5796 (__v8di)_mm512_sra_epi64(__A, __B), 5797 (__v8di)_mm512_setzero_si512()); 5798 } 5799 5800 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5801 _mm512_srav_epi32(__m512i __X, __m512i __Y) 5802 { 5803 return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); 5804 } 5805 5806 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5807 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5808 { 5809 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5810 (__v16si)_mm512_srav_epi32(__X, __Y), 5811 (__v16si)__W); 5812 } 5813 5814 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5815 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5816 { 5817 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5818 (__v16si)_mm512_srav_epi32(__X, __Y), 5819 (__v16si)_mm512_setzero_si512()); 5820 } 5821 5822 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5823 _mm512_srav_epi64(__m512i __X, __m512i __Y) 5824 { 5825 return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); 5826 } 5827 5828 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5829 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 
5830 { 5831 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5832 (__v8di)_mm512_srav_epi64(__X, __Y), 5833 (__v8di)__W); 5834 } 5835 5836 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5837 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5838 { 5839 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5840 (__v8di)_mm512_srav_epi64(__X, __Y), 5841 (__v8di)_mm512_setzero_si512()); 5842 } 5843 5844 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5845 _mm512_srl_epi32(__m512i __A, __m128i __B) 5846 { 5847 return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); 5848 } 5849 5850 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5851 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5852 { 5853 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5854 (__v16si)_mm512_srl_epi32(__A, __B), 5855 (__v16si)__W); 5856 } 5857 5858 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5859 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5860 { 5861 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5862 (__v16si)_mm512_srl_epi32(__A, __B), 5863 (__v16si)_mm512_setzero_si512()); 5864 } 5865 5866 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5867 _mm512_srl_epi64(__m512i __A, __m128i __B) 5868 { 5869 return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); 5870 } 5871 5872 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5873 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5874 { 5875 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5876 (__v8di)_mm512_srl_epi64(__A, __B), 5877 (__v8di)__W); 5878 } 5879 5880 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5881 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5882 { 5883 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5884 (__v8di)_mm512_srl_epi64(__A, __B), 5885 (__v8di)_mm512_setzero_si512()); 5886 } 5887 5888 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5889 
_mm512_srlv_epi32(__m512i __X, __m512i __Y) 5890 { 5891 return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); 5892 } 5893 5894 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5895 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5896 { 5897 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5898 (__v16si)_mm512_srlv_epi32(__X, __Y), 5899 (__v16si)__W); 5900 } 5901 5902 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5903 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5904 { 5905 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5906 (__v16si)_mm512_srlv_epi32(__X, __Y), 5907 (__v16si)_mm512_setzero_si512()); 5908 } 5909 5910 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5911 _mm512_srlv_epi64 (__m512i __X, __m512i __Y) 5912 { 5913 return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); 5914 } 5915 5916 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5917 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 5918 { 5919 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5920 (__v8di)_mm512_srlv_epi64(__X, __Y), 5921 (__v8di)__W); 5922 } 5923 5924 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5925 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5926 { 5927 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5928 (__v8di)_mm512_srlv_epi64(__X, __Y), 5929 (__v8di)_mm512_setzero_si512()); 5930 } 5931 5932 /// \enum _MM_TERNLOG_ENUM 5933 /// A helper to represent the ternary logic operations among vector \a A, 5934 /// \a B and \a C. The representation is passed to \a imm. 
typedef enum {
  /* Bit i of each constant equals that operand's value in truth-table row i
   * when rows are numbered i = (A << 2) | (B << 1) | C: 0xF0 has bits 4-7
   * set, 0xCC has bits 2, 3, 6, 7 set, and 0xAA has the odd bits set.
   * NOTE(review): that row numbering is assumed from Intel's VPTERNLOG
   * description -- confirm before relying on it. */
  _MM_TERNLOG_A = 0xF0,
  _MM_TERNLOG_B = 0xCC,
  _MM_TERNLOG_C = 0xAA
} _MM_TERNLOG_ENUM;

/* Ternary-logic macros (32-bit and 64-bit element flavours).
 *
 * All six forward A, B, C and imm to a pternlog builtin; imm is narrowed to
 * unsigned char. The unmasked forms call the *_mask builtin with an all-ones
 * mask ((__mmask16)-1 / (__mmask8)-1). The mask forms pass U to the same
 * *_mask builtin, and the maskz forms pass U to the *_maskz builtin.
 * NOTE(review): presumably *_mask keeps A in lanes whose mask bit is clear
 * and *_maskz zeroes them -- the builtins are opaque here, confirm. */
#define _mm512_ternarylogic_epi32(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)-1))

#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_maskz( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_ternarylogic_epi64(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)-1))

#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))

#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_maskz( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))

/* Scalar floating-point to integer conversions.
 *
 * The *_round* macros pass R through (cast to int) as the builtin's last
 * argument; the function forms pass _MM_FROUND_CUR_DIRECTION instead.
 * Everything that produces a 64-bit integer is guarded by __x86_64__.
 * Where both an _iNN and an _siNN spelling are defined they expand to the
 * identical builtin call.
 * NOTE(review): in this span the non-truncating sd group defines only
 * _mm_cvt_roundsd_i64 (no _si64 spelling), unlike the ss and truncating
 * groups below -- confirm _mm_cvt_roundsd_si64 is provided elsewhere. */
#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

#define _mm_cvt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
             _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
                                                   (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
                 __A,
                 _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
#endif

#define _mm_cvt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
             _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
                                                   (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
                 __A,
                 _MM_FROUND_CUR_DIRECTION);
}
#endif

/* The _mm_cvtt* family below mirrors the group above but targets the
 * vcvtt* builtins.
 * NOTE(review): presumably the truncating (round-toward-zero) conversions,
 * going by Intel's naming -- the builtins themselves are opaque here. */
#define _mm_cvtt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttsd_i32 (__m128d __A)
{
  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_i64 (__m128d __A)
{
  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttss_i32 (__m128 __A)
{
  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttss_i64 (__m128 __A)
{
  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
}
#endif

/* Immediate-controlled permutes.
 *
 * The unmasked macros forward X and the control C (cast to int) to the
 * vpermilpd512 / vpermilps512 builtins. The mask/maskz macros compute the
 * unmasked result and hand it to a select builtin as the second operand,
 * with W (mask) or an all-zero vector (maskz) as the third operand and U as
 * the mask.
 * NOTE(review): presumably select takes the second operand where the mask
 * bit is set and the third otherwise -- confirm against the builtin. */
#define _mm512_permute_pd(X, C) \
  ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permute_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_permute_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_permute_ps(X, C) \
  ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))

#define _mm512_mask_permute_ps(W, U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_permute_ps(U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)_mm512_setzero_ps()))

/* Vector-controlled permutes: same layering as above, but the control is the
 * integer vector __C, reinterpreted as __v8di for pd and __v16si for ps. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                         (__v8df)_mm512_permutevar_pd(__A, __C),
                                         (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                         (__v8df)_mm512_permutevar_pd(__A, __C),
                                         (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
                                        (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
                                        (__v16sf)_mm512_setzero_ps());
}

/* Two-source permutes: __A and __B are the data operands, __I is the index
 * vector. The three masked variants differ only in the select fallback:
 *   mask  -> __A
 *   mask2 -> __I, bit-cast to the floating-point vector type
 *   maskz -> an all-zero vector
 * Unlike the wrappers above, __U is passed to select without an explicit
 * mask cast. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                 (__v8df)__B);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)(__m512d)__I);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                (__v16sf) __B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)(__m512)__I);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)_mm512_setzero_ps());
}


/* Packed double -> 256-bit integer vector via the cvttpd2udq512_mask builtin.
 *
 * Builtin argument order: source, pass-through vector, mask, rounding
 * argument. The unmasked forms pass _mm256_undefined_si256() with an
 * all-ones mask, the mask forms pass W/__W, and the maskz forms pass a zero
 * vector. The *_round* macros forward R; the functions use
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_cvtt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_undefined_si256(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)(__m256i)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_setzero_si256(), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_undefined_si256 (),
                  (__mmask8) -1,
                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si) __W,
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_setzero_si256 (),
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Scalar roundscale macros (sd = double, ss = float).
 *
 * Builtin argument order: A, B, pass-through vector, mask, immediate,
 * rounding argument. Unlike the packed unmasked forms in this section, the
 * unmasked scalar forms pass a zero vector (not an undefined one) together
 * with an all-ones mask. The macro parameter named `imm` in some variants
 * and `I` in others lands in the same builtin slot. Variants without
 * `_round` in the name hard-code _MM_FROUND_CUR_DIRECTION. */
#define _mm_roundscale_round_sd(A, B, imm, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 (int)(R)))

#define _mm_roundscale_sd(A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_maskz_roundscale_sd(U, A, B, I) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_roundscale_round_ss(A, B, imm, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                (int)(R)))

#define _mm_roundscale_ss(A, B, imm) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_ss(W, U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))

#define _mm_maskz_roundscale_ss(U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))

/* Packed scalef (pd then ps).
 *
 * Builtin argument order: A, B, pass-through vector, mask, rounding
 * argument. Unmasked forms pass an undefined vector with an all-ones mask,
 * mask forms pass W/__W, maskz forms pass a zero vector. */
#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_scalef_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df)
                _mm512_undefined_pd (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df)
                _mm512_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_scalef_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
               (__v16sf) __B,
               (__v16sf)
               _mm512_undefined_ps (),
               (__mmask16) -1,
               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
               (__v16sf) __B,
               (__v16sf) __W,
               (__mmask16) __U,
               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
               (__v16sf) __B,
               (__v16sf)
               _mm512_setzero_ps (),
               (__mmask16) __U,
               _MM_FROUND_CUR_DIRECTION);
}

/* Scalar scalef (sd then ss). Same argument order as the packed builtins,
 * but the unmasked forms pass a zero vector (not an undefined one) as the
 * pass-through operand, and the mask is always an __mmask8. */
#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_scalef_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
              (__mmask8) -1,
              _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                (__v2df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                (__v2df) _mm_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_scalef_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
             (__mmask8) -1,
             _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
                 (__v4sf) __B,
                (__v4sf) _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), \
                                              (int)(R)))

/* Shifts with a scalar count (srai). The public signature takes the count
 * __B as unsigned int; it is cast to int when handed to the psradi512 /
 * psraqi512 builtins. The masked forms reuse the unmasked function and pick
 * between its result and __W (mask) or a zero vector (maskz) via select. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srai_epi32(__A, __B),
                                         (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
                        unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srai_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 6636 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) 6637 { 6638 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 6639 (__v8di)_mm512_srai_epi64(__A, __B), 6640 (__v8di)__W); 6641 } 6642 6643 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6644 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) 6645 { 6646 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 6647 (__v8di)_mm512_srai_epi64(__A, __B), 6648 (__v8di)_mm512_setzero_si512()); 6649 } 6650 6651 #define _mm512_shuffle_f32x4(A, B, imm) \ 6652 ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ 6653 (__v16sf)(__m512)(B), (int)(imm))) 6654 6655 #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ 6656 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6657 (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ 6658 (__v16sf)(__m512)(W))) 6659 6660 #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ 6661 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6662 (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ 6663 (__v16sf)_mm512_setzero_ps())) 6664 6665 #define _mm512_shuffle_f64x2(A, B, imm) \ 6666 ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ 6667 (__v8df)(__m512d)(B), (int)(imm))) 6668 6669 #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ 6670 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6671 (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ 6672 (__v8df)(__m512d)(W))) 6673 6674 #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ 6675 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6676 (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ 6677 (__v8df)_mm512_setzero_pd())) 6678 6679 #define _mm512_shuffle_i32x4(A, B, imm) \ 6680 ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ 6681 (__v16si)(__m512i)(B), (int)(imm))) 6682 6683 #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ 6684 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 6685 
(__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ 6686 (__v16si)(__m512i)(W))) 6687 6688 #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ 6689 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 6690 (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ 6691 (__v16si)_mm512_setzero_si512())) 6692 6693 #define _mm512_shuffle_i64x2(A, B, imm) \ 6694 ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ 6695 (__v8di)(__m512i)(B), (int)(imm))) 6696 6697 #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ 6698 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 6699 (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ 6700 (__v8di)(__m512i)(W))) 6701 6702 #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ 6703 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 6704 (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ 6705 (__v8di)_mm512_setzero_si512())) 6706 6707 #define _mm512_shuffle_pd(A, B, M) \ 6708 ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ 6709 (__v8df)(__m512d)(B), (int)(M))) 6710 6711 #define _mm512_mask_shuffle_pd(W, U, A, B, M) \ 6712 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6713 (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ 6714 (__v8df)(__m512d)(W))) 6715 6716 #define _mm512_maskz_shuffle_pd(U, A, B, M) \ 6717 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6718 (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ 6719 (__v8df)_mm512_setzero_pd())) 6720 6721 #define _mm512_shuffle_ps(A, B, M) \ 6722 ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ 6723 (__v16sf)(__m512)(B), (int)(M))) 6724 6725 #define _mm512_mask_shuffle_ps(W, U, A, B, M) \ 6726 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6727 (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ 6728 (__v16sf)(__m512)(W))) 6729 6730 #define _mm512_maskz_shuffle_ps(U, A, B, M) \ 6731 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6732 (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ 6733 (__v16sf)_mm512_setzero_ps())) 6734 6735 #define _mm_sqrt_round_sd(A, B, 
R) \ 6736 ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6737 (__v2df)(__m128d)(B), \ 6738 (__v2df)_mm_setzero_pd(), \ 6739 (__mmask8)-1, (int)(R))) 6740 6741 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6742 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 6743 { 6744 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, 6745 (__v2df) __B, 6746 (__v2df) __W, 6747 (__mmask8) __U, 6748 _MM_FROUND_CUR_DIRECTION); 6749 } 6750 6751 #define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ 6752 ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6753 (__v2df)(__m128d)(B), \ 6754 (__v2df)(__m128d)(W), \ 6755 (__mmask8)(U), (int)(R))) 6756 6757 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6758 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) 6759 { 6760 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, 6761 (__v2df) __B, 6762 (__v2df) _mm_setzero_pd (), 6763 (__mmask8) __U, 6764 _MM_FROUND_CUR_DIRECTION); 6765 } 6766 6767 #define _mm_maskz_sqrt_round_sd(U, A, B, R) \ 6768 ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6769 (__v2df)(__m128d)(B), \ 6770 (__v2df)_mm_setzero_pd(), \ 6771 (__mmask8)(U), (int)(R))) 6772 6773 #define _mm_sqrt_round_ss(A, B, R) \ 6774 ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6775 (__v4sf)(__m128)(B), \ 6776 (__v4sf)_mm_setzero_ps(), \ 6777 (__mmask8)-1, (int)(R))) 6778 6779 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6780 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 6781 { 6782 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, 6783 (__v4sf) __B, 6784 (__v4sf) __W, 6785 (__mmask8) __U, 6786 _MM_FROUND_CUR_DIRECTION); 6787 } 6788 6789 #define _mm_mask_sqrt_round_ss(W, U, A, B, R) \ 6790 ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6791 (__v4sf)(__m128)(B), \ 6792 (__v4sf)(__m128)(W), (__mmask8)(U), \ 6793 (int)(R))) 6794 6795 static __inline__ __m128 
__DEFAULT_FN_ATTRS128 6796 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) 6797 { 6798 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, 6799 (__v4sf) __B, 6800 (__v4sf) _mm_setzero_ps (), 6801 (__mmask8) __U, 6802 _MM_FROUND_CUR_DIRECTION); 6803 } 6804 6805 #define _mm_maskz_sqrt_round_ss(U, A, B, R) \ 6806 ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6807 (__v4sf)(__m128)(B), \ 6808 (__v4sf)_mm_setzero_ps(), \ 6809 (__mmask8)(U), (int)(R))) 6810 6811 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6812 _mm512_broadcast_f32x4(__m128 __A) 6813 { 6814 return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 6815 0, 1, 2, 3, 0, 1, 2, 3, 6816 0, 1, 2, 3, 0, 1, 2, 3); 6817 } 6818 6819 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6820 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) 6821 { 6822 return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 6823 (__v16sf)_mm512_broadcast_f32x4(__A), 6824 (__v16sf)__O); 6825 } 6826 6827 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6828 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) 6829 { 6830 return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 6831 (__v16sf)_mm512_broadcast_f32x4(__A), 6832 (__v16sf)_mm512_setzero_ps()); 6833 } 6834 6835 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6836 _mm512_broadcast_f64x4(__m256d __A) 6837 { 6838 return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, 6839 0, 1, 2, 3, 0, 1, 2, 3); 6840 } 6841 6842 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6843 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) 6844 { 6845 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 6846 (__v8df)_mm512_broadcast_f64x4(__A), 6847 (__v8df)__O); 6848 } 6849 6850 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6851 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) 6852 { 6853 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 6854 (__v8df)_mm512_broadcast_f64x4(__A), 6855 
(__v8df)_mm512_setzero_pd()); 6856 } 6857 6858 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6859 _mm512_broadcast_i32x4(__m128i __A) 6860 { 6861 return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 6862 0, 1, 2, 3, 0, 1, 2, 3, 6863 0, 1, 2, 3, 0, 1, 2, 3); 6864 } 6865 6866 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6867 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) 6868 { 6869 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 6870 (__v16si)_mm512_broadcast_i32x4(__A), 6871 (__v16si)__O); 6872 } 6873 6874 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6875 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) 6876 { 6877 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 6878 (__v16si)_mm512_broadcast_i32x4(__A), 6879 (__v16si)_mm512_setzero_si512()); 6880 } 6881 6882 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6883 _mm512_broadcast_i64x4(__m256i __A) 6884 { 6885 return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, 6886 0, 1, 2, 3, 0, 1, 2, 3); 6887 } 6888 6889 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6890 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) 6891 { 6892 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 6893 (__v8di)_mm512_broadcast_i64x4(__A), 6894 (__v8di)__O); 6895 } 6896 6897 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6898 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) 6899 { 6900 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 6901 (__v8di)_mm512_broadcast_i64x4(__A), 6902 (__v8di)_mm512_setzero_si512()); 6903 } 6904 6905 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6906 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) 6907 { 6908 return (__m512d)__builtin_ia32_selectpd_512(__M, 6909 (__v8df) _mm512_broadcastsd_pd(__A), 6910 (__v8df) __O); 6911 } 6912 6913 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6914 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) 6915 { 6916 return 
(__m512d)__builtin_ia32_selectpd_512(__M, 6917 (__v8df) _mm512_broadcastsd_pd(__A), 6918 (__v8df) _mm512_setzero_pd()); 6919 } 6920 6921 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6922 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) 6923 { 6924 return (__m512)__builtin_ia32_selectps_512(__M, 6925 (__v16sf) _mm512_broadcastss_ps(__A), 6926 (__v16sf) __O); 6927 } 6928 6929 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6930 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) 6931 { 6932 return (__m512)__builtin_ia32_selectps_512(__M, 6933 (__v16sf) _mm512_broadcastss_ps(__A), 6934 (__v16sf) _mm512_setzero_ps()); 6935 } 6936 6937 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6938 _mm512_cvtsepi32_epi8 (__m512i __A) 6939 { 6940 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6941 (__v16qi) _mm_undefined_si128 (), 6942 (__mmask16) -1); 6943 } 6944 6945 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6946 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 6947 { 6948 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6949 (__v16qi) __O, __M); 6950 } 6951 6952 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6953 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) 6954 { 6955 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6956 (__v16qi) _mm_setzero_si128 (), 6957 __M); 6958 } 6959 6960 static __inline__ void __DEFAULT_FN_ATTRS512 6961 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 6962 { 6963 __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 6964 } 6965 6966 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6967 _mm512_cvtsepi32_epi16 (__m512i __A) 6968 { 6969 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6970 (__v16hi) _mm256_undefined_si256 (), 6971 (__mmask16) -1); 6972 } 6973 6974 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6975 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 6976 
{ 6977 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6978 (__v16hi) __O, __M); 6979 } 6980 6981 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6982 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) 6983 { 6984 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6985 (__v16hi) _mm256_setzero_si256 (), 6986 __M); 6987 } 6988 6989 static __inline__ void __DEFAULT_FN_ATTRS512 6990 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) 6991 { 6992 __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); 6993 } 6994 6995 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6996 _mm512_cvtsepi64_epi8 (__m512i __A) 6997 { 6998 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 6999 (__v16qi) _mm_undefined_si128 (), 7000 (__mmask8) -1); 7001 } 7002 7003 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7004 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7005 { 7006 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 7007 (__v16qi) __O, __M); 7008 } 7009 7010 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7011 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) 7012 { 7013 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 7014 (__v16qi) _mm_setzero_si128 (), 7015 __M); 7016 } 7017 7018 static __inline__ void __DEFAULT_FN_ATTRS512 7019 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7020 { 7021 __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7022 } 7023 7024 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7025 _mm512_cvtsepi64_epi32 (__m512i __A) 7026 { 7027 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7028 (__v8si) _mm256_undefined_si256 (), 7029 (__mmask8) -1); 7030 } 7031 7032 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7033 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7034 { 7035 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7036 
(__v8si) __O, __M); 7037 } 7038 7039 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7040 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) 7041 { 7042 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7043 (__v8si) _mm256_setzero_si256 (), 7044 __M); 7045 } 7046 7047 static __inline__ void __DEFAULT_FN_ATTRS512 7048 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) 7049 { 7050 __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); 7051 } 7052 7053 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7054 _mm512_cvtsepi64_epi16 (__m512i __A) 7055 { 7056 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7057 (__v8hi) _mm_undefined_si128 (), 7058 (__mmask8) -1); 7059 } 7060 7061 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7062 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7063 { 7064 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7065 (__v8hi) __O, __M); 7066 } 7067 7068 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7069 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) 7070 { 7071 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7072 (__v8hi) _mm_setzero_si128 (), 7073 __M); 7074 } 7075 7076 static __inline__ void __DEFAULT_FN_ATTRS512 7077 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) 7078 { 7079 __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); 7080 } 7081 7082 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7083 _mm512_cvtusepi32_epi8 (__m512i __A) 7084 { 7085 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7086 (__v16qi) _mm_undefined_si128 (), 7087 (__mmask16) -1); 7088 } 7089 7090 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7091 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 7092 { 7093 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7094 (__v16qi) __O, 7095 __M); 7096 } 7097 7098 static __inline__ __m128i 
__DEFAULT_FN_ATTRS512 7099 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) 7100 { 7101 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7102 (__v16qi) _mm_setzero_si128 (), 7103 __M); 7104 } 7105 7106 static __inline__ void __DEFAULT_FN_ATTRS512 7107 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 7108 { 7109 __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 7110 } 7111 7112 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7113 _mm512_cvtusepi32_epi16 (__m512i __A) 7114 { 7115 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7116 (__v16hi) _mm256_undefined_si256 (), 7117 (__mmask16) -1); 7118 } 7119 7120 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7121 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 7122 { 7123 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7124 (__v16hi) __O, 7125 __M); 7126 } 7127 7128 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7129 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) 7130 { 7131 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7132 (__v16hi) _mm256_setzero_si256 (), 7133 __M); 7134 } 7135 7136 static __inline__ void __DEFAULT_FN_ATTRS512 7137 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) 7138 { 7139 __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); 7140 } 7141 7142 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7143 _mm512_cvtusepi64_epi8 (__m512i __A) 7144 { 7145 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7146 (__v16qi) _mm_undefined_si128 (), 7147 (__mmask8) -1); 7148 } 7149 7150 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7151 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7152 { 7153 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7154 (__v16qi) __O, 7155 __M); 7156 } 7157 7158 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7159 
_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) 7160 { 7161 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7162 (__v16qi) _mm_setzero_si128 (), 7163 __M); 7164 } 7165 7166 static __inline__ void __DEFAULT_FN_ATTRS512 7167 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7168 { 7169 __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7170 } 7171 7172 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7173 _mm512_cvtusepi64_epi32 (__m512i __A) 7174 { 7175 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7176 (__v8si) _mm256_undefined_si256 (), 7177 (__mmask8) -1); 7178 } 7179 7180 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7181 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7182 { 7183 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7184 (__v8si) __O, __M); 7185 } 7186 7187 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7188 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) 7189 { 7190 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7191 (__v8si) _mm256_setzero_si256 (), 7192 __M); 7193 } 7194 7195 static __inline__ void __DEFAULT_FN_ATTRS512 7196 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) 7197 { 7198 __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); 7199 } 7200 7201 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7202 _mm512_cvtusepi64_epi16 (__m512i __A) 7203 { 7204 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7205 (__v8hi) _mm_undefined_si128 (), 7206 (__mmask8) -1); 7207 } 7208 7209 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7210 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7211 { 7212 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7213 (__v8hi) __O, __M); 7214 } 7215 7216 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7217 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) 7218 { 7219 
return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7220 (__v8hi) _mm_setzero_si128 (), 7221 __M); 7222 } 7223 7224 static __inline__ void __DEFAULT_FN_ATTRS512 7225 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) 7226 { 7227 __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); 7228 } 7229 7230 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7231 _mm512_cvtepi32_epi8 (__m512i __A) 7232 { 7233 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7234 (__v16qi) _mm_undefined_si128 (), 7235 (__mmask16) -1); 7236 } 7237 7238 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7239 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 7240 { 7241 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7242 (__v16qi) __O, __M); 7243 } 7244 7245 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7246 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) 7247 { 7248 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7249 (__v16qi) _mm_setzero_si128 (), 7250 __M); 7251 } 7252 7253 static __inline__ void __DEFAULT_FN_ATTRS512 7254 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 7255 { 7256 __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 7257 } 7258 7259 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7260 _mm512_cvtepi32_epi16 (__m512i __A) 7261 { 7262 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7263 (__v16hi) _mm256_undefined_si256 (), 7264 (__mmask16) -1); 7265 } 7266 7267 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7268 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 7269 { 7270 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7271 (__v16hi) __O, __M); 7272 } 7273 7274 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7275 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) 7276 { 7277 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7278 (__v16hi) 
_mm256_setzero_si256 (), 7279 __M); 7280 } 7281 7282 static __inline__ void __DEFAULT_FN_ATTRS512 7283 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) 7284 { 7285 __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); 7286 } 7287 7288 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7289 _mm512_cvtepi64_epi8 (__m512i __A) 7290 { 7291 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7292 (__v16qi) _mm_undefined_si128 (), 7293 (__mmask8) -1); 7294 } 7295 7296 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7297 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7298 { 7299 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7300 (__v16qi) __O, __M); 7301 } 7302 7303 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7304 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) 7305 { 7306 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7307 (__v16qi) _mm_setzero_si128 (), 7308 __M); 7309 } 7310 7311 static __inline__ void __DEFAULT_FN_ATTRS512 7312 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7313 { 7314 __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7315 } 7316 7317 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7318 _mm512_cvtepi64_epi32 (__m512i __A) 7319 { 7320 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7321 (__v8si) _mm256_undefined_si256 (), 7322 (__mmask8) -1); 7323 } 7324 7325 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7326 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7327 { 7328 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7329 (__v8si) __O, __M); 7330 } 7331 7332 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7333 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) 7334 { 7335 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7336 (__v8si) _mm256_setzero_si256 (), 7337 __M); 7338 } 7339 7340 static __inline__ void __DEFAULT_FN_ATTRS512 
7341 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) 7342 { 7343 __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); 7344 } 7345 7346 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7347 _mm512_cvtepi64_epi16 (__m512i __A) 7348 { 7349 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7350 (__v8hi) _mm_undefined_si128 (), 7351 (__mmask8) -1); 7352 } 7353 7354 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7355 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7356 { 7357 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7358 (__v8hi) __O, __M); 7359 } 7360 7361 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7362 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) 7363 { 7364 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7365 (__v8hi) _mm_setzero_si128 (), 7366 __M); 7367 } 7368 7369 static __inline__ void __DEFAULT_FN_ATTRS512 7370 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) 7371 { 7372 __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); 7373 } 7374 7375 #define _mm512_extracti32x4_epi32(A, imm) \ 7376 ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7377 (__v4si)_mm_undefined_si128(), \ 7378 (__mmask8)-1)) 7379 7380 #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ 7381 ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7382 (__v4si)(__m128i)(W), \ 7383 (__mmask8)(U))) 7384 7385 #define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ 7386 ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7387 (__v4si)_mm_setzero_si128(), \ 7388 (__mmask8)(U))) 7389 7390 #define _mm512_extracti64x4_epi64(A, imm) \ 7391 ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7392 (__v4di)_mm256_undefined_si256(), \ 7393 (__mmask8)-1)) 7394 7395 #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ 7396 
((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7397 (__v4di)(__m256i)(W), \ 7398 (__mmask8)(U))) 7399 7400 #define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ 7401 ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7402 (__v4di)_mm256_setzero_si256(), \ 7403 (__mmask8)(U))) 7404 7405 #define _mm512_insertf64x4(A, B, imm) \ 7406 ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ 7407 (__v4df)(__m256d)(B), (int)(imm))) 7408 7409 #define _mm512_mask_insertf64x4(W, U, A, B, imm) \ 7410 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 7411 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ 7412 (__v8df)(__m512d)(W))) 7413 7414 #define _mm512_maskz_insertf64x4(U, A, B, imm) \ 7415 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 7416 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ 7417 (__v8df)_mm512_setzero_pd())) 7418 7419 #define _mm512_inserti64x4(A, B, imm) \ 7420 ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ 7421 (__v4di)(__m256i)(B), (int)(imm))) 7422 7423 #define _mm512_mask_inserti64x4(W, U, A, B, imm) \ 7424 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 7425 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ 7426 (__v8di)(__m512i)(W))) 7427 7428 #define _mm512_maskz_inserti64x4(U, A, B, imm) \ 7429 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 7430 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ 7431 (__v8di)_mm512_setzero_si512())) 7432 7433 #define _mm512_insertf32x4(A, B, imm) \ 7434 ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ 7435 (__v4sf)(__m128)(B), (int)(imm))) 7436 7437 #define _mm512_mask_insertf32x4(W, U, A, B, imm) \ 7438 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 7439 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ 7440 (__v16sf)(__m512)(W))) 7441 7442 #define _mm512_maskz_insertf32x4(U, A, B, imm) \ 7443 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 7444 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ 7445 
(__v16sf)_mm512_setzero_ps())) 7446 7447 #define _mm512_inserti32x4(A, B, imm) \ 7448 ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ 7449 (__v4si)(__m128i)(B), (int)(imm))) 7450 7451 #define _mm512_mask_inserti32x4(W, U, A, B, imm) \ 7452 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 7453 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ 7454 (__v16si)(__m512i)(W))) 7455 7456 #define _mm512_maskz_inserti32x4(U, A, B, imm) \ 7457 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 7458 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ 7459 (__v16si)_mm512_setzero_si512())) 7460 7461 #define _mm512_getmant_round_pd(A, B, C, R) \ 7462 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7463 (int)(((C)<<2) | (B)), \ 7464 (__v8df)_mm512_undefined_pd(), \ 7465 (__mmask8)-1, (int)(R))) 7466 7467 #define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ 7468 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7469 (int)(((C)<<2) | (B)), \ 7470 (__v8df)(__m512d)(W), \ 7471 (__mmask8)(U), (int)(R))) 7472 7473 #define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ 7474 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7475 (int)(((C)<<2) | (B)), \ 7476 (__v8df)_mm512_setzero_pd(), \ 7477 (__mmask8)(U), (int)(R))) 7478 7479 #define _mm512_getmant_pd(A, B, C) \ 7480 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7481 (int)(((C)<<2) | (B)), \ 7482 (__v8df)_mm512_setzero_pd(), \ 7483 (__mmask8)-1, \ 7484 _MM_FROUND_CUR_DIRECTION)) 7485 7486 #define _mm512_mask_getmant_pd(W, U, A, B, C) \ 7487 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7488 (int)(((C)<<2) | (B)), \ 7489 (__v8df)(__m512d)(W), \ 7490 (__mmask8)(U), \ 7491 _MM_FROUND_CUR_DIRECTION)) 7492 7493 #define _mm512_maskz_getmant_pd(U, A, B, C) \ 7494 ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7495 (int)(((C)<<2) | (B)), \ 7496 (__v8df)_mm512_setzero_pd(), \ 7497 (__mmask8)(U), \ 7498 
_MM_FROUND_CUR_DIRECTION)) 7499 7500 #define _mm512_getmant_round_ps(A, B, C, R) \ 7501 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7502 (int)(((C)<<2) | (B)), \ 7503 (__v16sf)_mm512_undefined_ps(), \ 7504 (__mmask16)-1, (int)(R))) 7505 7506 #define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ 7507 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7508 (int)(((C)<<2) | (B)), \ 7509 (__v16sf)(__m512)(W), \ 7510 (__mmask16)(U), (int)(R))) 7511 7512 #define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ 7513 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7514 (int)(((C)<<2) | (B)), \ 7515 (__v16sf)_mm512_setzero_ps(), \ 7516 (__mmask16)(U), (int)(R))) 7517 7518 #define _mm512_getmant_ps(A, B, C) \ 7519 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7520 (int)(((C)<<2)|(B)), \ 7521 (__v16sf)_mm512_undefined_ps(), \ 7522 (__mmask16)-1, \ 7523 _MM_FROUND_CUR_DIRECTION)) 7524 7525 #define _mm512_mask_getmant_ps(W, U, A, B, C) \ 7526 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7527 (int)(((C)<<2)|(B)), \ 7528 (__v16sf)(__m512)(W), \ 7529 (__mmask16)(U), \ 7530 _MM_FROUND_CUR_DIRECTION)) 7531 7532 #define _mm512_maskz_getmant_ps(U, A, B, C) \ 7533 ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7534 (int)(((C)<<2)|(B)), \ 7535 (__v16sf)_mm512_setzero_ps(), \ 7536 (__mmask16)(U), \ 7537 _MM_FROUND_CUR_DIRECTION)) 7538 7539 #define _mm512_getexp_round_pd(A, R) \ 7540 ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7541 (__v8df)_mm512_undefined_pd(), \ 7542 (__mmask8)-1, (int)(R))) 7543 7544 #define _mm512_mask_getexp_round_pd(W, U, A, R) \ 7545 ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7546 (__v8df)(__m512d)(W), \ 7547 (__mmask8)(U), (int)(R))) 7548 7549 #define _mm512_maskz_getexp_round_pd(U, A, R) \ 7550 ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7551 (__v8df)_mm512_setzero_pd(), \ 7552 
(__mmask8)(U), (int)(R))) 7553 7554 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7555 _mm512_getexp_pd (__m512d __A) 7556 { 7557 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7558 (__v8df) _mm512_undefined_pd (), 7559 (__mmask8) -1, 7560 _MM_FROUND_CUR_DIRECTION); 7561 } 7562 7563 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7564 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) 7565 { 7566 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7567 (__v8df) __W, 7568 (__mmask8) __U, 7569 _MM_FROUND_CUR_DIRECTION); 7570 } 7571 7572 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7573 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) 7574 { 7575 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7576 (__v8df) _mm512_setzero_pd (), 7577 (__mmask8) __U, 7578 _MM_FROUND_CUR_DIRECTION); 7579 } 7580 7581 #define _mm512_getexp_round_ps(A, R) \ 7582 ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7583 (__v16sf)_mm512_undefined_ps(), \ 7584 (__mmask16)-1, (int)(R))) 7585 7586 #define _mm512_mask_getexp_round_ps(W, U, A, R) \ 7587 ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7588 (__v16sf)(__m512)(W), \ 7589 (__mmask16)(U), (int)(R))) 7590 7591 #define _mm512_maskz_getexp_round_ps(U, A, R) \ 7592 ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7593 (__v16sf)_mm512_setzero_ps(), \ 7594 (__mmask16)(U), (int)(R))) 7595 7596 static __inline__ __m512 __DEFAULT_FN_ATTRS512 7597 _mm512_getexp_ps (__m512 __A) 7598 { 7599 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7600 (__v16sf) _mm512_undefined_ps (), 7601 (__mmask16) -1, 7602 _MM_FROUND_CUR_DIRECTION); 7603 } 7604 7605 static __inline__ __m512 __DEFAULT_FN_ATTRS512 7606 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) 7607 { 7608 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7609 (__v16sf) __W, 7610 (__mmask16) __U, 7611 _MM_FROUND_CUR_DIRECTION); 7612 } 7613 7614 
static __inline__ __m512 __DEFAULT_FN_ATTRS512 7615 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) 7616 { 7617 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7618 (__v16sf) _mm512_setzero_ps (), 7619 (__mmask16) __U, 7620 _MM_FROUND_CUR_DIRECTION); 7621 } 7622 7623 #define _mm512_i64gather_ps(index, addr, scale) \ 7624 ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ 7625 (void const *)(addr), \ 7626 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7627 (int)(scale))) 7628 7629 #define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ 7630 ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ 7631 (void const *)(addr), \ 7632 (__v8di)(__m512i)(index), \ 7633 (__mmask8)(mask), (int)(scale))) 7634 7635 #define _mm512_i64gather_epi32(index, addr, scale) \ 7636 ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ 7637 (void const *)(addr), \ 7638 (__v8di)(__m512i)(index), \ 7639 (__mmask8)-1, (int)(scale))) 7640 7641 #define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ 7642 ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ 7643 (void const *)(addr), \ 7644 (__v8di)(__m512i)(index), \ 7645 (__mmask8)(mask), (int)(scale))) 7646 7647 #define _mm512_i64gather_pd(index, addr, scale) \ 7648 ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ 7649 (void const *)(addr), \ 7650 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7651 (int)(scale))) 7652 7653 #define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ 7654 ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ 7655 (void const *)(addr), \ 7656 (__v8di)(__m512i)(index), \ 7657 (__mmask8)(mask), (int)(scale))) 7658 7659 #define _mm512_i64gather_epi64(index, addr, scale) \ 7660 ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ 7661 (void const *)(addr), \ 7662 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7663 (int)(scale))) 7664 7665 #define 
_mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ 7666 ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ 7667 (void const *)(addr), \ 7668 (__v8di)(__m512i)(index), \ 7669 (__mmask8)(mask), (int)(scale))) 7670 7671 #define _mm512_i32gather_ps(index, addr, scale) \ 7672 ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ 7673 (void const *)(addr), \ 7674 (__v16si)(__m512)(index), \ 7675 (__mmask16)-1, (int)(scale))) 7676 7677 #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ 7678 ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ 7679 (void const *)(addr), \ 7680 (__v16si)(__m512)(index), \ 7681 (__mmask16)(mask), (int)(scale))) 7682 7683 #define _mm512_i32gather_epi32(index, addr, scale) \ 7684 ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ 7685 (void const *)(addr), \ 7686 (__v16si)(__m512i)(index), \ 7687 (__mmask16)-1, (int)(scale))) 7688 7689 #define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ 7690 ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ 7691 (void const *)(addr), \ 7692 (__v16si)(__m512i)(index), \ 7693 (__mmask16)(mask), (int)(scale))) 7694 7695 #define _mm512_i32gather_pd(index, addr, scale) \ 7696 ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ 7697 (void const *)(addr), \ 7698 (__v8si)(__m256i)(index), (__mmask8)-1, \ 7699 (int)(scale))) 7700 7701 #define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ 7702 ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ 7703 (void const *)(addr), \ 7704 (__v8si)(__m256i)(index), \ 7705 (__mmask8)(mask), (int)(scale))) 7706 7707 #define _mm512_i32gather_epi64(index, addr, scale) \ 7708 ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ 7709 (void const *)(addr), \ 7710 (__v8si)(__m256i)(index), (__mmask8)-1, \ 7711 (int)(scale))) 7712 7713 #define _mm512_mask_i32gather_epi64(v1_old, 
mask, index, addr, scale) \ 7714 ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ 7715 (void const *)(addr), \ 7716 (__v8si)(__m256i)(index), \ 7717 (__mmask8)(mask), (int)(scale))) 7718 7719 #define _mm512_i64scatter_ps(addr, index, v1, scale) \ 7720 __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ 7721 (__v8di)(__m512i)(index), \ 7722 (__v8sf)(__m256)(v1), (int)(scale)) 7723 7724 #define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ 7725 __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \ 7726 (__v8di)(__m512i)(index), \ 7727 (__v8sf)(__m256)(v1), (int)(scale)) 7728 7729 #define _mm512_i64scatter_epi32(addr, index, v1, scale) \ 7730 __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \ 7731 (__v8di)(__m512i)(index), \ 7732 (__v8si)(__m256i)(v1), (int)(scale)) 7733 7734 #define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ 7735 __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \ 7736 (__v8di)(__m512i)(index), \ 7737 (__v8si)(__m256i)(v1), (int)(scale)) 7738 7739 #define _mm512_i64scatter_pd(addr, index, v1, scale) \ 7740 __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \ 7741 (__v8di)(__m512i)(index), \ 7742 (__v8df)(__m512d)(v1), (int)(scale)) 7743 7744 #define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ 7745 __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \ 7746 (__v8di)(__m512i)(index), \ 7747 (__v8df)(__m512d)(v1), (int)(scale)) 7748 7749 #define _mm512_i64scatter_epi64(addr, index, v1, scale) \ 7750 __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \ 7751 (__v8di)(__m512i)(index), \ 7752 (__v8di)(__m512i)(v1), (int)(scale)) 7753 7754 #define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ 7755 __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \ 7756 (__v8di)(__m512i)(index), \ 7757 (__v8di)(__m512i)(v1), (int)(scale)) 7758 7759 #define _mm512_i32scatter_ps(addr, index, v1, scale) \ 7760 
__builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \ 7761 (__v16si)(__m512i)(index), \ 7762 (__v16sf)(__m512)(v1), (int)(scale)) 7763 7764 #define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ 7765 __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \ 7766 (__v16si)(__m512i)(index), \ 7767 (__v16sf)(__m512)(v1), (int)(scale)) 7768 7769 #define _mm512_i32scatter_epi32(addr, index, v1, scale) \ 7770 __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \ 7771 (__v16si)(__m512i)(index), \ 7772 (__v16si)(__m512i)(v1), (int)(scale)) 7773 7774 #define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ 7775 __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \ 7776 (__v16si)(__m512i)(index), \ 7777 (__v16si)(__m512i)(v1), (int)(scale)) 7778 7779 #define _mm512_i32scatter_pd(addr, index, v1, scale) \ 7780 __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \ 7781 (__v8si)(__m256i)(index), \ 7782 (__v8df)(__m512d)(v1), (int)(scale)) 7783 7784 #define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ 7785 __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \ 7786 (__v8si)(__m256i)(index), \ 7787 (__v8df)(__m512d)(v1), (int)(scale)) 7788 7789 #define _mm512_i32scatter_epi64(addr, index, v1, scale) \ 7790 __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \ 7791 (__v8si)(__m256i)(index), \ 7792 (__v8di)(__m512i)(v1), (int)(scale)) 7793 7794 #define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ 7795 __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \ 7796 (__v8si)(__m256i)(index), \ 7797 (__v8di)(__m512i)(v1), (int)(scale)) 7798 7799 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7800 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7801 { 7802 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7803 (__v4sf)__A, 7804 (__v4sf)__B, 7805 (__mmask8)__U, 7806 _MM_FROUND_CUR_DIRECTION); 7807 } 7808 7809 #define _mm_fmadd_round_ss(A, B, C, 
R) \ 7810 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7811 (__v4sf)(__m128)(B), \ 7812 (__v4sf)(__m128)(C), (__mmask8)-1, \ 7813 (int)(R))) 7814 7815 #define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ 7816 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7817 (__v4sf)(__m128)(A), \ 7818 (__v4sf)(__m128)(B), (__mmask8)(U), \ 7819 (int)(R))) 7820 7821 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7822 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7823 { 7824 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7825 (__v4sf)__B, 7826 (__v4sf)__C, 7827 (__mmask8)__U, 7828 _MM_FROUND_CUR_DIRECTION); 7829 } 7830 7831 #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ 7832 ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7833 (__v4sf)(__m128)(B), \ 7834 (__v4sf)(__m128)(C), (__mmask8)(U), \ 7835 (int)(R))) 7836 7837 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7838 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7839 { 7840 return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, 7841 (__v4sf)__X, 7842 (__v4sf)__Y, 7843 (__mmask8)__U, 7844 _MM_FROUND_CUR_DIRECTION); 7845 } 7846 7847 #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ 7848 ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ 7849 (__v4sf)(__m128)(X), \ 7850 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7851 (int)(R))) 7852 7853 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7854 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7855 { 7856 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7857 (__v4sf)__A, 7858 -(__v4sf)__B, 7859 (__mmask8)__U, 7860 _MM_FROUND_CUR_DIRECTION); 7861 } 7862 7863 #define _mm_fmsub_round_ss(A, B, C, R) \ 7864 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7865 (__v4sf)(__m128)(B), \ 7866 -(__v4sf)(__m128)(C), (__mmask8)-1, \ 7867 (int)(R))) 7868 7869 #define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ 7870 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), 
\ 7871 (__v4sf)(__m128)(A), \ 7872 -(__v4sf)(__m128)(B), (__mmask8)(U), \ 7873 (int)(R))) 7874 7875 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7876 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7877 { 7878 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7879 (__v4sf)__B, 7880 -(__v4sf)__C, 7881 (__mmask8)__U, 7882 _MM_FROUND_CUR_DIRECTION); 7883 } 7884 7885 #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ 7886 ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7887 (__v4sf)(__m128)(B), \ 7888 -(__v4sf)(__m128)(C), (__mmask8)(U), \ 7889 (int)(R))) 7890 7891 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7892 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7893 { 7894 return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, 7895 (__v4sf)__X, 7896 (__v4sf)__Y, 7897 (__mmask8)__U, 7898 _MM_FROUND_CUR_DIRECTION); 7899 } 7900 7901 #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ 7902 ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ 7903 (__v4sf)(__m128)(X), \ 7904 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7905 (int)(R))) 7906 7907 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7908 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7909 { 7910 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7911 -(__v4sf)__A, 7912 (__v4sf)__B, 7913 (__mmask8)__U, 7914 _MM_FROUND_CUR_DIRECTION); 7915 } 7916 7917 #define _mm_fnmadd_round_ss(A, B, C, R) \ 7918 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7919 -(__v4sf)(__m128)(B), \ 7920 (__v4sf)(__m128)(C), (__mmask8)-1, \ 7921 (int)(R))) 7922 7923 #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ 7924 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7925 -(__v4sf)(__m128)(A), \ 7926 (__v4sf)(__m128)(B), (__mmask8)(U), \ 7927 (int)(R))) 7928 7929 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7930 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7931 { 7932 return 
__builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7933 -(__v4sf)__B, 7934 (__v4sf)__C, 7935 (__mmask8)__U, 7936 _MM_FROUND_CUR_DIRECTION); 7937 } 7938 7939 #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ 7940 ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7941 -(__v4sf)(__m128)(B), \ 7942 (__v4sf)(__m128)(C), (__mmask8)(U), \ 7943 (int)(R))) 7944 7945 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7946 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7947 { 7948 return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, 7949 -(__v4sf)__X, 7950 (__v4sf)__Y, 7951 (__mmask8)__U, 7952 _MM_FROUND_CUR_DIRECTION); 7953 } 7954 7955 #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ 7956 ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ 7957 -(__v4sf)(__m128)(X), \ 7958 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7959 (int)(R))) 7960 7961 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7962 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7963 { 7964 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7965 -(__v4sf)__A, 7966 -(__v4sf)__B, 7967 (__mmask8)__U, 7968 _MM_FROUND_CUR_DIRECTION); 7969 } 7970 7971 #define _mm_fnmsub_round_ss(A, B, C, R) \ 7972 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7973 -(__v4sf)(__m128)(B), \ 7974 -(__v4sf)(__m128)(C), (__mmask8)-1, \ 7975 (int)(R))) 7976 7977 #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ 7978 ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7979 -(__v4sf)(__m128)(A), \ 7980 -(__v4sf)(__m128)(B), (__mmask8)(U), \ 7981 (int)(R))) 7982 7983 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7984 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7985 { 7986 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7987 -(__v4sf)__B, 7988 -(__v4sf)__C, 7989 (__mmask8)__U, 7990 _MM_FROUND_CUR_DIRECTION); 7991 } 7992 7993 #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ 7994 ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), 
\ 7995 -(__v4sf)(__m128)(B), \ 7996 -(__v4sf)(__m128)(C), (__mmask8)(U), \ 7997 (int)(R))) 7998 7999 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8000 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 8001 { 8002 return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, 8003 -(__v4sf)__X, 8004 (__v4sf)__Y, 8005 (__mmask8)__U, 8006 _MM_FROUND_CUR_DIRECTION); 8007 } 8008 8009 #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ 8010 ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ 8011 -(__v4sf)(__m128)(X), \ 8012 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 8013 (int)(R))) 8014 8015 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8016 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8017 { 8018 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8019 (__v2df)__A, 8020 (__v2df)__B, 8021 (__mmask8)__U, 8022 _MM_FROUND_CUR_DIRECTION); 8023 } 8024 8025 #define _mm_fmadd_round_sd(A, B, C, R) \ 8026 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8027 (__v2df)(__m128d)(B), \ 8028 (__v2df)(__m128d)(C), (__mmask8)-1, \ 8029 (int)(R))) 8030 8031 #define _mm_mask_fmadd_round_sd(W, U, A, B, R) \ 8032 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8033 (__v2df)(__m128d)(A), \ 8034 (__v2df)(__m128d)(B), (__mmask8)(U), \ 8035 (int)(R))) 8036 8037 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8038 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8039 { 8040 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8041 (__v2df)__B, 8042 (__v2df)__C, 8043 (__mmask8)__U, 8044 _MM_FROUND_CUR_DIRECTION); 8045 } 8046 8047 #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ 8048 ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8049 (__v2df)(__m128d)(B), \ 8050 (__v2df)(__m128d)(C), (__mmask8)(U), \ 8051 (int)(R))) 8052 8053 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8054 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8055 { 8056 return 
__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, 8057 (__v2df)__X, 8058 (__v2df)__Y, 8059 (__mmask8)__U, 8060 _MM_FROUND_CUR_DIRECTION); 8061 } 8062 8063 #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ 8064 ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8065 (__v2df)(__m128d)(X), \ 8066 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8067 (int)(R))) 8068 8069 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8070 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8071 { 8072 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8073 (__v2df)__A, 8074 -(__v2df)__B, 8075 (__mmask8)__U, 8076 _MM_FROUND_CUR_DIRECTION); 8077 } 8078 8079 #define _mm_fmsub_round_sd(A, B, C, R) \ 8080 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8081 (__v2df)(__m128d)(B), \ 8082 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8083 (int)(R))) 8084 8085 #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ 8086 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8087 (__v2df)(__m128d)(A), \ 8088 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8089 (int)(R))) 8090 8091 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8092 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8093 { 8094 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8095 (__v2df)__B, 8096 -(__v2df)__C, 8097 (__mmask8)__U, 8098 _MM_FROUND_CUR_DIRECTION); 8099 } 8100 8101 #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ 8102 ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8103 (__v2df)(__m128d)(B), \ 8104 -(__v2df)(__m128d)(C), \ 8105 (__mmask8)(U), (int)(R))) 8106 8107 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8108 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8109 { 8110 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8111 (__v2df)__X, 8112 (__v2df)__Y, 8113 (__mmask8)__U, 8114 _MM_FROUND_CUR_DIRECTION); 8115 } 8116 8117 #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ 8118 
((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8119 (__v2df)(__m128d)(X), \ 8120 (__v2df)(__m128d)(Y), \ 8121 (__mmask8)(U), (int)(R))) 8122 8123 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8124 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8125 { 8126 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8127 -(__v2df)__A, 8128 (__v2df)__B, 8129 (__mmask8)__U, 8130 _MM_FROUND_CUR_DIRECTION); 8131 } 8132 8133 #define _mm_fnmadd_round_sd(A, B, C, R) \ 8134 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8135 -(__v2df)(__m128d)(B), \ 8136 (__v2df)(__m128d)(C), (__mmask8)-1, \ 8137 (int)(R))) 8138 8139 #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ 8140 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8141 -(__v2df)(__m128d)(A), \ 8142 (__v2df)(__m128d)(B), (__mmask8)(U), \ 8143 (int)(R))) 8144 8145 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8146 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8147 { 8148 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8149 -(__v2df)__B, 8150 (__v2df)__C, 8151 (__mmask8)__U, 8152 _MM_FROUND_CUR_DIRECTION); 8153 } 8154 8155 #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ 8156 ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8157 -(__v2df)(__m128d)(B), \ 8158 (__v2df)(__m128d)(C), (__mmask8)(U), \ 8159 (int)(R))) 8160 8161 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8162 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8163 { 8164 return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, 8165 -(__v2df)__X, 8166 (__v2df)__Y, 8167 (__mmask8)__U, 8168 _MM_FROUND_CUR_DIRECTION); 8169 } 8170 8171 #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ 8172 ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8173 -(__v2df)(__m128d)(X), \ 8174 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8175 (int)(R))) 8176 8177 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8178 _mm_mask_fnmsub_sd 
(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8179 { 8180 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8181 -(__v2df)__A, 8182 -(__v2df)__B, 8183 (__mmask8)__U, 8184 _MM_FROUND_CUR_DIRECTION); 8185 } 8186 8187 #define _mm_fnmsub_round_sd(A, B, C, R) \ 8188 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8189 -(__v2df)(__m128d)(B), \ 8190 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8191 (int)(R))) 8192 8193 #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ 8194 ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8195 -(__v2df)(__m128d)(A), \ 8196 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8197 (int)(R))) 8198 8199 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8200 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8201 { 8202 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8203 -(__v2df)__B, 8204 -(__v2df)__C, 8205 (__mmask8)__U, 8206 _MM_FROUND_CUR_DIRECTION); 8207 } 8208 8209 #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ 8210 ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8211 -(__v2df)(__m128d)(B), \ 8212 -(__v2df)(__m128d)(C), \ 8213 (__mmask8)(U), \ 8214 (int)(R))) 8215 8216 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8217 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8218 { 8219 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8220 -(__v2df)__X, 8221 (__v2df)__Y, 8222 (__mmask8)__U, 8223 _MM_FROUND_CUR_DIRECTION); 8224 } 8225 8226 #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ 8227 ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8228 -(__v2df)(__m128d)(X), \ 8229 (__v2df)(__m128d)(Y), \ 8230 (__mmask8)(U), (int)(R))) 8231 8232 #define _mm512_permutex_pd(X, C) \ 8233 ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) 8234 8235 #define _mm512_mask_permutex_pd(W, U, X, C) \ 8236 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8237 (__v8df)_mm512_permutex_pd((X), (C)), \ 8238 (__v8df)(__m512d)(W))) 8239 
8240 #define _mm512_maskz_permutex_pd(U, X, C) \ 8241 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8242 (__v8df)_mm512_permutex_pd((X), (C)), \ 8243 (__v8df)_mm512_setzero_pd())) 8244 8245 #define _mm512_permutex_epi64(X, C) \ 8246 ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) 8247 8248 #define _mm512_mask_permutex_epi64(W, U, X, C) \ 8249 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8250 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8251 (__v8di)(__m512i)(W))) 8252 8253 #define _mm512_maskz_permutex_epi64(U, X, C) \ 8254 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8255 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8256 (__v8di)_mm512_setzero_si512())) 8257 8258 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8259 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) 8260 { 8261 return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); 8262 } 8263 8264 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8265 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) 8266 { 8267 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8268 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8269 (__v8df)__W); 8270 } 8271 8272 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8273 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) 8274 { 8275 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8276 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8277 (__v8df)_mm512_setzero_pd()); 8278 } 8279 8280 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8281 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) 8282 { 8283 return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); 8284 } 8285 8286 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8287 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) 8288 { 8289 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8290 (__v8di)_mm512_permutexvar_epi64(__X, __Y), 8291 (__v8di)_mm512_setzero_si512()); 8292 } 8293 
8294 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8295 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, 8296 __m512i __Y) 8297 { 8298 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8299 (__v8di)_mm512_permutexvar_epi64(__X, __Y), 8300 (__v8di)__W); 8301 } 8302 8303 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8304 _mm512_permutexvar_ps (__m512i __X, __m512 __Y) 8305 { 8306 return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); 8307 } 8308 8309 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8310 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) 8311 { 8312 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8313 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8314 (__v16sf)__W); 8315 } 8316 8317 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8318 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) 8319 { 8320 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8321 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8322 (__v16sf)_mm512_setzero_ps()); 8323 } 8324 8325 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8326 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) 8327 { 8328 return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); 8329 } 8330 8331 #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 8332 8333 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8334 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) 8335 { 8336 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8337 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8338 (__v16si)_mm512_setzero_si512()); 8339 } 8340 8341 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8342 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, 8343 __m512i __Y) 8344 { 8345 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8346 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8347 (__v16si)__W); 8348 } 8349 8350 #define _mm512_mask_permutevar_epi32 
_mm512_mask_permutexvar_epi32 8351 8352 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8353 _mm512_kand (__mmask16 __A, __mmask16 __B) 8354 { 8355 return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); 8356 } 8357 8358 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8359 _mm512_kandn (__mmask16 __A, __mmask16 __B) 8360 { 8361 return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); 8362 } 8363 8364 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8365 _mm512_kor (__mmask16 __A, __mmask16 __B) 8366 { 8367 return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); 8368 } 8369 8370 static __inline__ int __DEFAULT_FN_ATTRS 8371 _mm512_kortestc (__mmask16 __A, __mmask16 __B) 8372 { 8373 return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); 8374 } 8375 8376 static __inline__ int __DEFAULT_FN_ATTRS 8377 _mm512_kortestz (__mmask16 __A, __mmask16 __B) 8378 { 8379 return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); 8380 } 8381 8382 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8383 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) 8384 { 8385 return (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8386 } 8387 8388 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8389 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) 8390 { 8391 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8392 } 8393 8394 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8395 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { 8396 *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8397 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8398 } 8399 8400 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8401 _mm512_kunpackb (__mmask16 __A, __mmask16 __B) 8402 { 8403 return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); 8404 } 8405 8406 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8407 _mm512_kxnor (__mmask16 __A, __mmask16 __B) 8408 { 8409 
return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); 8410 } 8411 8412 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8413 _mm512_kxor (__mmask16 __A, __mmask16 __B) 8414 { 8415 return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); 8416 } 8417 8418 #define _kand_mask16 _mm512_kand 8419 #define _kandn_mask16 _mm512_kandn 8420 #define _knot_mask16 _mm512_knot 8421 #define _kor_mask16 _mm512_kor 8422 #define _kxnor_mask16 _mm512_kxnor 8423 #define _kxor_mask16 _mm512_kxor 8424 8425 #define _kshiftli_mask16(A, I) \ 8426 ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I))) 8427 8428 #define _kshiftri_mask16(A, I) \ 8429 ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) 8430 8431 static __inline__ unsigned int __DEFAULT_FN_ATTRS 8432 _cvtmask16_u32(__mmask16 __A) { 8433 return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); 8434 } 8435 8436 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8437 _cvtu32_mask16(unsigned int __A) { 8438 return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); 8439 } 8440 8441 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8442 _load_mask16(__mmask16 *__A) { 8443 return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); 8444 } 8445 8446 static __inline__ void __DEFAULT_FN_ATTRS 8447 _store_mask16(__mmask16 *__A, __mmask16 __B) { 8448 *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); 8449 } 8450 8451 static __inline__ void __DEFAULT_FN_ATTRS512 8452 _mm512_stream_si512 (void * __P, __m512i __A) 8453 { 8454 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8455 __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); 8456 } 8457 8458 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8459 _mm512_stream_load_si512 (void const *__P) 8460 { 8461 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8462 return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); 8463 } 8464 8465 static __inline__ void 
__DEFAULT_FN_ATTRS512 8466 _mm512_stream_pd (void *__P, __m512d __A) 8467 { 8468 typedef __v8df __v8df_aligned __attribute__((aligned(64))); 8469 __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); 8470 } 8471 8472 static __inline__ void __DEFAULT_FN_ATTRS512 8473 _mm512_stream_ps (void *__P, __m512 __A) 8474 { 8475 typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); 8476 __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); 8477 } 8478 8479 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8480 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) 8481 { 8482 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8483 (__v8df) __W, 8484 (__mmask8) __U); 8485 } 8486 8487 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8488 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) 8489 { 8490 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8491 (__v8df) 8492 _mm512_setzero_pd (), 8493 (__mmask8) __U); 8494 } 8495 8496 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8497 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8498 { 8499 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8500 (__v8di) __W, 8501 (__mmask8) __U); 8502 } 8503 8504 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8505 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) 8506 { 8507 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8508 (__v8di) 8509 _mm512_setzero_si512 (), 8510 (__mmask8) __U); 8511 } 8512 8513 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8514 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) 8515 { 8516 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8517 (__v16sf) __W, 8518 (__mmask16) __U); 8519 } 8520 8521 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8522 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) 8523 { 8524 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8525 (__v16sf) 8526 
_mm512_setzero_ps (), 8527 (__mmask16) __U); 8528 } 8529 8530 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8531 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 8532 { 8533 return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, 8534 (__v16si) __W, 8535 (__mmask16) __U); 8536 } 8537 8538 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8539 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) 8540 { 8541 return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, 8542 (__v16si) 8543 _mm512_setzero_si512 (), 8544 (__mmask16) __U); 8545 } 8546 8547 #define _mm_cmp_round_ss_mask(X, Y, P, R) \ 8548 ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8549 (__v4sf)(__m128)(Y), (int)(P), \ 8550 (__mmask8)-1, (int)(R))) 8551 8552 #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ 8553 ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8554 (__v4sf)(__m128)(Y), (int)(P), \ 8555 (__mmask8)(M), (int)(R))) 8556 8557 #define _mm_cmp_ss_mask(X, Y, P) \ 8558 ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8559 (__v4sf)(__m128)(Y), (int)(P), \ 8560 (__mmask8)-1, \ 8561 _MM_FROUND_CUR_DIRECTION)) 8562 8563 #define _mm_mask_cmp_ss_mask(M, X, Y, P) \ 8564 ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8565 (__v4sf)(__m128)(Y), (int)(P), \ 8566 (__mmask8)(M), \ 8567 _MM_FROUND_CUR_DIRECTION)) 8568 8569 #define _mm_cmp_round_sd_mask(X, Y, P, R) \ 8570 ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8571 (__v2df)(__m128d)(Y), (int)(P), \ 8572 (__mmask8)-1, (int)(R))) 8573 8574 #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ 8575 ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8576 (__v2df)(__m128d)(Y), (int)(P), \ 8577 (__mmask8)(M), (int)(R))) 8578 8579 #define _mm_cmp_sd_mask(X, Y, P) \ 8580 ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8581 (__v2df)(__m128d)(Y), (int)(P), \ 8582 (__mmask8)-1, \ 8583 _MM_FROUND_CUR_DIRECTION)) 8584 8585 #define 
_mm_mask_cmp_sd_mask(M, X, Y, P) \ 8586 ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8587 (__v2df)(__m128d)(Y), (int)(P), \ 8588 (__mmask8)(M), \ 8589 _MM_FROUND_CUR_DIRECTION)) 8590 8591 /* Bit Test */ 8592 8593 static __inline __mmask16 __DEFAULT_FN_ATTRS512 8594 _mm512_test_epi32_mask (__m512i __A, __m512i __B) 8595 { 8596 return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), 8597 _mm512_setzero_si512()); 8598 } 8599 8600 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8601 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8602 { 8603 return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8604 _mm512_setzero_si512()); 8605 } 8606 8607 static __inline __mmask8 __DEFAULT_FN_ATTRS512 8608 _mm512_test_epi64_mask (__m512i __A, __m512i __B) 8609 { 8610 return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), 8611 _mm512_setzero_si512()); 8612 } 8613 8614 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8615 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) 8616 { 8617 return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8618 _mm512_setzero_si512()); 8619 } 8620 8621 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8622 _mm512_testn_epi32_mask (__m512i __A, __m512i __B) 8623 { 8624 return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), 8625 _mm512_setzero_si512()); 8626 } 8627 8628 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8629 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8630 { 8631 return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8632 _mm512_setzero_si512()); 8633 } 8634 8635 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8636 _mm512_testn_epi64_mask (__m512i __A, __m512i __B) 8637 { 8638 return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), 8639 _mm512_setzero_si512()); 8640 } 8641 8642 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8643 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i 
__A, __m512i __B) 8644 { 8645 return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8646 _mm512_setzero_si512()); 8647 } 8648 8649 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8650 _mm512_movehdup_ps (__m512 __A) 8651 { 8652 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8653 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); 8654 } 8655 8656 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8657 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8658 { 8659 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8660 (__v16sf)_mm512_movehdup_ps(__A), 8661 (__v16sf)__W); 8662 } 8663 8664 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8665 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) 8666 { 8667 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8668 (__v16sf)_mm512_movehdup_ps(__A), 8669 (__v16sf)_mm512_setzero_ps()); 8670 } 8671 8672 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8673 _mm512_moveldup_ps (__m512 __A) 8674 { 8675 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8676 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); 8677 } 8678 8679 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8680 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8681 { 8682 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8683 (__v16sf)_mm512_moveldup_ps(__A), 8684 (__v16sf)__W); 8685 } 8686 8687 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8688 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) 8689 { 8690 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8691 (__v16sf)_mm512_moveldup_ps(__A), 8692 (__v16sf)_mm512_setzero_ps()); 8693 } 8694 8695 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8696 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 8697 { 8698 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); 8699 } 8700 8701 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8702 _mm_maskz_move_ss (__mmask8 __U, __m128 
__A, __m128 __B) 8703 { 8704 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), 8705 _mm_setzero_ps()); 8706 } 8707 8708 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8709 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8710 { 8711 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); 8712 } 8713 8714 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8715 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) 8716 { 8717 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), 8718 _mm_setzero_pd()); 8719 } 8720 8721 static __inline__ void __DEFAULT_FN_ATTRS128 8722 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) 8723 { 8724 __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); 8725 } 8726 8727 static __inline__ void __DEFAULT_FN_ATTRS128 8728 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) 8729 { 8730 __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); 8731 } 8732 8733 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8734 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) 8735 { 8736 __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, 8737 (__v4sf)_mm_setzero_ps(), 8738 0, 4, 4, 4); 8739 8740 return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1); 8741 } 8742 8743 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8744 _mm_maskz_load_ss (__mmask8 __U, const float* __A) 8745 { 8746 return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A, 8747 (__v4sf) _mm_setzero_ps(), 8748 __U & 1); 8749 } 8750 8751 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8752 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) 8753 { 8754 __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, 8755 (__v2df)_mm_setzero_pd(), 8756 0, 2); 8757 8758 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1); 8759 } 8760 8761 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8762 _mm_maskz_load_sd 
(__mmask8 __U, const double* __A) 8763 { 8764 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, 8765 (__v2df) _mm_setzero_pd(), 8766 __U & 1); 8767 } 8768 8769 #define _mm512_shuffle_epi32(A, I) \ 8770 ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) 8771 8772 #define _mm512_mask_shuffle_epi32(W, U, A, I) \ 8773 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8774 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8775 (__v16si)(__m512i)(W))) 8776 8777 #define _mm512_maskz_shuffle_epi32(U, A, I) \ 8778 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8779 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8780 (__v16si)_mm512_setzero_si512())) 8781 8782 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8783 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) 8784 { 8785 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8786 (__v8df) __W, 8787 (__mmask8) __U); 8788 } 8789 8790 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8791 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) 8792 { 8793 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8794 (__v8df) _mm512_setzero_pd (), 8795 (__mmask8) __U); 8796 } 8797 8798 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8799 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8800 { 8801 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8802 (__v8di) __W, 8803 (__mmask8) __U); 8804 } 8805 8806 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8807 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) 8808 { 8809 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8810 (__v8di) _mm512_setzero_si512 (), 8811 (__mmask8) __U); 8812 } 8813 8814 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8815 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) 8816 { 8817 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8818 (__v8df) __W, 8819 (__mmask8) __U); 8820 } 8821 8822 static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 8823 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) 8824 { 8825 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8826 (__v8df) _mm512_setzero_pd(), 8827 (__mmask8) __U); 8828 } 8829 8830 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8831 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) 8832 { 8833 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, 8834 (__v8di) __W, 8835 (__mmask8) __U); 8836 } 8837 8838 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8839 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) 8840 { 8841 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, 8842 (__v8di) _mm512_setzero_si512(), 8843 (__mmask8) __U); 8844 } 8845 8846 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8847 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) 8848 { 8849 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, 8850 (__v16sf) __W, 8851 (__mmask16) __U); 8852 } 8853 8854 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8855 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) 8856 { 8857 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, 8858 (__v16sf) _mm512_setzero_ps(), 8859 (__mmask16) __U); 8860 } 8861 8862 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8863 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) 8864 { 8865 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, 8866 (__v16si) __W, 8867 (__mmask16) __U); 8868 } 8869 8870 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8871 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) 8872 { 8873 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, 8874 (__v16si) _mm512_setzero_si512(), 8875 (__mmask16) __U); 8876 } 8877 8878 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8879 _mm512_mask_expand_ps 
(__m512 __W, __mmask16 __U, __m512 __A) 8880 { 8881 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, 8882 (__v16sf) __W, 8883 (__mmask16) __U); 8884 } 8885 8886 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8887 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) 8888 { 8889 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, 8890 (__v16sf) _mm512_setzero_ps(), 8891 (__mmask16) __U); 8892 } 8893 8894 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8895 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 8896 { 8897 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, 8898 (__v16si) __W, 8899 (__mmask16) __U); 8900 } 8901 8902 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8903 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) 8904 { 8905 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, 8906 (__v16si) _mm512_setzero_si512(), 8907 (__mmask16) __U); 8908 } 8909 8910 #define _mm512_cvt_roundps_pd(A, R) \ 8911 ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8912 (__v8df)_mm512_undefined_pd(), \ 8913 (__mmask8)-1, (int)(R))) 8914 8915 #define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ 8916 ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8917 (__v8df)(__m512d)(W), \ 8918 (__mmask8)(U), (int)(R))) 8919 8920 #define _mm512_maskz_cvt_roundps_pd(U, A, R) \ 8921 ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8922 (__v8df)_mm512_setzero_pd(), \ 8923 (__mmask8)(U), (int)(R))) 8924 8925 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8926 _mm512_cvtps_pd (__m256 __A) 8927 { 8928 return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); 8929 } 8930 8931 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8932 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) 8933 { 8934 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8935 (__v8df)_mm512_cvtps_pd(__A), 8936 (__v8df)__W); 8937 } 8938 8939 static __inline__ __m512d __DEFAULT_FN_ATTRS512 
8940 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) 8941 { 8942 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8943 (__v8df)_mm512_cvtps_pd(__A), 8944 (__v8df)_mm512_setzero_pd()); 8945 } 8946 8947 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8948 _mm512_cvtpslo_pd (__m512 __A) 8949 { 8950 return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); 8951 } 8952 8953 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8954 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) 8955 { 8956 return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); 8957 } 8958 8959 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8960 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) 8961 { 8962 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 8963 (__v8df) __A, 8964 (__v8df) __W); 8965 } 8966 8967 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8968 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) 8969 { 8970 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 8971 (__v8df) __A, 8972 (__v8df) _mm512_setzero_pd ()); 8973 } 8974 8975 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8976 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) 8977 { 8978 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 8979 (__v16sf) __A, 8980 (__v16sf) __W); 8981 } 8982 8983 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8984 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) 8985 { 8986 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 8987 (__v16sf) __A, 8988 (__v16sf) _mm512_setzero_ps ()); 8989 } 8990 8991 static __inline__ void __DEFAULT_FN_ATTRS512 8992 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) 8993 { 8994 __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, 8995 (__mmask8) __U); 8996 } 8997 8998 static __inline__ void __DEFAULT_FN_ATTRS512 8999 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) 9000 { 9001 
__builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, 9002 (__mmask8) __U); 9003 } 9004 9005 static __inline__ void __DEFAULT_FN_ATTRS512 9006 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) 9007 { 9008 __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, 9009 (__mmask16) __U); 9010 } 9011 9012 static __inline__ void __DEFAULT_FN_ATTRS512 9013 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) 9014 { 9015 __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, 9016 (__mmask16) __U); 9017 } 9018 9019 #define _mm_cvt_roundsd_ss(A, B, R) \ 9020 ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9021 (__v2df)(__m128d)(B), \ 9022 (__v4sf)_mm_undefined_ps(), \ 9023 (__mmask8)-1, (int)(R))) 9024 9025 #define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ 9026 ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9027 (__v2df)(__m128d)(B), \ 9028 (__v4sf)(__m128)(W), \ 9029 (__mmask8)(U), (int)(R))) 9030 9031 #define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ 9032 ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9033 (__v2df)(__m128d)(B), \ 9034 (__v4sf)_mm_setzero_ps(), \ 9035 (__mmask8)(U), (int)(R))) 9036 9037 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9038 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) 9039 { 9040 return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, 9041 (__v2df)__B, 9042 (__v4sf)__W, 9043 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9044 } 9045 9046 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9047 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) 9048 { 9049 return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, 9050 (__v2df)__B, 9051 (__v4sf)_mm_setzero_ps(), 9052 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9053 } 9054 9055 #define _mm_cvtss_i32 _mm_cvtss_si32 9056 #define _mm_cvtsd_i32 _mm_cvtsd_si32 9057 #define _mm_cvti32_sd _mm_cvtsi32_sd 9058 #define _mm_cvti32_ss _mm_cvtsi32_ss 
9059 #ifdef __x86_64__ 9060 #define _mm_cvtss_i64 _mm_cvtss_si64 9061 #define _mm_cvtsd_i64 _mm_cvtsd_si64 9062 #define _mm_cvti64_sd _mm_cvtsi64_sd 9063 #define _mm_cvti64_ss _mm_cvtsi64_ss 9064 #endif 9065 9066 #ifdef __x86_64__ 9067 #define _mm_cvt_roundi64_sd(A, B, R) \ 9068 ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ 9069 (int)(R))) 9070 9071 #define _mm_cvt_roundsi64_sd(A, B, R) \ 9072 ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ 9073 (int)(R))) 9074 #endif 9075 9076 #define _mm_cvt_roundsi32_ss(A, B, R) \ 9077 ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) 9078 9079 #define _mm_cvt_roundi32_ss(A, B, R) \ 9080 ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) 9081 9082 #ifdef __x86_64__ 9083 #define _mm_cvt_roundsi64_ss(A, B, R) \ 9084 ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ 9085 (int)(R))) 9086 9087 #define _mm_cvt_roundi64_ss(A, B, R) \ 9088 ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ 9089 (int)(R))) 9090 #endif 9091 9092 #define _mm_cvt_roundss_sd(A, B, R) \ 9093 ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9094 (__v4sf)(__m128)(B), \ 9095 (__v2df)_mm_undefined_pd(), \ 9096 (__mmask8)-1, (int)(R))) 9097 9098 #define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ 9099 ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9100 (__v4sf)(__m128)(B), \ 9101 (__v2df)(__m128d)(W), \ 9102 (__mmask8)(U), (int)(R))) 9103 9104 #define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ 9105 ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9106 (__v4sf)(__m128)(B), \ 9107 (__v2df)_mm_setzero_pd(), \ 9108 (__mmask8)(U), (int)(R))) 9109 9110 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9111 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) 9112 { 9113 return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, 9114 (__v4sf)__B, 
9115 (__v2df)__W, 9116 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9117 } 9118 9119 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9120 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) 9121 { 9122 return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, 9123 (__v4sf)__B, 9124 (__v2df)_mm_setzero_pd(), 9125 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9126 } 9127 9128 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9129 _mm_cvtu32_sd (__m128d __A, unsigned __B) 9130 { 9131 __A[0] = __B; 9132 return __A; 9133 } 9134 9135 #ifdef __x86_64__ 9136 #define _mm_cvt_roundu64_sd(A, B, R) \ 9137 ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ 9138 (unsigned long long)(B), (int)(R))) 9139 9140 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9141 _mm_cvtu64_sd (__m128d __A, unsigned long long __B) 9142 { 9143 __A[0] = __B; 9144 return __A; 9145 } 9146 #endif 9147 9148 #define _mm_cvt_roundu32_ss(A, B, R) \ 9149 ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ 9150 (int)(R))) 9151 9152 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9153 _mm_cvtu32_ss (__m128 __A, unsigned __B) 9154 { 9155 __A[0] = __B; 9156 return __A; 9157 } 9158 9159 #ifdef __x86_64__ 9160 #define _mm_cvt_roundu64_ss(A, B, R) \ 9161 ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ 9162 (unsigned long long)(B), (int)(R))) 9163 9164 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9165 _mm_cvtu64_ss (__m128 __A, unsigned long long __B) 9166 { 9167 __A[0] = __B; 9168 return __A; 9169 } 9170 #endif 9171 9172 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9173 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) 9174 { 9175 return (__m512i) __builtin_ia32_selectd_512(__M, 9176 (__v16si) _mm512_set1_epi32(__A), 9177 (__v16si) __O); 9178 } 9179 9180 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9181 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) 9182 { 9183 return (__m512i) __builtin_ia32_selectq_512(__M, 9184 (__v8di) 
_mm512_set1_epi64(__A), 9185 (__v8di) __O); 9186 } 9187 9188 static __inline __m512i __DEFAULT_FN_ATTRS512 9189 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, 9190 char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, 9191 char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, 9192 char __e46, char __e45, char __e44, char __e43, char __e42, char __e41, 9193 char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, 9194 char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, 9195 char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, 9196 char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, 9197 char __e16, char __e15, char __e14, char __e13, char __e12, char __e11, 9198 char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, 9199 char __e4, char __e3, char __e2, char __e1, char __e0) { 9200 9201 return __extension__ (__m512i)(__v64qi) 9202 {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, 9203 __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, 9204 __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, 9205 __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31, 9206 __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39, 9207 __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47, 9208 __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55, 9209 __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; 9210 } 9211 9212 static __inline __m512i __DEFAULT_FN_ATTRS512 9213 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, 9214 short __e27, short __e26, short __e25, short __e24, short __e23, 9215 short __e22, short __e21, short __e20, short __e19, short __e18, 9216 short __e17, short __e16, short __e15, short __e14, short __e13, 9217 short __e12, short __e11, short __e10, short __e9, short __e8, 9218 short __e7, short __e6, short __e5, short __e4, short __e3, 9219 short __e2, short __e1, short __e0) { 9220 
return __extension__ (__m512i)(__v32hi) 9221 {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, 9222 __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, 9223 __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, 9224 __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; 9225 } 9226 9227 static __inline __m512i __DEFAULT_FN_ATTRS512 9228 _mm512_set_epi32 (int __A, int __B, int __C, int __D, 9229 int __E, int __F, int __G, int __H, 9230 int __I, int __J, int __K, int __L, 9231 int __M, int __N, int __O, int __P) 9232 { 9233 return __extension__ (__m512i)(__v16si) 9234 { __P, __O, __N, __M, __L, __K, __J, __I, 9235 __H, __G, __F, __E, __D, __C, __B, __A }; 9236 } 9237 9238 #define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ 9239 e8,e9,e10,e11,e12,e13,e14,e15) \ 9240 _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ 9241 (e5),(e4),(e3),(e2),(e1),(e0)) 9242 9243 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9244 _mm512_set_epi64 (long long __A, long long __B, long long __C, 9245 long long __D, long long __E, long long __F, 9246 long long __G, long long __H) 9247 { 9248 return __extension__ (__m512i) (__v8di) 9249 { __H, __G, __F, __E, __D, __C, __B, __A }; 9250 } 9251 9252 #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ 9253 _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) 9254 9255 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9256 _mm512_set_pd (double __A, double __B, double __C, double __D, 9257 double __E, double __F, double __G, double __H) 9258 { 9259 return __extension__ (__m512d) 9260 { __H, __G, __F, __E, __D, __C, __B, __A }; 9261 } 9262 9263 #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ 9264 _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) 9265 9266 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9267 _mm512_set_ps (float __A, float __B, float __C, float __D, 9268 float __E, float __F, float __G, float __H, 9269 float __I, float __J, float __K, float __L, 9270 float __M, float __N, float __O, float 
__P) 9271 { 9272 return __extension__ (__m512) 9273 { __P, __O, __N, __M, __L, __K, __J, __I, 9274 __H, __G, __F, __E, __D, __C, __B, __A }; 9275 } 9276 9277 #define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ 9278 _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ 9279 (e4),(e3),(e2),(e1),(e0)) 9280 9281 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9282 _mm512_abs_ps(__m512 __A) 9283 { 9284 return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; 9285 } 9286 9287 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9288 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) 9289 { 9290 return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; 9291 } 9292 9293 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9294 _mm512_abs_pd(__m512d __A) 9295 { 9296 return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; 9297 } 9298 9299 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9300 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) 9301 { 9302 return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); 9303 } 9304 9305 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as 9306 * outputs. This class of vector operation forms the basis of many scientific 9307 * computations. In vector-reduction arithmetic, the evaluation order is 9308 * independent of the order of the input elements of V. 9309 9310 * For floating-point intrinsics: 9311 * 1. When using fadd/fmul intrinsics, the order of operations within the 9312 * vector is unspecified (associative math). 9313 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector 9314 * produce unspecified results. 9315 9316 * Used bisection method. At each step, we partition the vector with previous 9317 * step in half, and the operation is performed on its two halves. 
9318 * This takes log2(n) steps where n is the number of elements in the vector. 9319 */ 9320 9321 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { 9322 return __builtin_reduce_add((__v8di)__W); 9323 } 9324 9325 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { 9326 return __builtin_reduce_mul((__v8di)__W); 9327 } 9328 9329 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { 9330 return __builtin_reduce_and((__v8di)__W); 9331 } 9332 9333 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { 9334 return __builtin_reduce_or((__v8di)__W); 9335 } 9336 9337 static __inline__ long long __DEFAULT_FN_ATTRS512 9338 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { 9339 __W = _mm512_maskz_mov_epi64(__M, __W); 9340 return __builtin_reduce_add((__v8di)__W); 9341 } 9342 9343 static __inline__ long long __DEFAULT_FN_ATTRS512 9344 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { 9345 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); 9346 return __builtin_reduce_mul((__v8di)__W); 9347 } 9348 9349 static __inline__ long long __DEFAULT_FN_ATTRS512 9350 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { 9351 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W); 9352 return __builtin_reduce_and((__v8di)__W); 9353 } 9354 9355 static __inline__ long long __DEFAULT_FN_ATTRS512 9356 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { 9357 __W = _mm512_maskz_mov_epi64(__M, __W); 9358 return __builtin_reduce_or((__v8di)__W); 9359 } 9360 9361 // -0.0 is used to ignore the start value since it is the neutral value of 9362 // floating point addition. 
For more information, please refer to 9363 // https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic 9364 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { 9365 return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); 9366 } 9367 9368 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { 9369 return __builtin_ia32_reduce_fmul_pd512(1.0, __W); 9370 } 9371 9372 static __inline__ double __DEFAULT_FN_ATTRS512 9373 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { 9374 __W = _mm512_maskz_mov_pd(__M, __W); 9375 return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); 9376 } 9377 9378 static __inline__ double __DEFAULT_FN_ATTRS512 9379 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { 9380 __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); 9381 return __builtin_ia32_reduce_fmul_pd512(1.0, __W); 9382 } 9383 9384 static __inline__ int __DEFAULT_FN_ATTRS512 9385 _mm512_reduce_add_epi32(__m512i __W) { 9386 return __builtin_reduce_add((__v16si)__W); 9387 } 9388 9389 static __inline__ int __DEFAULT_FN_ATTRS512 9390 _mm512_reduce_mul_epi32(__m512i __W) { 9391 return __builtin_reduce_mul((__v16si)__W); 9392 } 9393 9394 static __inline__ int __DEFAULT_FN_ATTRS512 9395 _mm512_reduce_and_epi32(__m512i __W) { 9396 return __builtin_reduce_and((__v16si)__W); 9397 } 9398 9399 static __inline__ int __DEFAULT_FN_ATTRS512 9400 _mm512_reduce_or_epi32(__m512i __W) { 9401 return __builtin_reduce_or((__v16si)__W); 9402 } 9403 9404 static __inline__ int __DEFAULT_FN_ATTRS512 9405 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { 9406 __W = _mm512_maskz_mov_epi32(__M, __W); 9407 return __builtin_reduce_add((__v16si)__W); 9408 } 9409 9410 static __inline__ int __DEFAULT_FN_ATTRS512 9411 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { 9412 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); 9413 return __builtin_reduce_mul((__v16si)__W); 9414 } 9415 9416 static __inline__ int 
__DEFAULT_FN_ATTRS512 9417 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { 9418 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W); 9419 return __builtin_reduce_and((__v16si)__W); 9420 } 9421 9422 static __inline__ int __DEFAULT_FN_ATTRS512 9423 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { 9424 __W = _mm512_maskz_mov_epi32(__M, __W); 9425 return __builtin_reduce_or((__v16si)__W); 9426 } 9427 9428 static __inline__ float __DEFAULT_FN_ATTRS512 9429 _mm512_reduce_add_ps(__m512 __W) { 9430 return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); 9431 } 9432 9433 static __inline__ float __DEFAULT_FN_ATTRS512 9434 _mm512_reduce_mul_ps(__m512 __W) { 9435 return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); 9436 } 9437 9438 static __inline__ float __DEFAULT_FN_ATTRS512 9439 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { 9440 __W = _mm512_maskz_mov_ps(__M, __W); 9441 return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); 9442 } 9443 9444 static __inline__ float __DEFAULT_FN_ATTRS512 9445 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { 9446 __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); 9447 return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); 9448 } 9449 9450 static __inline__ long long __DEFAULT_FN_ATTRS512 9451 _mm512_reduce_max_epi64(__m512i __V) { 9452 return __builtin_reduce_max((__v8di)__V); 9453 } 9454 9455 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9456 _mm512_reduce_max_epu64(__m512i __V) { 9457 return __builtin_reduce_max((__v8du)__V); 9458 } 9459 9460 static __inline__ long long __DEFAULT_FN_ATTRS512 9461 _mm512_reduce_min_epi64(__m512i __V) { 9462 return __builtin_reduce_min((__v8di)__V); 9463 } 9464 9465 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9466 _mm512_reduce_min_epu64(__m512i __V) { 9467 return __builtin_reduce_min((__v8du)__V); 9468 } 9469 9470 static __inline__ long long __DEFAULT_FN_ATTRS512 9471 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { 9472 __V = 
_mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); 9473 return __builtin_reduce_max((__v8di)__V); 9474 } 9475 /* Masked 64-bit reductions: lanes not selected by __M are first overwritten with a value that cannot change the result (0 for unsigned max, __LONG_LONG_MAX__ for signed min, all-ones for unsigned min), assuming the usual merge/zero-masking behaviour of the _mm512_mask[z]_mov_epi64 helpers defined earlier in this header; then every lane is reduced. */ 9476 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9477 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { 9478 __V = _mm512_maskz_mov_epi64(__M, __V); 9479 return __builtin_reduce_max((__v8du)__V); 9480 } 9481 9482 static __inline__ long long __DEFAULT_FN_ATTRS512 9483 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { 9484 __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); 9485 return __builtin_reduce_min((__v8di)__V); 9486 } 9487 9488 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9489 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { 9490 __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V); 9491 return __builtin_reduce_min((__v8du)__V); 9492 } /* Unmasked 32-bit reductions: __V is reinterpreted as 16 signed (__v16si) or unsigned (__v16su) ints and folded with the generic vector reduction builtins. */ 9493 static __inline__ int __DEFAULT_FN_ATTRS512 9494 _mm512_reduce_max_epi32(__m512i __V) { 9495 return __builtin_reduce_max((__v16si)__V); 9496 } 9497 9498 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9499 _mm512_reduce_max_epu32(__m512i __V) { 9500 return __builtin_reduce_max((__v16su)__V); 9501 } 9502 9503 static __inline__ int __DEFAULT_FN_ATTRS512 9504 _mm512_reduce_min_epi32(__m512i __V) { 9505 return __builtin_reduce_min((__v16si)__V); 9506 } 9507 9508 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9509 _mm512_reduce_min_epu32(__m512i __V) { 9510 return __builtin_reduce_min((__v16su)__V); 9511 } 9512 /* Masked 32-bit reductions follow the same fill-then-reduce pattern: -__INT_MAX__ - 1 is the fill for signed max, and a zero-masking move is used for unsigned max. */ 9513 static __inline__ int __DEFAULT_FN_ATTRS512 9514 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { 9515 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); 9516 return __builtin_reduce_max((__v16si)__V); 9517 } 9518 9519 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9520 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { 9521 __V = _mm512_maskz_mov_epi32(__M, __V); 9522 return __builtin_reduce_max((__v16su)__V); 9523 } 9524 9525 static
__inline__ int __DEFAULT_FN_ATTRS512 9526 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { 9527 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); 9528 return __builtin_reduce_min((__v16si)__V); 9529 } 9530 9531 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9532 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { 9533 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V); 9534 return __builtin_reduce_min((__v16su)__V); 9535 } 9536 /* Floating-point reductions go through the __builtin_ia32_reduce_fmax/fmin builtins; the masked forms fill lanes not selected by __M with -infinity (max) or +infinity (min) before reducing, assuming merge-masking behaviour of _mm512_mask_mov_pd/_ps defined earlier in this header. */ 9537 static __inline__ double __DEFAULT_FN_ATTRS512 9538 _mm512_reduce_max_pd(__m512d __V) { 9539 return __builtin_ia32_reduce_fmax_pd512(__V); 9540 } 9541 9542 static __inline__ double __DEFAULT_FN_ATTRS512 9543 _mm512_reduce_min_pd(__m512d __V) { 9544 return __builtin_ia32_reduce_fmin_pd512(__V); 9545 } 9546 9547 static __inline__ double __DEFAULT_FN_ATTRS512 9548 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { 9549 __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); 9550 return __builtin_ia32_reduce_fmax_pd512(__V); 9551 } 9552 9553 static __inline__ double __DEFAULT_FN_ATTRS512 9554 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { 9555 __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); 9556 return __builtin_ia32_reduce_fmin_pd512(__V); 9557 } 9558 9559 static __inline__ float __DEFAULT_FN_ATTRS512 9560 _mm512_reduce_max_ps(__m512 __V) { 9561 return __builtin_ia32_reduce_fmax_ps512(__V); 9562 } 9563 9564 static __inline__ float __DEFAULT_FN_ATTRS512 9565 _mm512_reduce_min_ps(__m512 __V) { 9566 return __builtin_ia32_reduce_fmin_ps512(__V); 9567 } 9568 9569 static __inline__ float __DEFAULT_FN_ATTRS512 9570 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { 9571 __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); 9572 return __builtin_ia32_reduce_fmax_ps512(__V); 9573 } 9574 9575 static __inline__ float __DEFAULT_FN_ATTRS512 9576 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { 9577 __V = 
_mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); 9578 return __builtin_ia32_reduce_fmin_ps512(__V); 9579 } 9580 9581 /// Moves the least significant 32 bits of a vector of [16 x i32] to a 9582 /// 32-bit signed integer value. 9583 /// 9584 /// \headerfile <x86intrin.h> 9585 /// 9586 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 9587 /// 9588 /// \param __A 9589 /// A vector of [16 x i32]. The least significant 32 bits are moved to the 9590 /// destination. 9591 /// \returns A 32-bit signed integer containing the moved value. 9592 static __inline__ int __DEFAULT_FN_ATTRS512 9593 _mm512_cvtsi512_si32(__m512i __A) { 9594 __v16si __b = (__v16si)__A; 9595 return __b[0]; 9596 } 9597 9598 /// Loads 8 double-precision (64-bit) floating-point elements stored at memory 9599 /// locations starting at location \a base_addr at packed 32-bit integer indices 9600 /// stored in the lower half of \a vindex scaled by \a scale and stores them in dst. 9601 /// 9602 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions. 9603 /// 9604 /// \code{.operation} 9605 /// FOR j := 0 to 7 9606 /// i := j*64 9607 /// m := j*32 9608 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9609 /// dst[i+63:i] := MEM[addr+63:addr] 9610 /// ENDFOR 9611 /// dst[MAX:512] := 0 9612 /// \endcode 9613 #define _mm512_i32logather_pd(vindex, base_addr, scale) \ 9614 _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale)) 9615 9616 /// Loads 8 double-precision (64-bit) floating-point elements from memory 9617 /// starting at location \a base_addr at packed 32-bit integer indices stored in 9618 /// the lower half of \a vindex scaled by \a scale into dst using writemask 9619 /// \a mask (elements are copied from \a src when the corresponding mask bit is 9620 /// not set). 9621 /// 9622 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions. 
9623 /// 9624 /// \code{.operation} 9625 /// FOR j := 0 to 7 9626 /// i := j*64 9627 /// m := j*32 9628 /// IF mask[j] 9629 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9630 /// dst[i+63:i] := MEM[addr+63:addr] 9631 /// ELSE 9632 /// dst[i+63:i] := src[i+63:i] 9633 /// FI 9634 /// ENDFOR 9635 /// dst[MAX:512] := 0 9636 /// \endcode 9637 #define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \ 9638 _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \ 9639 (base_addr), (scale)) 9640 9641 /// Loads 8 64-bit integer elements from memory starting at location \a base_addr 9642 /// at packed 32-bit integer indices stored in the lower half of \a vindex 9643 /// scaled by \a scale and stores them in dst. 9644 /// 9645 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions. 9646 /// 9647 /// \code{.operation} 9648 /// FOR j := 0 to 7 9649 /// i := j*64 9650 /// m := j*32 9651 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9652 /// dst[i+63:i] := MEM[addr+63:addr] 9653 /// ENDFOR 9654 /// dst[MAX:512] := 0 9655 /// \endcode 9656 #define _mm512_i32logather_epi64(vindex, base_addr, scale) \ 9657 _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale)) 9658 9659 /// Loads 8 64-bit integer elements from memory starting at location \a base_addr 9660 /// at packed 32-bit integer indices stored in the lower half of \a vindex 9661 /// scaled by \a scale and stores them in dst using writemask \a mask (elements 9662 /// are copied from \a src when the corresponding mask bit is not set). 9663 /// 9664 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions. 
9665 /// 9666 /// \code{.operation} 9667 /// FOR j := 0 to 7 9668 /// i := j*64 9669 /// m := j*32 9670 /// IF mask[j] 9671 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9672 /// dst[i+63:i] := MEM[addr+63:addr] 9673 /// ELSE 9674 /// dst[i+63:i] := src[i+63:i] 9675 /// FI 9676 /// ENDFOR 9677 /// dst[MAX:512] := 0 9678 /// \endcode 9679 #define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \ 9680 _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \ 9681 (base_addr), (scale)) 9682 9683 /// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 9684 /// to memory locations starting at location \a base_addr at packed 32-bit 9685 /// integer indices stored in the lower half of \a vindex scaled by \a scale. 9686 /// 9687 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions. 9688 /// 9689 /// \code{.operation} 9690 /// FOR j := 0 to 7 9691 /// i := j*64 9692 /// m := j*32 9693 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9694 /// MEM[addr+63:addr] := v1[i+63:i] 9695 /// ENDFOR 9696 /// \endcode 9697 #define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \ 9698 _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale)) 9699 9700 /// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 9701 /// to memory locations starting at location \a base_addr at packed 32-bit 9702 /// integer indices stored in the lower half of \a vindex scaled by \a scale. Only those elements 9703 /// whose corresponding mask bit is set in writemask \a mask are written to 9704 /// memory. 9705 /// 9706 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions. 
9707 /// 9708 /// \code{.operation} 9709 /// FOR j := 0 to 7 9710 /// i := j*64 9711 /// m := j*32 9712 /// IF mask[j] 9713 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9714 /// MEM[addr+63:addr] := v1[i+63:i] 9715 /// FI 9716 /// ENDFOR 9717 /// \endcode 9718 #define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \ 9719 _mm512_mask_i32scatter_pd((base_addr), (mask), \ 9720 _mm512_castsi512_si256(vindex), (v1), (scale)) 9721 9722 /// Stores 8 packed 64-bit integer elements from \a v1 to 9723 /// memory locations starting at location \a base_addr at packed 32-bit integer 9724 /// indices stored in the lower half of \a vindex scaled by \a scale. 9725 /// 9726 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions. 9727 /// 9728 /// \code{.operation} 9729 /// FOR j := 0 to 7 9730 /// i := j*64 9731 /// m := j*32 9732 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9733 /// MEM[addr+63:addr] := v1[i+63:i] 9734 /// ENDFOR 9735 /// \endcode 9736 #define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \ 9737 _mm512_i32scatter_epi64((base_addr), \ 9738 _mm512_castsi512_si256(vindex), (v1), (scale)) 9739 9740 /// Stores 8 packed 64-bit integer elements from \a v1 to 9741 /// memory locations starting at location \a base_addr at packed 32-bit integer 9742 /// indices stored in the lower half of \a vindex scaled by \a scale using writemask \a mask (elements 9743 /// whose corresponding mask bit is not set are not written to memory). 9744 /// 9745 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions. 
9746 /// 9747 /// \code{.operation} 9748 /// FOR j := 0 to 7 9749 /// i := j*64 9750 /// m := j*32 9751 /// IF mask[j] 9752 /// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 9753 /// MEM[addr+63:addr] := v1[i+63:i] 9754 /// FI 9755 /// ENDFOR 9756 /// \endcode 9757 #define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \ 9758 _mm512_mask_i32scatter_epi64((base_addr), (mask), \ 9759 _mm512_castsi512_si256(vindex), (v1), (scale)) 9760 9761 #undef __DEFAULT_FN_ATTRS512 9762 #undef __DEFAULT_FN_ATTRS128 9763 #undef __DEFAULT_FN_ATTRS 9764 9765 #endif /* __AVX512FINTRIN_H */ 9766