/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H

/* Element-typed views of a 512-bit (64-byte) vector; the count and element
   type are encoded in the name (e.g. __v16si is 16 signed ints). These are
   internal aliases used to give the generic __m512* types below a concrete
   element type for casts and builtins. */
typedef char __v64qi __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

/* Unsigned types */
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

/* The public 512-bit vector types: float, double, and integer lanes,
   64-byte aligned. */
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

/* "_u" variants with the alignment requirement relaxed to 1 byte
   (for unaligned memory access). */
typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Write-mask types: one bit per destination lane (8 or 16 lanes). */
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;

/* Rounding mode macros.
   Passed as the rounding (R) argument of the *_round_* intrinsics below;
   _MM_FROUND_CUR_DIRECTION keeps whatever rounding mode is currently in
   effect (see e.g. its use in _mm512_max_pd). */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Constants for integer comparison predicates.
   GE/GT have no encodings of their own; they are expressed as the negations
   NLT/NLE via the #defines inside the enum. */
typedef enum {
    _MM_CMPINT_EQ,      /* Equal */
    _MM_CMPINT_LT,      /* Less than */
    _MM_CMPINT_LE,      /* Less than or Equal */
    _MM_CMPINT_UNUSED,
    _MM_CMPINT_NE,      /* Not Equal */
    _MM_CMPINT_NLT,     /* Not Less than */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
    _MM_CMPINT_NLE      /* Not Less than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
} _MM_CMPINT_ENUM;

/* Shuffle-control constants. Each name encodes four 2-bit lane selectors,
   most-significant position first: letters A..D select source lane 0..3,
   so _MM_PERM_ABCD == (0<<6)|(1<<4)|(2<<2)|3 == 0x1B. */
typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
  _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;

/* Target interval for the normalized mantissa in the getmant intrinsics. */
typedef enum
{
  _MM_MANT_NORM_1_2,      /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,     /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,     /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;

/* Sign treatment for the getmant intrinsics. */
typedef enum
{
  _MM_MANT_SIGN_src,      /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero,     /* sign = 0             */
  _MM_MANT_SIGN_nan       /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;

/* Define the default attributes for the functions in this file.
   All intrinsics are forced inline, invisible to the debugger, and require
   the "avx512f" target feature; the 512/128 variants additionally declare
   the minimum vector width they need. */
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

/* Create vectors with repeated elements */

/* Return a 512-bit integer vector with all bits zero. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_setzero_si512(void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

#define _mm512_setzero_epi32 _mm512_setzero_si512

/* The _mm512_undefined* family returns a vector with unspecified contents;
   callers must not rely on any particular value. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_undefined_pd(void)
{
  return (__m512d)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined_ps(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_undefined_epi32(void)
{
  return (__m512i)__builtin_ia32_undef512();
}

/* Replicate the low 32-bit element of __A into all 16 lanes. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastd_epi32 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Masked form: lanes whose bit in __M is 0 keep the value from __O. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) __O);
}

/* Zero-masked form: lanes whose bit in __M is 0 are set to zero. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) _mm512_setzero_si512());
}

/* Replicate the low 64-bit element of __A into all 8 lanes. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastq_epi64 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) __O);

}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) _mm512_setzero_si512());
}


/* Return a 512-bit float vector of all zeros. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

#define _mm512_setzero _mm512_setzero_ps

/* Return a 512-bit double vector of all zeros. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_setzero_pd(void)
{
  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

/* The _mm512_set1_* family fills every lane with the same scalar. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set1_ps(float __w)
{
  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                 __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set1_pd(double __w)
{
  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi8(char __w)
{
  return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi16(short __w)
{
  return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi32(int __s)
{
  return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };
}

/* Zero-masked set1: lanes not selected by __M are zeroed. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si)_mm512_set1_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi64(long long __d)
{
  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_set1_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

/* Replicate the low float element of __A into all 16 lanes. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcastss_ps(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* The _mm512_set4_* family repeats the 4-element pattern (D,C,B,A) across
   the vector; note arguments are stored in reverse order, the setr4 macros
   below provide the "natural" order. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
{
  return __extension__ (__m512i)(__v16si)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi64 (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m512i) (__v8di)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set4_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m512d)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set4_ps (float __A, float __B, float __C, float __D)
{
  return __extension__ (__m512)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3)                \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3)                \
  _mm512_set4_ps((e3),(e2),(e1),(e0))

/* Replicate the low double element of __A into all 8 lanes. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcastsd_pd(__m128d __A)
{
  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

/* Cast between vector types */

/* Widening casts: -1 shuffle indices mark "don't care" lanes, so the upper
   part of the result is undefined (use the _mm512_zext* intrinsics below
   for guaranteed zeroed upper bits). */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

/* Narrowing casts: keep the low lanes, drop the rest. */
static __inline __m128d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m256d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd256 (__m512d __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
}

static __inline __m128 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

static __inline __m256 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps256 (__m512 __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
}

/* Bit-pattern-preserving reinterpretations between 512-bit types. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castpd_ps (__m512d __A)
{
  return (__m512) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castpd_si512 (__m512d __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd128_pd512 (__m128d __A)
{
  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castps_pd (__m512 __A)
{
  return (__m512d) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castps_si512 (__m512 __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castps128_ps512 (__m128 __A)
{
  return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi128_si512 (__m128i __A)
{
  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi256_si512 (__m256i __A)
{
  return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castsi512_ps (__m512i __A)
{
  return (__m512) (__A);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castsi512_pd (__m512i __A)
{
  return (__m512d) (__A);
}

static __inline __m128i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si128 (__m512i __A)
{
  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
}

static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si256 (__m512i __A)
{
  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}

/* Trivial conversions between a 16-bit mask and a plain int. */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_int2mask(int __a)
{
  return (__mmask16)__a;
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask2int(__mmask16 __a)
{
  return (int)__a;
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    128-bit floating-point vector of [2 x double]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd128_pd512(__m128d __a)
{
  /* Indices 2,3 select zeros from _mm_setzero_pd() for the upper lanes. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    256-bit floating-point vector of [4 x double]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
///    the value of the source vector. The upper 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps128_ps512(__m128 __a)
{
  /* Indices 4..7 select zeros from _mm_setzero_ps() for the upper lanes. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
///    the value of the source vector. The upper 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps256_ps512(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit integer vector from a 128-bit integer vector.
///    The lower 128 bits contain the value of the source vector. The upper
///    384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
///    the parameter. The upper 384 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi128_si512(__m128i __a)
{
  /* Indices 2,3 select zeros from _mm_setzero_si128() for the upper lanes. */
  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit integer vector from a 256-bit integer vector.
///    The lower 256 bits contain the value of the source vector. The upper
///    256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
///    the parameter. The upper 256 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi256_si512(__m256i __a)
{
  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
}

/* Bitwise operators */
/* Each operation comes in three forms: plain (all lanes), mask (lanes with
   a 0 bit in the mask keep the value from the pass-through operand), and
   maskz (lanes with a 0 bit in the mask are zeroed). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a & (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                (__v16si) _mm512_and_epi32(__a, __b),
                (__v16si) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
              (__v8di) _mm512_and_epi64(__a, __b),
              (__v8di) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}

/* andnot computes (~A) & B, matching the vpandn* instruction semantics. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v16su)__A & (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_andnot_epi32(__A, __B),
                                         (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_andnot_epi64(__A, __B),
                                          (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a | (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si)_mm512_or_epi32(__a, __b),
                                             (__v16si)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_or_epi64(__a, __b),
                                             (__v8di)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a ^ (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si)_mm512_xor_epi32(__a, __b),
                                             (__v16si)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_xor_epi64(__a, __b),
                                             (__v8di)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
}

/* Whole-register (element-size-agnostic) bitwise operations. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}

/* Arithmetic */
/* Integer add/sub go through the unsigned lane types (__v16su/__v8du) so
   that overflow wraps instead of being undefined behavior. */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_add_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a + (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_add_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a + (__v16sf)__b);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mul_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a * (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mul_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a * (__v16sf)__b);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_sub_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a - (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_sub_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a - (__v16sf)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A + (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A - (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A + (__v16su) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A - (__v16su) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Maximum. The *_round_* forms are macros because the rounding argument
   must be a compile-time constant; R takes the _MM_FROUND_* macros. */
#define _mm512_max_round_pd(A, B, R) \
  (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
                                   (__v8df)(__m512d)(B), (int)(R))

#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                       (__v8df)(W))

#define _mm512_maskz_max_round_pd(U, A, B, R) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                       (__v8df)_mm512_setzero_pd())

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_max_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

#define _mm512_max_round_ps(A, B, R) \
  (__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
                                  (__v16sf)(__m512)(B), (int)(R))

#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                      (__v16sf)(W))

#define _mm512_maskz_max_round_ps(U, A, B, R) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                      (__v16sf)_mm512_setzero_ps())

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_max_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Scalar single-precision max on the low element; the mask/pass-through are
   handled directly by the builtin rather than a separate select. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_ss(A, B, R) \
  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)_mm_setzero_ps(), \
                                          (__mmask8)-1, (int)(R))

#define _mm_mask_max_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
                                          (int)(R))

#define _mm_maskz_max_round_ss(U, A, B, R) \
  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)_mm_setzero_ps(), \
                                          (__mmask8)(U), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                (__v2df) __B,
(__v2df) __W, 1054 (__mmask8) __U, 1055 _MM_FROUND_CUR_DIRECTION); 1056 } 1057 1058 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1059 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1060 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, 1061 (__v2df) __B, 1062 (__v2df) _mm_setzero_pd (), 1063 (__mmask8) __U, 1064 _MM_FROUND_CUR_DIRECTION); 1065 } 1066 1067 #define _mm_max_round_sd(A, B, R) \ 1068 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1069 (__v2df)(__m128d)(B), \ 1070 (__v2df)_mm_setzero_pd(), \ 1071 (__mmask8)-1, (int)(R)) 1072 1073 #define _mm_mask_max_round_sd(W, U, A, B, R) \ 1074 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1075 (__v2df)(__m128d)(B), \ 1076 (__v2df)(__m128d)(W), \ 1077 (__mmask8)(U), (int)(R)) 1078 1079 #define _mm_maskz_max_round_sd(U, A, B, R) \ 1080 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1081 (__v2df)(__m128d)(B), \ 1082 (__v2df)_mm_setzero_pd(), \ 1083 (__mmask8)(U), (int)(R)) 1084 1085 static __inline __m512i 1086 __DEFAULT_FN_ATTRS512 1087 _mm512_max_epi32(__m512i __A, __m512i __B) 1088 { 1089 return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B); 1090 } 1091 1092 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1093 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1094 { 1095 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1096 (__v16si)_mm512_max_epi32(__A, __B), 1097 (__v16si)__W); 1098 } 1099 1100 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1101 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1102 { 1103 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1104 (__v16si)_mm512_max_epi32(__A, __B), 1105 (__v16si)_mm512_setzero_si512()); 1106 } 1107 1108 static __inline __m512i __DEFAULT_FN_ATTRS512 1109 _mm512_max_epu32(__m512i __A, __m512i __B) 1110 { 1111 return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B); 1112 } 1113 1114 static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 1115 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1116 { 1117 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1118 (__v16si)_mm512_max_epu32(__A, __B), 1119 (__v16si)__W); 1120 } 1121 1122 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1123 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1124 { 1125 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1126 (__v16si)_mm512_max_epu32(__A, __B), 1127 (__v16si)_mm512_setzero_si512()); 1128 } 1129 1130 static __inline __m512i __DEFAULT_FN_ATTRS512 1131 _mm512_max_epi64(__m512i __A, __m512i __B) 1132 { 1133 return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B); 1134 } 1135 1136 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1137 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1138 { 1139 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1140 (__v8di)_mm512_max_epi64(__A, __B), 1141 (__v8di)__W); 1142 } 1143 1144 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1145 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1146 { 1147 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1148 (__v8di)_mm512_max_epi64(__A, __B), 1149 (__v8di)_mm512_setzero_si512()); 1150 } 1151 1152 static __inline __m512i __DEFAULT_FN_ATTRS512 1153 _mm512_max_epu64(__m512i __A, __m512i __B) 1154 { 1155 return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B); 1156 } 1157 1158 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1159 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1160 { 1161 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1162 (__v8di)_mm512_max_epu64(__A, __B), 1163 (__v8di)__W); 1164 } 1165 1166 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1167 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1168 { 1169 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1170 (__v8di)_mm512_max_epu64(__A, 
__B), 1171 (__v8di)_mm512_setzero_si512()); 1172 } 1173 1174 #define _mm512_min_round_pd(A, B, R) \ 1175 (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ 1176 (__v8df)(__m512d)(B), (int)(R)) 1177 1178 #define _mm512_mask_min_round_pd(W, U, A, B, R) \ 1179 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1180 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1181 (__v8df)(W)) 1182 1183 #define _mm512_maskz_min_round_pd(U, A, B, R) \ 1184 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1185 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1186 (__v8df)_mm512_setzero_pd()) 1187 1188 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1189 _mm512_min_pd(__m512d __A, __m512d __B) 1190 { 1191 return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, 1192 _MM_FROUND_CUR_DIRECTION); 1193 } 1194 1195 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1196 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 1197 { 1198 return (__m512d)__builtin_ia32_selectpd_512(__U, 1199 (__v8df)_mm512_min_pd(__A, __B), 1200 (__v8df)__W); 1201 } 1202 1203 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1204 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) 1205 { 1206 return (__m512d)__builtin_ia32_selectpd_512(__U, 1207 (__v8df)_mm512_min_pd(__A, __B), 1208 (__v8df)_mm512_setzero_pd()); 1209 } 1210 1211 #define _mm512_min_round_ps(A, B, R) \ 1212 (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ 1213 (__v16sf)(__m512)(B), (int)(R)) 1214 1215 #define _mm512_mask_min_round_ps(W, U, A, B, R) \ 1216 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1217 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1218 (__v16sf)(W)) 1219 1220 #define _mm512_maskz_min_round_ps(U, A, B, R) \ 1221 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1222 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1223 (__v16sf)_mm512_setzero_ps()) 1224 1225 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1226 _mm512_min_ps(__m512 __A, __m512 __B) 1227 { 1228 return 
(__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, 1229 _MM_FROUND_CUR_DIRECTION); 1230 } 1231 1232 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1233 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 1234 { 1235 return (__m512)__builtin_ia32_selectps_512(__U, 1236 (__v16sf)_mm512_min_ps(__A, __B), 1237 (__v16sf)__W); 1238 } 1239 1240 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1241 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) 1242 { 1243 return (__m512)__builtin_ia32_selectps_512(__U, 1244 (__v16sf)_mm512_min_ps(__A, __B), 1245 (__v16sf)_mm512_setzero_ps()); 1246 } 1247 1248 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1249 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1250 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1251 (__v4sf) __B, 1252 (__v4sf) __W, 1253 (__mmask8) __U, 1254 _MM_FROUND_CUR_DIRECTION); 1255 } 1256 1257 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1258 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1259 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1260 (__v4sf) __B, 1261 (__v4sf) _mm_setzero_ps (), 1262 (__mmask8) __U, 1263 _MM_FROUND_CUR_DIRECTION); 1264 } 1265 1266 #define _mm_min_round_ss(A, B, R) \ 1267 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1268 (__v4sf)(__m128)(B), \ 1269 (__v4sf)_mm_setzero_ps(), \ 1270 (__mmask8)-1, (int)(R)) 1271 1272 #define _mm_mask_min_round_ss(W, U, A, B, R) \ 1273 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1274 (__v4sf)(__m128)(B), \ 1275 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1276 (int)(R)) 1277 1278 #define _mm_maskz_min_round_ss(U, A, B, R) \ 1279 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1280 (__v4sf)(__m128)(B), \ 1281 (__v4sf)_mm_setzero_ps(), \ 1282 (__mmask8)(U), (int)(R)) 1283 1284 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1285 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1286 return 
(__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1287 (__v2df) __B, 1288 (__v2df) __W, 1289 (__mmask8) __U, 1290 _MM_FROUND_CUR_DIRECTION); 1291 } 1292 1293 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1294 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1295 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1296 (__v2df) __B, 1297 (__v2df) _mm_setzero_pd (), 1298 (__mmask8) __U, 1299 _MM_FROUND_CUR_DIRECTION); 1300 } 1301 1302 #define _mm_min_round_sd(A, B, R) \ 1303 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1304 (__v2df)(__m128d)(B), \ 1305 (__v2df)_mm_setzero_pd(), \ 1306 (__mmask8)-1, (int)(R)) 1307 1308 #define _mm_mask_min_round_sd(W, U, A, B, R) \ 1309 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1310 (__v2df)(__m128d)(B), \ 1311 (__v2df)(__m128d)(W), \ 1312 (__mmask8)(U), (int)(R)) 1313 1314 #define _mm_maskz_min_round_sd(U, A, B, R) \ 1315 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1316 (__v2df)(__m128d)(B), \ 1317 (__v2df)_mm_setzero_pd(), \ 1318 (__mmask8)(U), (int)(R)) 1319 1320 static __inline __m512i 1321 __DEFAULT_FN_ATTRS512 1322 _mm512_min_epi32(__m512i __A, __m512i __B) 1323 { 1324 return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B); 1325 } 1326 1327 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1328 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1329 { 1330 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1331 (__v16si)_mm512_min_epi32(__A, __B), 1332 (__v16si)__W); 1333 } 1334 1335 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1336 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1337 { 1338 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1339 (__v16si)_mm512_min_epi32(__A, __B), 1340 (__v16si)_mm512_setzero_si512()); 1341 } 1342 1343 static __inline __m512i __DEFAULT_FN_ATTRS512 1344 _mm512_min_epu32(__m512i __A, __m512i __B) 1345 { 1346 return 
(__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B); 1347 } 1348 1349 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1350 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1351 { 1352 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1353 (__v16si)_mm512_min_epu32(__A, __B), 1354 (__v16si)__W); 1355 } 1356 1357 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1358 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1359 { 1360 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1361 (__v16si)_mm512_min_epu32(__A, __B), 1362 (__v16si)_mm512_setzero_si512()); 1363 } 1364 1365 static __inline __m512i __DEFAULT_FN_ATTRS512 1366 _mm512_min_epi64(__m512i __A, __m512i __B) 1367 { 1368 return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B); 1369 } 1370 1371 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1372 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1373 { 1374 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1375 (__v8di)_mm512_min_epi64(__A, __B), 1376 (__v8di)__W); 1377 } 1378 1379 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1380 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1381 { 1382 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1383 (__v8di)_mm512_min_epi64(__A, __B), 1384 (__v8di)_mm512_setzero_si512()); 1385 } 1386 1387 static __inline __m512i __DEFAULT_FN_ATTRS512 1388 _mm512_min_epu64(__m512i __A, __m512i __B) 1389 { 1390 return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B); 1391 } 1392 1393 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1394 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1395 { 1396 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1397 (__v8di)_mm512_min_epu64(__A, __B), 1398 (__v8di)__W); 1399 } 1400 1401 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1402 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1403 { 1404 return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1405 (__v8di)_mm512_min_epu64(__A, __B), 1406 (__v8di)_mm512_setzero_si512()); 1407 } 1408 1409 static __inline __m512i __DEFAULT_FN_ATTRS512 1410 _mm512_mul_epi32(__m512i __X, __m512i __Y) 1411 { 1412 return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); 1413 } 1414 1415 static __inline __m512i __DEFAULT_FN_ATTRS512 1416 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1417 { 1418 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1419 (__v8di)_mm512_mul_epi32(__X, __Y), 1420 (__v8di)__W); 1421 } 1422 1423 static __inline __m512i __DEFAULT_FN_ATTRS512 1424 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) 1425 { 1426 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1427 (__v8di)_mm512_mul_epi32(__X, __Y), 1428 (__v8di)_mm512_setzero_si512 ()); 1429 } 1430 1431 static __inline __m512i __DEFAULT_FN_ATTRS512 1432 _mm512_mul_epu32(__m512i __X, __m512i __Y) 1433 { 1434 return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); 1435 } 1436 1437 static __inline __m512i __DEFAULT_FN_ATTRS512 1438 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1439 { 1440 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1441 (__v8di)_mm512_mul_epu32(__X, __Y), 1442 (__v8di)__W); 1443 } 1444 1445 static __inline __m512i __DEFAULT_FN_ATTRS512 1446 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) 1447 { 1448 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1449 (__v8di)_mm512_mul_epu32(__X, __Y), 1450 (__v8di)_mm512_setzero_si512 ()); 1451 } 1452 1453 static __inline __m512i __DEFAULT_FN_ATTRS512 1454 _mm512_mullo_epi32 (__m512i __A, __m512i __B) 1455 { 1456 return (__m512i) ((__v16su) __A * (__v16su) __B); 1457 } 1458 1459 static __inline __m512i __DEFAULT_FN_ATTRS512 1460 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) 1461 { 1462 return 
(__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1463 (__v16si)_mm512_mullo_epi32(__A, __B), 1464 (__v16si)_mm512_setzero_si512()); 1465 } 1466 1467 static __inline __m512i __DEFAULT_FN_ATTRS512 1468 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1469 { 1470 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1471 (__v16si)_mm512_mullo_epi32(__A, __B), 1472 (__v16si)__W); 1473 } 1474 1475 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1476 _mm512_mullox_epi64 (__m512i __A, __m512i __B) { 1477 return (__m512i) ((__v8du) __A * (__v8du) __B); 1478 } 1479 1480 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1481 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { 1482 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1483 (__v8di)_mm512_mullox_epi64(__A, __B), 1484 (__v8di)__W); 1485 } 1486 1487 #define _mm512_sqrt_round_pd(A, R) \ 1488 (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)) 1489 1490 #define _mm512_mask_sqrt_round_pd(W, U, A, R) \ 1491 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1492 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1493 (__v8df)(__m512d)(W)) 1494 1495 #define _mm512_maskz_sqrt_round_pd(U, A, R) \ 1496 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1497 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1498 (__v8df)_mm512_setzero_pd()) 1499 1500 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1501 _mm512_sqrt_pd(__m512d __A) 1502 { 1503 return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, 1504 _MM_FROUND_CUR_DIRECTION); 1505 } 1506 1507 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1508 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) 1509 { 1510 return (__m512d)__builtin_ia32_selectpd_512(__U, 1511 (__v8df)_mm512_sqrt_pd(__A), 1512 (__v8df)__W); 1513 } 1514 1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1516 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) 1517 { 1518 return (__m512d)__builtin_ia32_selectpd_512(__U, 1519 
(__v8df)_mm512_sqrt_pd(__A), 1520 (__v8df)_mm512_setzero_pd()); 1521 } 1522 1523 #define _mm512_sqrt_round_ps(A, R) \ 1524 (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)) 1525 1526 #define _mm512_mask_sqrt_round_ps(W, U, A, R) \ 1527 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1528 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1529 (__v16sf)(__m512)(W)) 1530 1531 #define _mm512_maskz_sqrt_round_ps(U, A, R) \ 1532 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1533 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1534 (__v16sf)_mm512_setzero_ps()) 1535 1536 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1537 _mm512_sqrt_ps(__m512 __A) 1538 { 1539 return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, 1540 _MM_FROUND_CUR_DIRECTION); 1541 } 1542 1543 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1544 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) 1545 { 1546 return (__m512)__builtin_ia32_selectps_512(__U, 1547 (__v16sf)_mm512_sqrt_ps(__A), 1548 (__v16sf)__W); 1549 } 1550 1551 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1552 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) 1553 { 1554 return (__m512)__builtin_ia32_selectps_512(__U, 1555 (__v16sf)_mm512_sqrt_ps(__A), 1556 (__v16sf)_mm512_setzero_ps()); 1557 } 1558 1559 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1560 _mm512_rsqrt14_pd(__m512d __A) 1561 { 1562 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1563 (__v8df) 1564 _mm512_setzero_pd (), 1565 (__mmask8) -1);} 1566 1567 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1568 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1569 { 1570 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1571 (__v8df) __W, 1572 (__mmask8) __U); 1573 } 1574 1575 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1576 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) 1577 { 1578 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1579 (__v8df) 1580 _mm512_setzero_pd (), 1581 (__mmask8) 
__U); 1582 } 1583 1584 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1585 _mm512_rsqrt14_ps(__m512 __A) 1586 { 1587 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1588 (__v16sf) 1589 _mm512_setzero_ps (), 1590 (__mmask16) -1); 1591 } 1592 1593 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1594 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1595 { 1596 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1597 (__v16sf) __W, 1598 (__mmask16) __U); 1599 } 1600 1601 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1602 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) 1603 { 1604 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1605 (__v16sf) 1606 _mm512_setzero_ps (), 1607 (__mmask16) __U); 1608 } 1609 1610 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1611 _mm_rsqrt14_ss(__m128 __A, __m128 __B) 1612 { 1613 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1614 (__v4sf) __B, 1615 (__v4sf) 1616 _mm_setzero_ps (), 1617 (__mmask8) -1); 1618 } 1619 1620 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1621 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1622 { 1623 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1624 (__v4sf) __B, 1625 (__v4sf) __W, 1626 (__mmask8) __U); 1627 } 1628 1629 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1630 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1631 { 1632 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1633 (__v4sf) __B, 1634 (__v4sf) _mm_setzero_ps (), 1635 (__mmask8) __U); 1636 } 1637 1638 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1639 _mm_rsqrt14_sd(__m128d __A, __m128d __B) 1640 { 1641 return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, 1642 (__v2df) __B, 1643 (__v2df) 1644 _mm_setzero_pd (), 1645 (__mmask8) -1); 1646 } 1647 1648 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1649 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1650 { 1651 return 
(__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1652 (__v2df) __B, 1653 (__v2df) __W, 1654 (__mmask8) __U); 1655 } 1656 1657 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1658 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1659 { 1660 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1661 (__v2df) __B, 1662 (__v2df) _mm_setzero_pd (), 1663 (__mmask8) __U); 1664 } 1665 1666 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1667 _mm512_rcp14_pd(__m512d __A) 1668 { 1669 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1670 (__v8df) 1671 _mm512_setzero_pd (), 1672 (__mmask8) -1); 1673 } 1674 1675 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1676 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1677 { 1678 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1679 (__v8df) __W, 1680 (__mmask8) __U); 1681 } 1682 1683 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1684 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) 1685 { 1686 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1687 (__v8df) 1688 _mm512_setzero_pd (), 1689 (__mmask8) __U); 1690 } 1691 1692 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1693 _mm512_rcp14_ps(__m512 __A) 1694 { 1695 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1696 (__v16sf) 1697 _mm512_setzero_ps (), 1698 (__mmask16) -1); 1699 } 1700 1701 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1702 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1703 { 1704 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1705 (__v16sf) __W, 1706 (__mmask16) __U); 1707 } 1708 1709 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1710 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) 1711 { 1712 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1713 (__v16sf) 1714 _mm512_setzero_ps (), 1715 (__mmask16) __U); 1716 } 1717 1718 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1719 _mm_rcp14_ss(__m128 __A, __m128 __B) 1720 { 1721 
return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1722 (__v4sf) __B, 1723 (__v4sf) 1724 _mm_setzero_ps (), 1725 (__mmask8) -1); 1726 } 1727 1728 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1729 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1730 { 1731 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1732 (__v4sf) __B, 1733 (__v4sf) __W, 1734 (__mmask8) __U); 1735 } 1736 1737 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1738 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1739 { 1740 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1741 (__v4sf) __B, 1742 (__v4sf) _mm_setzero_ps (), 1743 (__mmask8) __U); 1744 } 1745 1746 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1747 _mm_rcp14_sd(__m128d __A, __m128d __B) 1748 { 1749 return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, 1750 (__v2df) __B, 1751 (__v2df) 1752 _mm_setzero_pd (), 1753 (__mmask8) -1); 1754 } 1755 1756 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1757 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1758 { 1759 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1760 (__v2df) __B, 1761 (__v2df) __W, 1762 (__mmask8) __U); 1763 } 1764 1765 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1766 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1767 { 1768 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1769 (__v2df) __B, 1770 (__v2df) _mm_setzero_pd (), 1771 (__mmask8) __U); 1772 } 1773 1774 static __inline __m512 __DEFAULT_FN_ATTRS512 1775 _mm512_floor_ps(__m512 __A) 1776 { 1777 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1778 _MM_FROUND_FLOOR, 1779 (__v16sf) __A, -1, 1780 _MM_FROUND_CUR_DIRECTION); 1781 } 1782 1783 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1784 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) 1785 { 1786 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1787 _MM_FROUND_FLOOR, 1788 (__v16sf) __W, __U, 1789 
_MM_FROUND_CUR_DIRECTION); 1790 } 1791 1792 static __inline __m512d __DEFAULT_FN_ATTRS512 1793 _mm512_floor_pd(__m512d __A) 1794 { 1795 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1796 _MM_FROUND_FLOOR, 1797 (__v8df) __A, -1, 1798 _MM_FROUND_CUR_DIRECTION); 1799 } 1800 1801 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1802 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) 1803 { 1804 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1805 _MM_FROUND_FLOOR, 1806 (__v8df) __W, __U, 1807 _MM_FROUND_CUR_DIRECTION); 1808 } 1809 1810 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1811 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) 1812 { 1813 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1814 _MM_FROUND_CEIL, 1815 (__v16sf) __W, __U, 1816 _MM_FROUND_CUR_DIRECTION); 1817 } 1818 1819 static __inline __m512 __DEFAULT_FN_ATTRS512 1820 _mm512_ceil_ps(__m512 __A) 1821 { 1822 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1823 _MM_FROUND_CEIL, 1824 (__v16sf) __A, -1, 1825 _MM_FROUND_CUR_DIRECTION); 1826 } 1827 1828 static __inline __m512d __DEFAULT_FN_ATTRS512 1829 _mm512_ceil_pd(__m512d __A) 1830 { 1831 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1832 _MM_FROUND_CEIL, 1833 (__v8df) __A, -1, 1834 _MM_FROUND_CUR_DIRECTION); 1835 } 1836 1837 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1838 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) 1839 { 1840 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1841 _MM_FROUND_CEIL, 1842 (__v8df) __W, __U, 1843 _MM_FROUND_CUR_DIRECTION); 1844 } 1845 1846 static __inline __m512i __DEFAULT_FN_ATTRS512 1847 _mm512_abs_epi64(__m512i __A) 1848 { 1849 return (__m512i)__builtin_ia32_pabsq512((__v8di)__A); 1850 } 1851 1852 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1853 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 1854 { 1855 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1856 
(__v8di)_mm512_abs_epi64(__A), 1857 (__v8di)__W); 1858 } 1859 1860 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1861 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) 1862 { 1863 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1864 (__v8di)_mm512_abs_epi64(__A), 1865 (__v8di)_mm512_setzero_si512()); 1866 } 1867 1868 static __inline __m512i __DEFAULT_FN_ATTRS512 1869 _mm512_abs_epi32(__m512i __A) 1870 { 1871 return (__m512i)__builtin_ia32_pabsd512((__v16si) __A); 1872 } 1873 1874 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1875 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 1876 { 1877 return (__m512i)__builtin_ia32_selectd_512(__U, 1878 (__v16si)_mm512_abs_epi32(__A), 1879 (__v16si)__W); 1880 } 1881 1882 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1883 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) 1884 { 1885 return (__m512i)__builtin_ia32_selectd_512(__U, 1886 (__v16si)_mm512_abs_epi32(__A), 1887 (__v16si)_mm512_setzero_si512()); 1888 } 1889 1890 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1891 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1892 __A = _mm_add_ss(__A, __B); 1893 return __builtin_ia32_selectss_128(__U, __A, __W); 1894 } 1895 1896 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1897 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1898 __A = _mm_add_ss(__A, __B); 1899 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 1900 } 1901 1902 #define _mm_add_round_ss(A, B, R) \ 1903 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1904 (__v4sf)(__m128)(B), \ 1905 (__v4sf)_mm_setzero_ps(), \ 1906 (__mmask8)-1, (int)(R)) 1907 1908 #define _mm_mask_add_round_ss(W, U, A, B, R) \ 1909 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1910 (__v4sf)(__m128)(B), \ 1911 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1912 (int)(R)) 1913 1914 #define _mm_maskz_add_round_ss(U, A, B, R) \ 1915 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1916 
(__v4sf)(__m128)(B), \ 1917 (__v4sf)_mm_setzero_ps(), \ 1918 (__mmask8)(U), (int)(R)) 1919 1920 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1921 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1922 __A = _mm_add_sd(__A, __B); 1923 return __builtin_ia32_selectsd_128(__U, __A, __W); 1924 } 1925 1926 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1927 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1928 __A = _mm_add_sd(__A, __B); 1929 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 1930 } 1931 #define _mm_add_round_sd(A, B, R) \ 1932 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1933 (__v2df)(__m128d)(B), \ 1934 (__v2df)_mm_setzero_pd(), \ 1935 (__mmask8)-1, (int)(R)) 1936 1937 #define _mm_mask_add_round_sd(W, U, A, B, R) \ 1938 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1939 (__v2df)(__m128d)(B), \ 1940 (__v2df)(__m128d)(W), \ 1941 (__mmask8)(U), (int)(R)) 1942 1943 #define _mm_maskz_add_round_sd(U, A, B, R) \ 1944 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1945 (__v2df)(__m128d)(B), \ 1946 (__v2df)_mm_setzero_pd(), \ 1947 (__mmask8)(U), (int)(R)) 1948 1949 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1950 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 1951 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1952 (__v8df)_mm512_add_pd(__A, __B), 1953 (__v8df)__W); 1954 } 1955 1956 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1957 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { 1958 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1959 (__v8df)_mm512_add_pd(__A, __B), 1960 (__v8df)_mm512_setzero_pd()); 1961 } 1962 1963 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1964 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 1965 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1966 (__v16sf)_mm512_add_ps(__A, __B), 1967 (__v16sf)__W); 1968 } 1969 1970 static 
__inline__ __m512 __DEFAULT_FN_ATTRS512 1971 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { 1972 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1973 (__v16sf)_mm512_add_ps(__A, __B), 1974 (__v16sf)_mm512_setzero_ps()); 1975 } 1976 1977 #define _mm512_add_round_pd(A, B, R) \ 1978 (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ 1979 (__v8df)(__m512d)(B), (int)(R)) 1980 1981 #define _mm512_mask_add_round_pd(W, U, A, B, R) \ 1982 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1983 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1984 (__v8df)(__m512d)(W)) 1985 1986 #define _mm512_maskz_add_round_pd(U, A, B, R) \ 1987 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1988 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1989 (__v8df)_mm512_setzero_pd()) 1990 1991 #define _mm512_add_round_ps(A, B, R) \ 1992 (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ 1993 (__v16sf)(__m512)(B), (int)(R)) 1994 1995 #define _mm512_mask_add_round_ps(W, U, A, B, R) \ 1996 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1997 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 1998 (__v16sf)(__m512)(W)) 1999 2000 #define _mm512_maskz_add_round_ps(U, A, B, R) \ 2001 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2002 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2003 (__v16sf)_mm512_setzero_ps()) 2004 2005 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2006 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2007 __A = _mm_sub_ss(__A, __B); 2008 return __builtin_ia32_selectss_128(__U, __A, __W); 2009 } 2010 2011 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2012 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2013 __A = _mm_sub_ss(__A, __B); 2014 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2015 } 2016 #define _mm_sub_round_ss(A, B, R) \ 2017 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2018 (__v4sf)(__m128)(B), \ 2019 (__v4sf)_mm_setzero_ps(), \ 2020 (__mmask8)-1, (int)(R)) 
2021 2022 #define _mm_mask_sub_round_ss(W, U, A, B, R) \ 2023 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2024 (__v4sf)(__m128)(B), \ 2025 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2026 (int)(R)) 2027 2028 #define _mm_maskz_sub_round_ss(U, A, B, R) \ 2029 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2030 (__v4sf)(__m128)(B), \ 2031 (__v4sf)_mm_setzero_ps(), \ 2032 (__mmask8)(U), (int)(R)) 2033 2034 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2035 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2036 __A = _mm_sub_sd(__A, __B); 2037 return __builtin_ia32_selectsd_128(__U, __A, __W); 2038 } 2039 2040 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2041 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2042 __A = _mm_sub_sd(__A, __B); 2043 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2044 } 2045 2046 #define _mm_sub_round_sd(A, B, R) \ 2047 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2048 (__v2df)(__m128d)(B), \ 2049 (__v2df)_mm_setzero_pd(), \ 2050 (__mmask8)-1, (int)(R)) 2051 2052 #define _mm_mask_sub_round_sd(W, U, A, B, R) \ 2053 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2054 (__v2df)(__m128d)(B), \ 2055 (__v2df)(__m128d)(W), \ 2056 (__mmask8)(U), (int)(R)) 2057 2058 #define _mm_maskz_sub_round_sd(U, A, B, R) \ 2059 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2060 (__v2df)(__m128d)(B), \ 2061 (__v2df)_mm_setzero_pd(), \ 2062 (__mmask8)(U), (int)(R)) 2063 2064 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2065 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2066 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2067 (__v8df)_mm512_sub_pd(__A, __B), 2068 (__v8df)__W); 2069 } 2070 2071 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2072 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2073 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2074 
(__v8df)_mm512_sub_pd(__A, __B), 2075 (__v8df)_mm512_setzero_pd()); 2076 } 2077 2078 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2079 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2080 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2081 (__v16sf)_mm512_sub_ps(__A, __B), 2082 (__v16sf)__W); 2083 } 2084 2085 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2086 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2087 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2088 (__v16sf)_mm512_sub_ps(__A, __B), 2089 (__v16sf)_mm512_setzero_ps()); 2090 } 2091 2092 #define _mm512_sub_round_pd(A, B, R) \ 2093 (__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ 2094 (__v8df)(__m512d)(B), (int)(R)) 2095 2096 #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ 2097 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2098 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2099 (__v8df)(__m512d)(W)) 2100 2101 #define _mm512_maskz_sub_round_pd(U, A, B, R) \ 2102 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2103 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2104 (__v8df)_mm512_setzero_pd()) 2105 2106 #define _mm512_sub_round_ps(A, B, R) \ 2107 (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ 2108 (__v16sf)(__m512)(B), (int)(R)) 2109 2110 #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ 2111 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2112 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2113 (__v16sf)(__m512)(W)) 2114 2115 #define _mm512_maskz_sub_round_ps(U, A, B, R) \ 2116 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2117 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2118 (__v16sf)_mm512_setzero_ps()) 2119 2120 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2121 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2122 __A = _mm_mul_ss(__A, __B); 2123 return __builtin_ia32_selectss_128(__U, __A, __W); 2124 } 2125 2126 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2127 
_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2128 __A = _mm_mul_ss(__A, __B); 2129 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2130 } 2131 #define _mm_mul_round_ss(A, B, R) \ 2132 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2133 (__v4sf)(__m128)(B), \ 2134 (__v4sf)_mm_setzero_ps(), \ 2135 (__mmask8)-1, (int)(R)) 2136 2137 #define _mm_mask_mul_round_ss(W, U, A, B, R) \ 2138 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2139 (__v4sf)(__m128)(B), \ 2140 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2141 (int)(R)) 2142 2143 #define _mm_maskz_mul_round_ss(U, A, B, R) \ 2144 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2145 (__v4sf)(__m128)(B), \ 2146 (__v4sf)_mm_setzero_ps(), \ 2147 (__mmask8)(U), (int)(R)) 2148 2149 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2150 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2151 __A = _mm_mul_sd(__A, __B); 2152 return __builtin_ia32_selectsd_128(__U, __A, __W); 2153 } 2154 2155 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2156 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2157 __A = _mm_mul_sd(__A, __B); 2158 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2159 } 2160 2161 #define _mm_mul_round_sd(A, B, R) \ 2162 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2163 (__v2df)(__m128d)(B), \ 2164 (__v2df)_mm_setzero_pd(), \ 2165 (__mmask8)-1, (int)(R)) 2166 2167 #define _mm_mask_mul_round_sd(W, U, A, B, R) \ 2168 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2169 (__v2df)(__m128d)(B), \ 2170 (__v2df)(__m128d)(W), \ 2171 (__mmask8)(U), (int)(R)) 2172 2173 #define _mm_maskz_mul_round_sd(U, A, B, R) \ 2174 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2175 (__v2df)(__m128d)(B), \ 2176 (__v2df)_mm_setzero_pd(), \ 2177 (__mmask8)(U), (int)(R)) 2178 2179 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2180 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, 
__m512d __A, __m512d __B) { 2181 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2182 (__v8df)_mm512_mul_pd(__A, __B), 2183 (__v8df)__W); 2184 } 2185 2186 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2187 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2188 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2189 (__v8df)_mm512_mul_pd(__A, __B), 2190 (__v8df)_mm512_setzero_pd()); 2191 } 2192 2193 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2194 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2195 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2196 (__v16sf)_mm512_mul_ps(__A, __B), 2197 (__v16sf)__W); 2198 } 2199 2200 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2201 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2202 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2203 (__v16sf)_mm512_mul_ps(__A, __B), 2204 (__v16sf)_mm512_setzero_ps()); 2205 } 2206 2207 #define _mm512_mul_round_pd(A, B, R) \ 2208 (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ 2209 (__v8df)(__m512d)(B), (int)(R)) 2210 2211 #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ 2212 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2213 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2214 (__v8df)(__m512d)(W)) 2215 2216 #define _mm512_maskz_mul_round_pd(U, A, B, R) \ 2217 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2218 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2219 (__v8df)_mm512_setzero_pd()) 2220 2221 #define _mm512_mul_round_ps(A, B, R) \ 2222 (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ 2223 (__v16sf)(__m512)(B), (int)(R)) 2224 2225 #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ 2226 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2227 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2228 (__v16sf)(__m512)(W)) 2229 2230 #define _mm512_maskz_mul_round_ps(U, A, B, R) \ 2231 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2232 
(__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2233 (__v16sf)_mm512_setzero_ps()) 2234 2235 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2236 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2237 __A = _mm_div_ss(__A, __B); 2238 return __builtin_ia32_selectss_128(__U, __A, __W); 2239 } 2240 2241 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2242 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2243 __A = _mm_div_ss(__A, __B); 2244 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2245 } 2246 2247 #define _mm_div_round_ss(A, B, R) \ 2248 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2249 (__v4sf)(__m128)(B), \ 2250 (__v4sf)_mm_setzero_ps(), \ 2251 (__mmask8)-1, (int)(R)) 2252 2253 #define _mm_mask_div_round_ss(W, U, A, B, R) \ 2254 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2255 (__v4sf)(__m128)(B), \ 2256 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2257 (int)(R)) 2258 2259 #define _mm_maskz_div_round_ss(U, A, B, R) \ 2260 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2261 (__v4sf)(__m128)(B), \ 2262 (__v4sf)_mm_setzero_ps(), \ 2263 (__mmask8)(U), (int)(R)) 2264 2265 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2266 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2267 __A = _mm_div_sd(__A, __B); 2268 return __builtin_ia32_selectsd_128(__U, __A, __W); 2269 } 2270 2271 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2272 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2273 __A = _mm_div_sd(__A, __B); 2274 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2275 } 2276 2277 #define _mm_div_round_sd(A, B, R) \ 2278 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2279 (__v2df)(__m128d)(B), \ 2280 (__v2df)_mm_setzero_pd(), \ 2281 (__mmask8)-1, (int)(R)) 2282 2283 #define _mm_mask_div_round_sd(W, U, A, B, R) \ 2284 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2285 (__v2df)(__m128d)(B), \ 2286 
(__v2df)(__m128d)(W), \ 2287 (__mmask8)(U), (int)(R)) 2288 2289 #define _mm_maskz_div_round_sd(U, A, B, R) \ 2290 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2291 (__v2df)(__m128d)(B), \ 2292 (__v2df)_mm_setzero_pd(), \ 2293 (__mmask8)(U), (int)(R)) 2294 2295 static __inline __m512d __DEFAULT_FN_ATTRS512 2296 _mm512_div_pd(__m512d __a, __m512d __b) 2297 { 2298 return (__m512d)((__v8df)__a/(__v8df)__b); 2299 } 2300 2301 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2302 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2303 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2304 (__v8df)_mm512_div_pd(__A, __B), 2305 (__v8df)__W); 2306 } 2307 2308 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2309 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2310 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2311 (__v8df)_mm512_div_pd(__A, __B), 2312 (__v8df)_mm512_setzero_pd()); 2313 } 2314 2315 static __inline __m512 __DEFAULT_FN_ATTRS512 2316 _mm512_div_ps(__m512 __a, __m512 __b) 2317 { 2318 return (__m512)((__v16sf)__a/(__v16sf)__b); 2319 } 2320 2321 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2322 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2323 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2324 (__v16sf)_mm512_div_ps(__A, __B), 2325 (__v16sf)__W); 2326 } 2327 2328 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2329 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2330 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2331 (__v16sf)_mm512_div_ps(__A, __B), 2332 (__v16sf)_mm512_setzero_ps()); 2333 } 2334 2335 #define _mm512_div_round_pd(A, B, R) \ 2336 (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ 2337 (__v8df)(__m512d)(B), (int)(R)) 2338 2339 #define _mm512_mask_div_round_pd(W, U, A, B, R) \ 2340 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2341 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2342 
(__v8df)(__m512d)(W)) 2343 2344 #define _mm512_maskz_div_round_pd(U, A, B, R) \ 2345 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2346 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2347 (__v8df)_mm512_setzero_pd()) 2348 2349 #define _mm512_div_round_ps(A, B, R) \ 2350 (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ 2351 (__v16sf)(__m512)(B), (int)(R)) 2352 2353 #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 2354 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2355 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2356 (__v16sf)(__m512)(W)) 2357 2358 #define _mm512_maskz_div_round_ps(U, A, B, R) \ 2359 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2360 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2361 (__v16sf)_mm512_setzero_ps()) 2362 2363 #define _mm512_roundscale_ps(A, B) \ 2364 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ 2365 (__v16sf)_mm512_undefined_ps(), \ 2366 (__mmask16)-1, \ 2367 _MM_FROUND_CUR_DIRECTION) 2368 2369 #define _mm512_mask_roundscale_ps(A, B, C, imm) \ 2370 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2371 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2372 _MM_FROUND_CUR_DIRECTION) 2373 2374 #define _mm512_maskz_roundscale_ps(A, B, imm) \ 2375 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2376 (__v16sf)_mm512_setzero_ps(), \ 2377 (__mmask16)(A), \ 2378 _MM_FROUND_CUR_DIRECTION) 2379 2380 #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ 2381 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2382 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2383 (int)(R)) 2384 2385 #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ 2386 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2387 (__v16sf)_mm512_setzero_ps(), \ 2388 (__mmask16)(A), (int)(R)) 2389 2390 #define _mm512_roundscale_round_ps(A, imm, R) \ 2391 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ 
2392 (__v16sf)_mm512_undefined_ps(), \ 2393 (__mmask16)-1, (int)(R)) 2394 2395 #define _mm512_roundscale_pd(A, B) \ 2396 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ 2397 (__v8df)_mm512_undefined_pd(), \ 2398 (__mmask8)-1, \ 2399 _MM_FROUND_CUR_DIRECTION) 2400 2401 #define _mm512_mask_roundscale_pd(A, B, C, imm) \ 2402 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2403 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2404 _MM_FROUND_CUR_DIRECTION) 2405 2406 #define _mm512_maskz_roundscale_pd(A, B, imm) \ 2407 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2408 (__v8df)_mm512_setzero_pd(), \ 2409 (__mmask8)(A), \ 2410 _MM_FROUND_CUR_DIRECTION) 2411 2412 #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ 2413 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2414 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2415 (int)(R)) 2416 2417 #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ 2418 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2419 (__v8df)_mm512_setzero_pd(), \ 2420 (__mmask8)(A), (int)(R)) 2421 2422 #define _mm512_roundscale_round_pd(A, imm, R) \ 2423 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ 2424 (__v8df)_mm512_undefined_pd(), \ 2425 (__mmask8)-1, (int)(R)) 2426 2427 #define _mm512_fmadd_round_pd(A, B, C, R) \ 2428 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2429 (__v8df)(__m512d)(B), \ 2430 (__v8df)(__m512d)(C), \ 2431 (__mmask8)-1, (int)(R)) 2432 2433 2434 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ 2435 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2436 (__v8df)(__m512d)(B), \ 2437 (__v8df)(__m512d)(C), \ 2438 (__mmask8)(U), (int)(R)) 2439 2440 2441 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ 2442 (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ 2443 (__v8df)(__m512d)(B), \ 2444 (__v8df)(__m512d)(C), \ 2445 
(__mmask8)(U), (int)(R)) 2446 2447 2448 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ 2449 (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2450 (__v8df)(__m512d)(B), \ 2451 (__v8df)(__m512d)(C), \ 2452 (__mmask8)(U), (int)(R)) 2453 2454 2455 #define _mm512_fmsub_round_pd(A, B, C, R) \ 2456 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2457 (__v8df)(__m512d)(B), \ 2458 -(__v8df)(__m512d)(C), \ 2459 (__mmask8)-1, (int)(R)) 2460 2461 2462 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ 2463 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2464 (__v8df)(__m512d)(B), \ 2465 -(__v8df)(__m512d)(C), \ 2466 (__mmask8)(U), (int)(R)) 2467 2468 2469 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ 2470 (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2471 (__v8df)(__m512d)(B), \ 2472 -(__v8df)(__m512d)(C), \ 2473 (__mmask8)(U), (int)(R)) 2474 2475 2476 #define _mm512_fnmadd_round_pd(A, B, C, R) \ 2477 (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2478 (__v8df)(__m512d)(B), \ 2479 (__v8df)(__m512d)(C), \ 2480 (__mmask8)-1, (int)(R)) 2481 2482 2483 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ 2484 (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ 2485 (__v8df)(__m512d)(B), \ 2486 (__v8df)(__m512d)(C), \ 2487 (__mmask8)(U), (int)(R)) 2488 2489 2490 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ 2491 (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2492 (__v8df)(__m512d)(B), \ 2493 (__v8df)(__m512d)(C), \ 2494 (__mmask8)(U), (int)(R)) 2495 2496 2497 #define _mm512_fnmsub_round_pd(A, B, C, R) \ 2498 (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2499 (__v8df)(__m512d)(B), \ 2500 -(__v8df)(__m512d)(C), \ 2501 (__mmask8)-1, (int)(R)) 2502 2503 2504 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ 2505 (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2506 (__v8df)(__m512d)(B), \ 2507 
-(__v8df)(__m512d)(C), \ 2508 (__mmask8)(U), (int)(R)) 2509 2510 2511 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2512 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2513 { 2514 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2515 (__v8df) __B, 2516 (__v8df) __C, 2517 (__mmask8) -1, 2518 _MM_FROUND_CUR_DIRECTION); 2519 } 2520 2521 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2522 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2523 { 2524 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2525 (__v8df) __B, 2526 (__v8df) __C, 2527 (__mmask8) __U, 2528 _MM_FROUND_CUR_DIRECTION); 2529 } 2530 2531 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2532 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2533 { 2534 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, 2535 (__v8df) __B, 2536 (__v8df) __C, 2537 (__mmask8) __U, 2538 _MM_FROUND_CUR_DIRECTION); 2539 } 2540 2541 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2542 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2543 { 2544 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2545 (__v8df) __B, 2546 (__v8df) __C, 2547 (__mmask8) __U, 2548 _MM_FROUND_CUR_DIRECTION); 2549 } 2550 2551 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2552 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2553 { 2554 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2555 (__v8df) __B, 2556 -(__v8df) __C, 2557 (__mmask8) -1, 2558 _MM_FROUND_CUR_DIRECTION); 2559 } 2560 2561 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2562 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2563 { 2564 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2565 (__v8df) __B, 2566 -(__v8df) __C, 2567 (__mmask8) __U, 2568 _MM_FROUND_CUR_DIRECTION); 2569 } 2570 2571 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2572 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d 
__A, __m512d __B, __m512d __C) 2573 { 2574 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2575 (__v8df) __B, 2576 -(__v8df) __C, 2577 (__mmask8) __U, 2578 _MM_FROUND_CUR_DIRECTION); 2579 } 2580 2581 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2582 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2583 { 2584 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2585 -(__v8df) __B, 2586 (__v8df) __C, 2587 (__mmask8) -1, 2588 _MM_FROUND_CUR_DIRECTION); 2589 } 2590 2591 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2592 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2593 { 2594 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, 2595 (__v8df) __B, 2596 (__v8df) __C, 2597 (__mmask8) __U, 2598 _MM_FROUND_CUR_DIRECTION); 2599 } 2600 2601 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2602 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2603 { 2604 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2605 (__v8df) __B, 2606 (__v8df) __C, 2607 (__mmask8) __U, 2608 _MM_FROUND_CUR_DIRECTION); 2609 } 2610 2611 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2612 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2613 { 2614 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2615 -(__v8df) __B, 2616 -(__v8df) __C, 2617 (__mmask8) -1, 2618 _MM_FROUND_CUR_DIRECTION); 2619 } 2620 2621 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2622 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2623 { 2624 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2625 (__v8df) __B, 2626 -(__v8df) __C, 2627 (__mmask8) __U, 2628 _MM_FROUND_CUR_DIRECTION); 2629 } 2630 2631 #define _mm512_fmadd_round_ps(A, B, C, R) \ 2632 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2633 (__v16sf)(__m512)(B), \ 2634 (__v16sf)(__m512)(C), \ 2635 (__mmask16)-1, (int)(R)) 2636 2637 2638 #define 
_mm512_mask_fmadd_round_ps(A, U, B, C, R) \ 2639 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2640 (__v16sf)(__m512)(B), \ 2641 (__v16sf)(__m512)(C), \ 2642 (__mmask16)(U), (int)(R)) 2643 2644 2645 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ 2646 (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ 2647 (__v16sf)(__m512)(B), \ 2648 (__v16sf)(__m512)(C), \ 2649 (__mmask16)(U), (int)(R)) 2650 2651 2652 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ 2653 (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2654 (__v16sf)(__m512)(B), \ 2655 (__v16sf)(__m512)(C), \ 2656 (__mmask16)(U), (int)(R)) 2657 2658 2659 #define _mm512_fmsub_round_ps(A, B, C, R) \ 2660 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2661 (__v16sf)(__m512)(B), \ 2662 -(__v16sf)(__m512)(C), \ 2663 (__mmask16)-1, (int)(R)) 2664 2665 2666 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ 2667 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2668 (__v16sf)(__m512)(B), \ 2669 -(__v16sf)(__m512)(C), \ 2670 (__mmask16)(U), (int)(R)) 2671 2672 2673 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ 2674 (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2675 (__v16sf)(__m512)(B), \ 2676 -(__v16sf)(__m512)(C), \ 2677 (__mmask16)(U), (int)(R)) 2678 2679 2680 #define _mm512_fnmadd_round_ps(A, B, C, R) \ 2681 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2682 -(__v16sf)(__m512)(B), \ 2683 (__v16sf)(__m512)(C), \ 2684 (__mmask16)-1, (int)(R)) 2685 2686 2687 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ 2688 (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ 2689 (__v16sf)(__m512)(B), \ 2690 (__v16sf)(__m512)(C), \ 2691 (__mmask16)(U), (int)(R)) 2692 2693 2694 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ 2695 (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2696 (__v16sf)(__m512)(B), \ 2697 (__v16sf)(__m512)(C), \ 2698 (__mmask16)(U), 
(int)(R)) 2699 2700 2701 #define _mm512_fnmsub_round_ps(A, B, C, R) \ 2702 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2703 -(__v16sf)(__m512)(B), \ 2704 -(__v16sf)(__m512)(C), \ 2705 (__mmask16)-1, (int)(R)) 2706 2707 2708 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ 2709 (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2710 (__v16sf)(__m512)(B), \ 2711 -(__v16sf)(__m512)(C), \ 2712 (__mmask16)(U), (int)(R)) 2713 2714 2715 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2716 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2717 { 2718 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2719 (__v16sf) __B, 2720 (__v16sf) __C, 2721 (__mmask16) -1, 2722 _MM_FROUND_CUR_DIRECTION); 2723 } 2724 2725 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2726 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2727 { 2728 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2729 (__v16sf) __B, 2730 (__v16sf) __C, 2731 (__mmask16) __U, 2732 _MM_FROUND_CUR_DIRECTION); 2733 } 2734 2735 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2736 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2737 { 2738 return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, 2739 (__v16sf) __B, 2740 (__v16sf) __C, 2741 (__mmask16) __U, 2742 _MM_FROUND_CUR_DIRECTION); 2743 } 2744 2745 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2746 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2747 { 2748 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2749 (__v16sf) __B, 2750 (__v16sf) __C, 2751 (__mmask16) __U, 2752 _MM_FROUND_CUR_DIRECTION); 2753 } 2754 2755 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2756 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2757 { 2758 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2759 (__v16sf) __B, 2760 -(__v16sf) __C, 2761 (__mmask16) -1, 2762 _MM_FROUND_CUR_DIRECTION); 2763 } 2764 2765 static 
__inline__ __m512 __DEFAULT_FN_ATTRS512 2766 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2767 { 2768 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2769 (__v16sf) __B, 2770 -(__v16sf) __C, 2771 (__mmask16) __U, 2772 _MM_FROUND_CUR_DIRECTION); 2773 } 2774 2775 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2776 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2777 { 2778 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2779 (__v16sf) __B, 2780 -(__v16sf) __C, 2781 (__mmask16) __U, 2782 _MM_FROUND_CUR_DIRECTION); 2783 } 2784 2785 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2786 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2787 { 2788 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2789 -(__v16sf) __B, 2790 (__v16sf) __C, 2791 (__mmask16) -1, 2792 _MM_FROUND_CUR_DIRECTION); 2793 } 2794 2795 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2796 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2797 { 2798 return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, 2799 (__v16sf) __B, 2800 (__v16sf) __C, 2801 (__mmask16) __U, 2802 _MM_FROUND_CUR_DIRECTION); 2803 } 2804 2805 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2806 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2807 { 2808 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2809 (__v16sf) __B, 2810 (__v16sf) __C, 2811 (__mmask16) __U, 2812 _MM_FROUND_CUR_DIRECTION); 2813 } 2814 2815 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2816 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2817 { 2818 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2819 -(__v16sf) __B, 2820 -(__v16sf) __C, 2821 (__mmask16) -1, 2822 _MM_FROUND_CUR_DIRECTION); 2823 } 2824 2825 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2826 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2827 { 2828 return (__m512) 
__builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2829 (__v16sf) __B, 2830 -(__v16sf) __C, 2831 (__mmask16) __U, 2832 _MM_FROUND_CUR_DIRECTION); 2833 } 2834 2835 #define _mm512_fmaddsub_round_pd(A, B, C, R) \ 2836 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2837 (__v8df)(__m512d)(B), \ 2838 (__v8df)(__m512d)(C), \ 2839 (__mmask8)-1, (int)(R)) 2840 2841 2842 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ 2843 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2844 (__v8df)(__m512d)(B), \ 2845 (__v8df)(__m512d)(C), \ 2846 (__mmask8)(U), (int)(R)) 2847 2848 2849 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ 2850 (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ 2851 (__v8df)(__m512d)(B), \ 2852 (__v8df)(__m512d)(C), \ 2853 (__mmask8)(U), (int)(R)) 2854 2855 2856 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ 2857 (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2858 (__v8df)(__m512d)(B), \ 2859 (__v8df)(__m512d)(C), \ 2860 (__mmask8)(U), (int)(R)) 2861 2862 2863 #define _mm512_fmsubadd_round_pd(A, B, C, R) \ 2864 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2865 (__v8df)(__m512d)(B), \ 2866 -(__v8df)(__m512d)(C), \ 2867 (__mmask8)-1, (int)(R)) 2868 2869 2870 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ 2871 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2872 (__v8df)(__m512d)(B), \ 2873 -(__v8df)(__m512d)(C), \ 2874 (__mmask8)(U), (int)(R)) 2875 2876 2877 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ 2878 (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2879 (__v8df)(__m512d)(B), \ 2880 -(__v8df)(__m512d)(C), \ 2881 (__mmask8)(U), (int)(R)) 2882 2883 2884 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2885 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) 2886 { 2887 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2888 (__v8df) __B, 2889 (__v8df) __C, 
2890 (__mmask8) -1, 2891 _MM_FROUND_CUR_DIRECTION); 2892 } 2893 2894 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2895 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2896 { 2897 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2898 (__v8df) __B, 2899 (__v8df) __C, 2900 (__mmask8) __U, 2901 _MM_FROUND_CUR_DIRECTION); 2902 } 2903 2904 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2905 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2906 { 2907 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, 2908 (__v8df) __B, 2909 (__v8df) __C, 2910 (__mmask8) __U, 2911 _MM_FROUND_CUR_DIRECTION); 2912 } 2913 2914 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2915 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2916 { 2917 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2918 (__v8df) __B, 2919 (__v8df) __C, 2920 (__mmask8) __U, 2921 _MM_FROUND_CUR_DIRECTION); 2922 } 2923 2924 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2925 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) 2926 { 2927 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2928 (__v8df) __B, 2929 -(__v8df) __C, 2930 (__mmask8) -1, 2931 _MM_FROUND_CUR_DIRECTION); 2932 } 2933 2934 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2935 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2936 { 2937 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2938 (__v8df) __B, 2939 -(__v8df) __C, 2940 (__mmask8) __U, 2941 _MM_FROUND_CUR_DIRECTION); 2942 } 2943 2944 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2945 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2946 { 2947 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2948 (__v8df) __B, 2949 -(__v8df) __C, 2950 (__mmask8) __U, 2951 _MM_FROUND_CUR_DIRECTION); 2952 } 2953 2954 #define _mm512_fmaddsub_round_ps(A, B, 
C, R) \ 2955 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2956 (__v16sf)(__m512)(B), \ 2957 (__v16sf)(__m512)(C), \ 2958 (__mmask16)-1, (int)(R)) 2959 2960 2961 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ 2962 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2963 (__v16sf)(__m512)(B), \ 2964 (__v16sf)(__m512)(C), \ 2965 (__mmask16)(U), (int)(R)) 2966 2967 2968 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ 2969 (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ 2970 (__v16sf)(__m512)(B), \ 2971 (__v16sf)(__m512)(C), \ 2972 (__mmask16)(U), (int)(R)) 2973 2974 2975 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ 2976 (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2977 (__v16sf)(__m512)(B), \ 2978 (__v16sf)(__m512)(C), \ 2979 (__mmask16)(U), (int)(R)) 2980 2981 2982 #define _mm512_fmsubadd_round_ps(A, B, C, R) \ 2983 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2984 (__v16sf)(__m512)(B), \ 2985 -(__v16sf)(__m512)(C), \ 2986 (__mmask16)-1, (int)(R)) 2987 2988 2989 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ 2990 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2991 (__v16sf)(__m512)(B), \ 2992 -(__v16sf)(__m512)(C), \ 2993 (__mmask16)(U), (int)(R)) 2994 2995 2996 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ 2997 (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2998 (__v16sf)(__m512)(B), \ 2999 -(__v16sf)(__m512)(C), \ 3000 (__mmask16)(U), (int)(R)) 3001 3002 3003 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3004 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) 3005 { 3006 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3007 (__v16sf) __B, 3008 (__v16sf) __C, 3009 (__mmask16) -1, 3010 _MM_FROUND_CUR_DIRECTION); 3011 } 3012 3013 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3014 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3015 { 
3016 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3017 (__v16sf) __B, 3018 (__v16sf) __C, 3019 (__mmask16) __U, 3020 _MM_FROUND_CUR_DIRECTION); 3021 } 3022 3023 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3024 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3025 { 3026 return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, 3027 (__v16sf) __B, 3028 (__v16sf) __C, 3029 (__mmask16) __U, 3030 _MM_FROUND_CUR_DIRECTION); 3031 } 3032 3033 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3034 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3035 { 3036 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3037 (__v16sf) __B, 3038 (__v16sf) __C, 3039 (__mmask16) __U, 3040 _MM_FROUND_CUR_DIRECTION); 3041 } 3042 3043 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3044 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) 3045 { 3046 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3047 (__v16sf) __B, 3048 -(__v16sf) __C, 3049 (__mmask16) -1, 3050 _MM_FROUND_CUR_DIRECTION); 3051 } 3052 3053 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3054 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3055 { 3056 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3057 (__v16sf) __B, 3058 -(__v16sf) __C, 3059 (__mmask16) __U, 3060 _MM_FROUND_CUR_DIRECTION); 3061 } 3062 3063 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3064 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3065 { 3066 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3067 (__v16sf) __B, 3068 -(__v16sf) __C, 3069 (__mmask16) __U, 3070 _MM_FROUND_CUR_DIRECTION); 3071 } 3072 3073 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ 3074 (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ 3075 (__v8df)(__m512d)(B), \ 3076 (__v8df)(__m512d)(C), \ 3077 (__mmask8)(U), (int)(R)) 3078 3079 3080 static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 3081 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3082 { 3083 return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, 3084 (__v8df) __B, 3085 (__v8df) __C, 3086 (__mmask8) __U, 3087 _MM_FROUND_CUR_DIRECTION); 3088 } 3089 3090 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ 3091 (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ 3092 (__v16sf)(__m512)(B), \ 3093 (__v16sf)(__m512)(C), \ 3094 (__mmask16)(U), (int)(R)) 3095 3096 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3097 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3098 { 3099 return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, 3100 (__v16sf) __B, 3101 (__v16sf) __C, 3102 (__mmask16) __U, 3103 _MM_FROUND_CUR_DIRECTION); 3104 } 3105 3106 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ 3107 (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ 3108 (__v8df)(__m512d)(B), \ 3109 (__v8df)(__m512d)(C), \ 3110 (__mmask8)(U), (int)(R)) 3111 3112 3113 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3114 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3115 { 3116 return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, 3117 (__v8df) __B, 3118 (__v8df) __C, 3119 (__mmask8) __U, 3120 _MM_FROUND_CUR_DIRECTION); 3121 } 3122 3123 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ 3124 (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ 3125 (__v16sf)(__m512)(B), \ 3126 (__v16sf)(__m512)(C), \ 3127 (__mmask16)(U), (int)(R)) 3128 3129 3130 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3131 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3132 { 3133 return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, 3134 (__v16sf) __B, 3135 (__v16sf) __C, 3136 (__mmask16) __U, 3137 _MM_FROUND_CUR_DIRECTION); 3138 } 3139 3140 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ 3141 
(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3142 -(__v8df)(__m512d)(B), \ 3143 (__v8df)(__m512d)(C), \ 3144 (__mmask8)(U), (int)(R)) 3145 3146 3147 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3148 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3149 { 3150 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3151 -(__v8df) __B, 3152 (__v8df) __C, 3153 (__mmask8) __U, 3154 _MM_FROUND_CUR_DIRECTION); 3155 } 3156 3157 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ 3158 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3159 -(__v16sf)(__m512)(B), \ 3160 (__v16sf)(__m512)(C), \ 3161 (__mmask16)(U), (int)(R)) 3162 3163 3164 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3165 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3166 { 3167 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3168 -(__v16sf) __B, 3169 (__v16sf) __C, 3170 (__mmask16) __U, 3171 _MM_FROUND_CUR_DIRECTION); 3172 } 3173 3174 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ 3175 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3176 -(__v8df)(__m512d)(B), \ 3177 -(__v8df)(__m512d)(C), \ 3178 (__mmask8)(U), (int)(R)) 3179 3180 3181 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ 3182 (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ 3183 (__v8df)(__m512d)(B), \ 3184 (__v8df)(__m512d)(C), \ 3185 (__mmask8)(U), (int)(R)) 3186 3187 3188 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3189 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3190 { 3191 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3192 -(__v8df) __B, 3193 -(__v8df) __C, 3194 (__mmask8) __U, 3195 _MM_FROUND_CUR_DIRECTION); 3196 } 3197 3198 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3199 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3200 { 3201 return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, 
3202 (__v8df) __B, 3203 (__v8df) __C, 3204 (__mmask8) __U, 3205 _MM_FROUND_CUR_DIRECTION); 3206 } 3207 3208 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ 3209 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3210 -(__v16sf)(__m512)(B), \ 3211 -(__v16sf)(__m512)(C), \ 3212 (__mmask16)(U), (int)(R)) 3213 3214 3215 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ 3216 (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ 3217 (__v16sf)(__m512)(B), \ 3218 (__v16sf)(__m512)(C), \ 3219 (__mmask16)(U), (int)(R)) 3220 3221 3222 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3223 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3224 { 3225 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3226 -(__v16sf) __B, 3227 -(__v16sf) __C, 3228 (__mmask16) __U, 3229 _MM_FROUND_CUR_DIRECTION); 3230 } 3231 3232 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3233 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3234 { 3235 return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, 3236 (__v16sf) __B, 3237 (__v16sf) __C, 3238 (__mmask16) __U, 3239 _MM_FROUND_CUR_DIRECTION); 3240 } 3241 3242 3243 3244 /* Vector permutations */ 3245 3246 static __inline __m512i __DEFAULT_FN_ATTRS512 3247 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) 3248 { 3249 return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, 3250 (__v16si) __B); 3251 } 3252 3253 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3254 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, 3255 __m512i __B) 3256 { 3257 return (__m512i)__builtin_ia32_selectd_512(__U, 3258 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3259 (__v16si)__A); 3260 } 3261 3262 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3263 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, 3264 __m512i __B) 3265 { 3266 return (__m512i)__builtin_ia32_selectd_512(__U, 3267 
(__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3268 (__v16si)__I); 3269 } 3270 3271 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3272 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, 3273 __m512i __B) 3274 { 3275 return (__m512i)__builtin_ia32_selectd_512(__U, 3276 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3277 (__v16si)_mm512_setzero_si512()); 3278 } 3279 3280 static __inline __m512i __DEFAULT_FN_ATTRS512 3281 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) 3282 { 3283 return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, 3284 (__v8di) __B); 3285 } 3286 3287 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3288 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, 3289 __m512i __B) 3290 { 3291 return (__m512i)__builtin_ia32_selectq_512(__U, 3292 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3293 (__v8di)__A); 3294 } 3295 3296 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3297 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, 3298 __m512i __B) 3299 { 3300 return (__m512i)__builtin_ia32_selectq_512(__U, 3301 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3302 (__v8di)__I); 3303 } 3304 3305 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3306 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, 3307 __m512i __B) 3308 { 3309 return (__m512i)__builtin_ia32_selectq_512(__U, 3310 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3311 (__v8di)_mm512_setzero_si512()); 3312 } 3313 3314 #define _mm512_alignr_epi64(A, B, I) \ 3315 (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ 3316 (__v8di)(__m512i)(B), (int)(I)) 3317 3318 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ 3319 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3320 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3321 (__v8di)(__m512i)(W)) 3322 3323 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ 3324 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 
3325 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3326 (__v8di)_mm512_setzero_si512()) 3327 3328 #define _mm512_alignr_epi32(A, B, I) \ 3329 (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ 3330 (__v16si)(__m512i)(B), (int)(I)) 3331 3332 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ 3333 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3334 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3335 (__v16si)(__m512i)(W)) 3336 3337 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ 3338 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3339 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3340 (__v16si)_mm512_setzero_si512()) 3341 /* Vector Extract */ 3342 3343 #define _mm512_extractf64x4_pd(A, I) \ 3344 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ 3345 (__v4df)_mm256_undefined_pd(), \ 3346 (__mmask8)-1) 3347 3348 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ 3349 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3350 (__v4df)(__m256d)(W), \ 3351 (__mmask8)(U)) 3352 3353 #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ 3354 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3355 (__v4df)_mm256_setzero_pd(), \ 3356 (__mmask8)(U)) 3357 3358 #define _mm512_extractf32x4_ps(A, I) \ 3359 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ 3360 (__v4sf)_mm_undefined_ps(), \ 3361 (__mmask8)-1) 3362 3363 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ 3364 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3365 (__v4sf)(__m128)(W), \ 3366 (__mmask8)(U)) 3367 3368 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ 3369 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3370 (__v4sf)_mm_setzero_ps(), \ 3371 (__mmask8)(U)) 3372 3373 /* Vector Blend */ 3374 3375 static __inline __m512d __DEFAULT_FN_ATTRS512 3376 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) 3377 { 3378 
return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 3379 (__v8df) __W, 3380 (__v8df) __A); 3381 } 3382 3383 static __inline __m512 __DEFAULT_FN_ATTRS512 3384 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) 3385 { 3386 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 3387 (__v16sf) __W, 3388 (__v16sf) __A); 3389 } 3390 3391 static __inline __m512i __DEFAULT_FN_ATTRS512 3392 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) 3393 { 3394 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 3395 (__v8di) __W, 3396 (__v8di) __A); 3397 } 3398 3399 static __inline __m512i __DEFAULT_FN_ATTRS512 3400 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) 3401 { 3402 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 3403 (__v16si) __W, 3404 (__v16si) __A); 3405 } 3406 3407 /* Compare */ 3408 3409 #define _mm512_cmp_round_ps_mask(A, B, P, R) \ 3410 (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3411 (__v16sf)(__m512)(B), (int)(P), \ 3412 (__mmask16)-1, (int)(R)) 3413 3414 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ 3415 (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3416 (__v16sf)(__m512)(B), (int)(P), \ 3417 (__mmask16)(U), (int)(R)) 3418 3419 #define _mm512_cmp_ps_mask(A, B, P) \ 3420 _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3421 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ 3422 _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3423 3424 #define _mm512_cmpeq_ps_mask(A, B) \ 3425 _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) 3426 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \ 3427 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) 3428 3429 #define _mm512_cmplt_ps_mask(A, B) \ 3430 _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) 3431 #define _mm512_mask_cmplt_ps_mask(k, A, B) \ 3432 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) 3433 3434 #define _mm512_cmple_ps_mask(A, B) \ 3435 _mm512_cmp_ps_mask((A), (B), 
_CMP_LE_OS) 3436 #define _mm512_mask_cmple_ps_mask(k, A, B) \ 3437 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) 3438 3439 #define _mm512_cmpunord_ps_mask(A, B) \ 3440 _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) 3441 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \ 3442 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) 3443 3444 #define _mm512_cmpneq_ps_mask(A, B) \ 3445 _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) 3446 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \ 3447 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) 3448 3449 #define _mm512_cmpnlt_ps_mask(A, B) \ 3450 _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) 3451 #define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ 3452 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) 3453 3454 #define _mm512_cmpnle_ps_mask(A, B) \ 3455 _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) 3456 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \ 3457 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) 3458 3459 #define _mm512_cmpord_ps_mask(A, B) \ 3460 _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) 3461 #define _mm512_mask_cmpord_ps_mask(k, A, B) \ 3462 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) 3463 3464 #define _mm512_cmp_round_pd_mask(A, B, P, R) \ 3465 (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3466 (__v8df)(__m512d)(B), (int)(P), \ 3467 (__mmask8)-1, (int)(R)) 3468 3469 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ 3470 (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3471 (__v8df)(__m512d)(B), (int)(P), \ 3472 (__mmask8)(U), (int)(R)) 3473 3474 #define _mm512_cmp_pd_mask(A, B, P) \ 3475 _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3476 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ 3477 _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3478 3479 #define _mm512_cmpeq_pd_mask(A, B) \ 3480 _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) 3481 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \ 3482 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) 3483 3484 #define 
_mm512_cmplt_pd_mask(A, B) \ 3485 _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) 3486 #define _mm512_mask_cmplt_pd_mask(k, A, B) \ 3487 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) 3488 3489 #define _mm512_cmple_pd_mask(A, B) \ 3490 _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) 3491 #define _mm512_mask_cmple_pd_mask(k, A, B) \ 3492 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) 3493 3494 #define _mm512_cmpunord_pd_mask(A, B) \ 3495 _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) 3496 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \ 3497 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) 3498 3499 #define _mm512_cmpneq_pd_mask(A, B) \ 3500 _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) 3501 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \ 3502 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) 3503 3504 #define _mm512_cmpnlt_pd_mask(A, B) \ 3505 _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) 3506 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ 3507 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) 3508 3509 #define _mm512_cmpnle_pd_mask(A, B) \ 3510 _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) 3511 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \ 3512 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) 3513 3514 #define _mm512_cmpord_pd_mask(A, B) \ 3515 _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) 3516 #define _mm512_mask_cmpord_pd_mask(k, A, B) \ 3517 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) 3518 3519 /* Conversion */ 3520 3521 #define _mm512_cvtt_roundps_epu32(A, R) \ 3522 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3523 (__v16si)_mm512_undefined_epi32(), \ 3524 (__mmask16)-1, (int)(R)) 3525 3526 #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ 3527 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3528 (__v16si)(__m512i)(W), \ 3529 (__mmask16)(U), (int)(R)) 3530 3531 #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ 3532 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3533 (__v16si)_mm512_setzero_si512(), \ 3534 
(__mmask16)(U), (int)(R)) 3535 3536 3537 static __inline __m512i __DEFAULT_FN_ATTRS512 3538 _mm512_cvttps_epu32(__m512 __A) 3539 { 3540 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3541 (__v16si) 3542 _mm512_setzero_si512 (), 3543 (__mmask16) -1, 3544 _MM_FROUND_CUR_DIRECTION); 3545 } 3546 3547 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3548 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 3549 { 3550 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3551 (__v16si) __W, 3552 (__mmask16) __U, 3553 _MM_FROUND_CUR_DIRECTION); 3554 } 3555 3556 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3557 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) 3558 { 3559 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3560 (__v16si) _mm512_setzero_si512 (), 3561 (__mmask16) __U, 3562 _MM_FROUND_CUR_DIRECTION); 3563 } 3564 3565 #define _mm512_cvt_roundepi32_ps(A, R) \ 3566 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3567 (__v16sf)_mm512_setzero_ps(), \ 3568 (__mmask16)-1, (int)(R)) 3569 3570 #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ 3571 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3572 (__v16sf)(__m512)(W), \ 3573 (__mmask16)(U), (int)(R)) 3574 3575 #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ 3576 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3577 (__v16sf)_mm512_setzero_ps(), \ 3578 (__mmask16)(U), (int)(R)) 3579 3580 #define _mm512_cvt_roundepu32_ps(A, R) \ 3581 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3582 (__v16sf)_mm512_setzero_ps(), \ 3583 (__mmask16)-1, (int)(R)) 3584 3585 #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ 3586 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3587 (__v16sf)(__m512)(W), \ 3588 (__mmask16)(U), (int)(R)) 3589 3590 #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ 3591 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3592 
(__v16sf)_mm512_setzero_ps(), \ 3593 (__mmask16)(U), (int)(R)) 3594 3595 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3596 _mm512_cvtepu32_ps (__m512i __A) 3597 { 3598 return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); 3599 } 3600 3601 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3602 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3603 { 3604 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3605 (__v16sf)_mm512_cvtepu32_ps(__A), 3606 (__v16sf)__W); 3607 } 3608 3609 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3610 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) 3611 { 3612 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3613 (__v16sf)_mm512_cvtepu32_ps(__A), 3614 (__v16sf)_mm512_setzero_ps()); 3615 } 3616 3617 static __inline __m512d __DEFAULT_FN_ATTRS512 3618 _mm512_cvtepi32_pd(__m256i __A) 3619 { 3620 return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); 3621 } 3622 3623 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3624 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3625 { 3626 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3627 (__v8df)_mm512_cvtepi32_pd(__A), 3628 (__v8df)__W); 3629 } 3630 3631 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3632 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) 3633 { 3634 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3635 (__v8df)_mm512_cvtepi32_pd(__A), 3636 (__v8df)_mm512_setzero_pd()); 3637 } 3638 3639 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3640 _mm512_cvtepi32lo_pd(__m512i __A) 3641 { 3642 return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); 3643 } 3644 3645 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3646 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3647 { 3648 return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3649 } 3650 3651 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3652 _mm512_cvtepi32_ps (__m512i __A) 3653 { 3654 
return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); 3655 } 3656 3657 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3658 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3659 { 3660 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3661 (__v16sf)_mm512_cvtepi32_ps(__A), 3662 (__v16sf)__W); 3663 } 3664 3665 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3666 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) 3667 { 3668 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3669 (__v16sf)_mm512_cvtepi32_ps(__A), 3670 (__v16sf)_mm512_setzero_ps()); 3671 } 3672 3673 static __inline __m512d __DEFAULT_FN_ATTRS512 3674 _mm512_cvtepu32_pd(__m256i __A) 3675 { 3676 return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); 3677 } 3678 3679 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3680 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3681 { 3682 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3683 (__v8df)_mm512_cvtepu32_pd(__A), 3684 (__v8df)__W); 3685 } 3686 3687 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3688 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) 3689 { 3690 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3691 (__v8df)_mm512_cvtepu32_pd(__A), 3692 (__v8df)_mm512_setzero_pd()); 3693 } 3694 3695 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3696 _mm512_cvtepu32lo_pd(__m512i __A) 3697 { 3698 return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); 3699 } 3700 3701 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3702 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3703 { 3704 return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3705 } 3706 3707 #define _mm512_cvt_roundpd_ps(A, R) \ 3708 (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3709 (__v8sf)_mm256_setzero_ps(), \ 3710 (__mmask8)-1, (int)(R)) 3711 3712 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ 3713 
(__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3714 (__v8sf)(__m256)(W), (__mmask8)(U), \ 3715 (int)(R)) 3716 3717 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ 3718 (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3719 (__v8sf)_mm256_setzero_ps(), \ 3720 (__mmask8)(U), (int)(R)) 3721 3722 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3723 _mm512_cvtpd_ps (__m512d __A) 3724 { 3725 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3726 (__v8sf) _mm256_undefined_ps (), 3727 (__mmask8) -1, 3728 _MM_FROUND_CUR_DIRECTION); 3729 } 3730 3731 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3732 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) 3733 { 3734 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3735 (__v8sf) __W, 3736 (__mmask8) __U, 3737 _MM_FROUND_CUR_DIRECTION); 3738 } 3739 3740 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3741 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) 3742 { 3743 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3744 (__v8sf) _mm256_setzero_ps (), 3745 (__mmask8) __U, 3746 _MM_FROUND_CUR_DIRECTION); 3747 } 3748 3749 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3750 _mm512_cvtpd_pslo (__m512d __A) 3751 { 3752 return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), 3753 (__v8sf) _mm256_setzero_ps (), 3754 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3755 } 3756 3757 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3758 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) 3759 { 3760 return (__m512) __builtin_shufflevector ( 3761 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), 3762 __U, __A), 3763 (__v8sf) _mm256_setzero_ps (), 3764 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3765 } 3766 3767 #define _mm512_cvt_roundps_ph(A, I) \ 3768 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3769 (__v16hi)_mm256_undefined_si256(), \ 3770 (__mmask16)-1) 3771 3772 #define 
_mm512_mask_cvt_roundps_ph(U, W, A, I) \ 3773 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3774 (__v16hi)(__m256i)(U), \ 3775 (__mmask16)(W)) 3776 3777 #define _mm512_maskz_cvt_roundps_ph(W, A, I) \ 3778 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3779 (__v16hi)_mm256_setzero_si256(), \ 3780 (__mmask16)(W)) 3781 3782 #define _mm512_cvtps_ph _mm512_cvt_roundps_ph 3783 #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph 3784 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph 3785 3786 #define _mm512_cvt_roundph_ps(A, R) \ 3787 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3788 (__v16sf)_mm512_undefined_ps(), \ 3789 (__mmask16)-1, (int)(R)) 3790 3791 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ 3792 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3793 (__v16sf)(__m512)(W), \ 3794 (__mmask16)(U), (int)(R)) 3795 3796 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ 3797 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3798 (__v16sf)_mm512_setzero_ps(), \ 3799 (__mmask16)(U), (int)(R)) 3800 3801 3802 static __inline __m512 __DEFAULT_FN_ATTRS512 3803 _mm512_cvtph_ps(__m256i __A) 3804 { 3805 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3806 (__v16sf) 3807 _mm512_setzero_ps (), 3808 (__mmask16) -1, 3809 _MM_FROUND_CUR_DIRECTION); 3810 } 3811 3812 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3813 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) 3814 { 3815 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3816 (__v16sf) __W, 3817 (__mmask16) __U, 3818 _MM_FROUND_CUR_DIRECTION); 3819 } 3820 3821 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3822 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) 3823 { 3824 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3825 (__v16sf) _mm512_setzero_ps (), 3826 (__mmask16) __U, 3827 _MM_FROUND_CUR_DIRECTION); 3828 } 3829 3830 #define 
_mm512_cvtt_roundpd_epi32(A, R) \
  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)-1, (int)(R))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
                                            (__mmask8)(U), (int)(R))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)(U), (int)(R))

/* Truncating conversion of 8 doubles to 8 signed 32-bit ints.  The mask_*
   variant takes unselected lanes from __W; the maskz_* variant zeroes them. */
static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epi32(__m512d __a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
                                                   (__v8si)_mm256_setzero_si256(),
                                                   (__mmask8) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                                                     (__v8si) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                                                     (__v8si) _mm256_setzero_si256 (),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Truncating conversion of 16 floats to 16 signed 32-bit ints; the *_round*
   macro forms take an explicit rounding/SAE operand R. */
#define _mm512_cvtt_roundps_epi32(A, R) \
  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)-1, (int)(R))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
                                            (__mmask16)(U), (int)(R))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)(U), (int)(R))

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi32(__m512 __a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                                                     (__v16si) __W,
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                                                     (__v16si) _mm512_setzero_si512 (),
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Rounding (non-truncating) conversion of 16 floats to signed 32-bit ints. */
#define _mm512_cvt_roundps_epi32(A, R) \
  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                           (__v16si)_mm512_setzero_si512(), \
                                           (__mmask16)-1, (int)(R))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                           (__v16si)(__m512i)(W), \
                                           (__mmask16)(U), (int)(R))

#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                           (__v16si)_mm512_setzero_si512(), \
                                           (__mmask16)(U), (int)(R))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi32 (__m512 __A)
{
  /* Unmasked form: pass-through source is an undefined vector. */
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                                    (__v16si) _mm512_undefined_epi32 (),
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                                    (__v16si) __W,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                                    (__v16si)
                                                    _mm512_setzero_si512 (),
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* Rounding conversion of 8 doubles to 8 signed 32-bit ints. */
#define _mm512_cvt_roundpd_epi32(A, R) \
  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                           (__v8si)_mm256_setzero_si256(), \
                                           (__mmask8)-1, (int)(R))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                           (__v8si)(__m256i)(W), \
                                           (__mmask8)(U), (int)(R))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                           (__v8si)_mm256_setzero_si256(), \
                                           (__mmask8)(U), (int)(R))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epi32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                    (__v8si)
                                                    _mm256_undefined_si256 (),
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                    (__v8si) __W,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                    (__v8si)
                                                    _mm256_setzero_si256 (),
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* Conversion of 16 floats to 16 unsigned 32-bit ints. */
#define _mm512_cvt_roundps_epu32(A, R) \
  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)-1, (int)(R))

#define
_mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
                                            (__mmask16)(U), (int)(R))

#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)(U), (int)(R))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epu32 ( __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                                     (__v16si)
                                                     _mm512_undefined_epi32 (),
                                                     (__mmask16) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                                     (__v16si) __W,
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                                     (__v16si)
                                                     _mm512_setzero_si512 (),
                                                     (__mmask16) __U ,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Conversion of 8 doubles to 8 unsigned 32-bit ints. */
#define _mm512_cvt_roundpd_epu32(A, R) \
  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)-1, (int)(R))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
                                            (__mmask8)(U), (int)(R))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)(U), (int)(R))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                     (__v8si)
                                                     _mm256_undefined_si256 (),
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                     (__v8si) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                     (__v8si)
                                                     _mm256_setzero_si256 (),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Extract the lowest scalar lane. */
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_cvtsd_f64(__m512d __a)
{
  return __a[0];
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_cvtss_f32(__m512 __a)
{
  return __a[0];
}

/* Unpack and Interleave */

/* Interleave the high (odd-indexed) doubles of each 128-bit lane of __a and
   __b. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpackhi_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_unpackhi_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_unpackhi_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

/* Interleave the low (even-indexed) doubles of each 128-bit lane. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpacklo_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          0, 8, 0+2, 8+2, 0+4,
8+4, 0+6, 8+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_unpacklo_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_unpacklo_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

/* Interleave the high floats of each 128-bit lane of __a and __b. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpackhi_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         2, 18, 3, 19,
                                         2+4, 18+4, 3+4, 19+4,
                                         2+8, 18+8, 3+8, 19+8,
                                         2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                             (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                             (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Interleave the low floats of each 128-bit lane. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpacklo_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         0, 16, 1, 17,
                                         0+4, 16+4, 1+4, 17+4,
                                         0+8, 16+8, 1+8, 17+8,
                                         0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                             (__v16sf)_mm512_unpacklo_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                             (__v16sf)_mm512_unpacklo_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Integer unpack: same interleave patterns on 32-bit elements. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          2, 18, 3, 19,
                                          2+4, 18+4, 3+4, 19+4,
                                          2+8, 18+8, 3+8, 19+8,
                                          2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                             (__v16si)_mm512_unpackhi_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                             (__v16si)_mm512_unpackhi_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          0, 16, 1, 17,
                                          0+4, 16+4, 1+4, 17+4,
                                          0+8, 16+8, 1+8, 17+8,
                                          0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                             (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                             (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Integer unpack on 64-bit elements. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                             (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                             (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                             (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                             (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}


/* SIMD load ops */

/* Unaligned load: a packed, may_alias wrapper struct tells the compiler this
   access may be misaligned and may alias anything. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_si512 (void const *__P)
{
  struct __loadu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct
__loadu_si512*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi32 (void const *__P)
{
  struct __loadu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi32*)__P)->__v;
}

/* Masked unaligned loads go through builtins so inactive lanes are never
   touched in memory. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
                                                     (__v16si) __W,
                                                     (__mmask16) __U);
}


static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
                                                     (__v16si)
                                                     _mm512_setzero_si512 (),
                                                     (__mmask16) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi64 (void const *__P)
{
  struct __loadu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi64*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
                                                     (__v8di) __W,
                                                     (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
                                                     (__v8di)
                                                     _mm512_setzero_si512 (),
                                                     (__mmask8) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
                                                  (__v16sf)
                                                  _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
                                                   (__v8df) __W,
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
                                                   (__v8df)
                                                   _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_loadu_pd(void const *__p)
{
  struct __loadu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pd*)__p)->__v;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_loadu_ps(void const *__p)
{
  struct __loadu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}

/* Aligned loads: plain dereference; __p must be 64-byte aligned. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_load_ps(void const *__p)
{
  return *(const __m512*)__p;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
                                                  (__v16sf)
                                                  _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_load_pd(void const *__p)
{
  return *(const __m512d*)__p;
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
                                                   (__v8df) __W,
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
                                                   (__v8df)
                                                   _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_si512 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi32 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi64 (void const *__P)
{
  return *(const __m512i *) __P;
}

/* SIMD store ops */

/* Unaligned stores mirror the loads: packed/may_alias wrapper structs for the
   unmasked forms, builtins for the masked forms. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
  struct __storeu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi64*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                                     (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_si512 (void *__P, __m512i __A)
{
  struct __storeu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si512*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi32 (void *__P, __m512i __A)
{
  struct __storeu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi32*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                                     (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_pd(void *__P, __m512d __A)
{
  struct __storeu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_ps(void *__P, __m512 __A)
{
  struct __storeu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__P)->__v = __A;
}

/* Aligned stores: __P must be 64-byte aligned for the unmasked forms. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_si512 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi32 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi64 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

/* Mask ops */

/* Bitwise NOT of a 16-bit mask register. */
static __inline __mmask16 __DEFAULT_FN_ATTRS
_mm512_knot(__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}

/* Integer compare */

/* Named comparison aliases, expressed via the generic _mm512_cmp_* macros
   with the _MM_CMPINT_* predicate constants. */
#define _mm512_cmpeq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define
_mm512_mask_cmpeq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)

/* Sign-extending widening conversions (8->32, 8->64, 32->64 bit). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi32(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi64(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}

/* Sign-extending widening conversions (16->32, 16->64 bit). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi16_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi16_epi32(__A),
                                             (__v16si)_mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

/* Zero-extending widening conversions: the unsigned element types (__v16qu,
   __v8su, ...) make __builtin_convertvector zero-extend. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi32(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu16_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu16_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

/* Variable rotate-right of 32-bit elements (per-element counts in __B). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_rorv_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
4948 return (__m512i)__builtin_ia32_selectd_512(__U, 4949 (__v16si)_mm512_rorv_epi32(__A, __B), 4950 (__v16si)_mm512_setzero_si512()); 4951 } 4952 4953 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4954 _mm512_rorv_epi64 (__m512i __A, __m512i __B) 4955 { 4956 return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); 4957 } 4958 4959 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4960 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4961 { 4962 return (__m512i)__builtin_ia32_selectq_512(__U, 4963 (__v8di)_mm512_rorv_epi64(__A, __B), 4964 (__v8di)__W); 4965 } 4966 4967 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4968 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4969 { 4970 return (__m512i)__builtin_ia32_selectq_512(__U, 4971 (__v8di)_mm512_rorv_epi64(__A, __B), 4972 (__v8di)_mm512_setzero_si512()); 4973 } 4974 4975 4976 4977 #define _mm512_cmp_epi32_mask(a, b, p) \ 4978 (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4979 (__v16si)(__m512i)(b), (int)(p), \ 4980 (__mmask16)-1) 4981 4982 #define _mm512_cmp_epu32_mask(a, b, p) \ 4983 (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 4984 (__v16si)(__m512i)(b), (int)(p), \ 4985 (__mmask16)-1) 4986 4987 #define _mm512_cmp_epi64_mask(a, b, p) \ 4988 (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 4989 (__v8di)(__m512i)(b), (int)(p), \ 4990 (__mmask8)-1) 4991 4992 #define _mm512_cmp_epu64_mask(a, b, p) \ 4993 (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 4994 (__v8di)(__m512i)(b), (int)(p), \ 4995 (__mmask8)-1) 4996 4997 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ 4998 (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4999 (__v16si)(__m512i)(b), (int)(p), \ 5000 (__mmask16)(m)) 5001 5002 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ 5003 (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 5004 (__v16si)(__m512i)(b), (int)(p), \ 5005 (__mmask16)(m)) 5006 5007 
/* Masked 64-bit integer compares: m pre-filters which lanes are compared. */
#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                        (__v8di)(__m512i)(b), (int)(p), \
                                        (__mmask8)(m))

#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)(m))

/* Rotate each 32-bit element of a left by the immediate count b.
   Macro because b must be a compile-time immediate. */
#define _mm512_rol_epi32(a, b) \
  (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))

#define _mm512_mask_rol_epi32(W, U, a, b) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_rol_epi32((a), (b)), \
                                      (__v16si)(__m512i)(W))

#define _mm512_maskz_rol_epi32(U, a, b) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_rol_epi32((a), (b)), \
                                      (__v16si)_mm512_setzero_si512())

/* Rotate each 64-bit element of a left by the immediate count b. */
#define _mm512_rol_epi64(a, b) \
  (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))

#define _mm512_mask_rol_epi64(W, U, a, b) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_rol_epi64((a), (b)), \
                                      (__v8di)(__m512i)(W))

#define _mm512_maskz_rol_epi64(U, a, b) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_rol_epi64((a), (b)), \
                                      (__v8di)_mm512_setzero_si512())

/* Rotate each 32-bit element of __A left by the per-element count in __B
   (variable-count rotate). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_rolv_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_rolv_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Rotate each 64-bit element of __A left by the per-element count in __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                             (__v8di)_mm512_rolv_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                             (__v8di)_mm512_rolv_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Rotate each 32-bit element of A right by the immediate count B. */
#define _mm512_ror_epi32(A, B) \
  (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))

#define _mm512_mask_ror_epi32(W, U, A, B) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_ror_epi32((A), (B)), \
                                      (__v16si)(__m512i)(W))

#define _mm512_maskz_ror_epi32(U, A, B) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_ror_epi32((A), (B)), \
                                      (__v16si)_mm512_setzero_si512())

/* Rotate each 64-bit element of A right by the immediate count B. */
#define _mm512_ror_epi64(A, B) \
  (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))

#define _mm512_mask_ror_epi64(W, U, A, B) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_ror_epi64((A), (B)), \
                                      (__v8di)(__m512i)(W))

#define _mm512_maskz_ror_epi64(U, A, B) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_ror_epi64((A), (B)), \
                                      (__v8di)_mm512_setzero_si512())

/* Shift each 32-bit element of __A left by the scalar count __B,
   shifting in zeros. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_slli_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_slli_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Shift each 64-bit element of __A left by the scalar count __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_slli_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_slli_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Logical (zero-filling) right shift of each 32-bit element by the
   scalar count __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srli_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srli_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Logical right shift of each 64-bit element by the scalar count __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srli_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
                        unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srli_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Masked loads/stores of 32-bit elements.
   NOTE(review): the "movdqa" builtins suggest __P must be 64-byte
   aligned — confirm against the Intel intrinsics documentation. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
                                                        (__v16si) __W,
                                                        (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
                                                        (__v16si)
                                                        _mm512_setzero_si512 (),
                                                        (__mmask16) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
                                        (__mmask16) __U);
}

/* Masked register moves: blend __A with __W (mask) or zero (maskz). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                                               (__v16si) __A,
                                               (__v16si) __W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                                               (__v16si) __A,
                                               (__v16si) _mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                                               (__v8di) __A,
                                               (__v8di) __W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                                               (__v8di) __A,
                                               (__v8di) _mm512_setzero_si512 ());
}

/* Masked loads/stores of 64-bit elements (see alignment note above the
   32-bit variants). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
                                                        (__v8di) __W,
                                                        (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
                                                        (__v8di)
                                                        _mm512_setzero_si512 (),
                                                        (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
                                        (__mmask8) __U);
}

/* Duplicate the even-indexed double elements:
   result = { a0, a0, a2, a2, a4, a4, a6, a6 }. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_movedup_pd (__m512d __A)
{
  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
                                          0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

/* fixupimm family: imm selects a per-class fix-up table entry, C supplies
   the table, R is the rounding/SAE control.  Macros because imm and R
   must be immediates. */
#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
                                             (__mmask8)-1, (int)(R))

#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
                                             (__mmask8)(U), (int)(R))

#define _mm512_fixupimm_pd(A, B, C, imm) \
  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
                                             (__mmask8)-1, \
                                             _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), \
                                              (int)(imm), (__mmask8)(U), \
                                              (int)(R))

#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), \
                                              (int)(imm), (__mmask8)(U), \
                                              _MM_FROUND_CUR_DIRECTION)

#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)-1, (int)(R))

#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)(U), (int)(R))

#define _mm512_fixupimm_ps(A, B, C, imm) \
  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)-1, \
                                            _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), \
                                             (int)(imm), (__mmask16)(U), \
                                             (int)(R))

#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), \
                                             (int)(imm), (__mmask16)(U), \
                                             _MM_FROUND_CUR_DIRECTION)

/* Scalar (lowest-element) fixupimm variants. */
#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2di)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, (int)(R))

#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2di)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), (int)(R))

#define _mm_fixupimm_sd(A, B, C, imm) \
  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2di)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, \
                                          _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2di)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), \
                                          _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), (int)(R))

#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION)

#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4si)(__m128i)(C), (int)(imm), \
                                         (__mmask8)-1, (int)(R))

#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4si)(__m128i)(C), (int)(imm), \
                                         (__mmask8)(U), (int)(R))

#define _mm_fixupimm_ss(A, B, C, imm) \
  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4si)(__m128i)(C), (int)(imm), \
                                         (__mmask8)-1, \
                                         _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4si)(__m128i)(C), (int)(imm), \
                                         (__mmask8)(U), \
                                         _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), (int)(R))

#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), \
                                          _MM_FROUND_CUR_DIRECTION)

/* getexp: extract the (biased-unbiased) exponent of the low element; the
   upper elements of the result come from A. */
#define _mm_getexp_round_sd(A, B, R) \
  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(R))


static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_getexp_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
                                                           (__v2df) __B,
                                                           (__v2df) __W,
                                                           (__mmask8) __U,
                                                           _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
                                                           (__v2df) __B,
                                                           (__v2df) _mm_setzero_pd (),
                                                           (__mmask8) __U,
                                                           _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(R))

#define _mm_getexp_round_ss(A, B, R) \
  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_getexp_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                 (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                                                         (__v4sf) __B,
                                                         (__v4sf) __W,
                                                         (__mmask8) __U,
                                                         _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                                                         (__v4sf) __B,
                                                         (__v4sf) _mm_setzero_ps (),
                                                         (__mmask8) __U,
                                                         _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(R))

/* getmant: extract the mantissa of the low element.  C selects the
   normalization interval and D the sign control; they are packed into a
   single immediate as (D << 2) | C. */
#define _mm_getmant_round_sd(A, B, C, D, R) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, (int)(R))

#define _mm_getmant_sd(A, B, C, D) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, \
                                               _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R))

#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R))

#define _mm_getmant_round_ss(A, B, C, D, R) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, (int)(R))

#define _mm_getmant_ss(A, B, C, D) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, \
                                              _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), \
                                              _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R))

#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), \
                                              _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (int)(((D)<<2) | (C)), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), (int)(R))

/* Copy a 16-bit mask value (identity in C; maps to a kmov at codegen). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kmov (__mmask16 __A)
{
  return __A;
}

/* Scalar compares with explicit predicate P and rounding/SAE control R. */
#define _mm_comi_round_sd(A, B, P, R) \
  (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
                              (int)(P), (int)(R))

#define _mm_comi_round_ss(A, B, P, R) \
  (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
                              (int)(P), (int)(R))

#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) \
  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))
#endif

/* Shift each 32-bit element of __A left by the scalar count in the low
   64 bits of __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sll_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sll_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sll_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sll_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Per-element variable left shift. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sllv_epi32(__X, __Y),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sllv_epi32(__X, __Y),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

/* Arithmetic (sign-extending) right shift by a scalar count in __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sra_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sra_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sra_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sra_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Per-element variable arithmetic right shift. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srav_epi32(__X, __Y),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srav_epi32(__X, __Y),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srav_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srav_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

/* Logical right shift by a scalar count in __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srl_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srl_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srl_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srl_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* Per-element variable logical right shift. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srlv_epi32(__X, __Y),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srlv_epi32(__X, __Y),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

/* Ternary bit logic: each result bit is looked up in the 8-bit truth
   table imm, indexed by the corresponding bits of A, B and C.  imm must
   be a compile-time immediate. */
#define _mm512_ternarylogic_epi32(A, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
                                            (__v16si)(__m512i)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)-1)

#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
                                            (__v16si)(__m512i)(B), \
                                            (__v16si)(__m512i)(C), (int)(imm), \
                                            (__mmask16)(U))

#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
                                             (__v16si)(__m512i)(B), \
                                             (__v16si)(__m512i)(C), \
                                             (int)(imm), (__mmask16)(U))

#define _mm512_ternarylogic_epi64(A, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
                                            (__v8di)(__m512i)(B), \
                                            (__v8di)(__m512i)(C), (int)(imm), \
                                            (__mmask8)-1)

#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
                                            (__v8di)(__m512i)(B), \
                                            (__v8di)(__m512i)(C), (int)(imm), \
                                            (__mmask8)(U))

#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
  (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
                                             (__v8di)(__m512i)(B), \
                                             (__v8di)(__m512i)(C), (int)(imm), \
                                             (__mmask8)(U))

/* Scalar double/float -> integer conversions with explicit rounding R.
   The _si/_i pairs are aliases; _u variants convert to unsigned. */
#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))
#endif

#define _mm_cvt_roundsd_si32(A, R) \
  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))

#define _mm_cvt_roundsd_i32(A, R) \
  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))

#define _mm_cvt_roundsd_u32(A, R) \
  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) \
  (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
                                                  (int)(R))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
                                                           __A,
                                                           _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvt_roundss_si32(A, R) \
  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))

#define _mm_cvt_roundss_i32(A, R) \
  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))

#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) \
  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))

#define _mm_cvt_roundss_i64(A, R) \
  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))
#endif

#define _mm_cvt_roundss_u32(A, R) \
  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) \
  (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
                                                  (int)(R))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
                                                           __A,
                                                           _MM_FROUND_CUR_DIRECTION);
}
#endif

/* Truncating (round-toward-zero) scalar conversions. */
#define _mm_cvtt_roundsd_i32(A, R) \
(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) 6039 6040 #define _mm_cvtt_roundsd_si32(A, R) \ 6041 (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) 6042 6043 static __inline__ int __DEFAULT_FN_ATTRS128 6044 _mm_cvttsd_i32 (__m128d __A) 6045 { 6046 return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, 6047 _MM_FROUND_CUR_DIRECTION); 6048 } 6049 6050 #ifdef __x86_64__ 6051 #define _mm_cvtt_roundsd_si64(A, R) \ 6052 (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) 6053 6054 #define _mm_cvtt_roundsd_i64(A, R) \ 6055 (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) 6056 6057 static __inline__ long long __DEFAULT_FN_ATTRS128 6058 _mm_cvttsd_i64 (__m128d __A) 6059 { 6060 return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, 6061 _MM_FROUND_CUR_DIRECTION); 6062 } 6063 #endif 6064 6065 #define _mm_cvtt_roundsd_u32(A, R) \ 6066 (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)) 6067 6068 static __inline__ unsigned __DEFAULT_FN_ATTRS128 6069 _mm_cvttsd_u32 (__m128d __A) 6070 { 6071 return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, 6072 _MM_FROUND_CUR_DIRECTION); 6073 } 6074 6075 #ifdef __x86_64__ 6076 #define _mm_cvtt_roundsd_u64(A, R) \ 6077 (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ 6078 (int)(R)) 6079 6080 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 6081 _mm_cvttsd_u64 (__m128d __A) 6082 { 6083 return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) 6084 __A, 6085 _MM_FROUND_CUR_DIRECTION); 6086 } 6087 #endif 6088 6089 #define _mm_cvtt_roundss_i32(A, R) \ 6090 (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) 6091 6092 #define _mm_cvtt_roundss_si32(A, R) \ 6093 (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) 6094 6095 static __inline__ int __DEFAULT_FN_ATTRS128 6096 _mm_cvttss_i32 (__m128 __A) 6097 { 6098 return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, 6099 
_MM_FROUND_CUR_DIRECTION); 6100 } 6101 6102 #ifdef __x86_64__ 6103 #define _mm_cvtt_roundss_i64(A, R) \ 6104 (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) 6105 6106 #define _mm_cvtt_roundss_si64(A, R) \ 6107 (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) 6108 6109 static __inline__ long long __DEFAULT_FN_ATTRS128 6110 _mm_cvttss_i64 (__m128 __A) 6111 { 6112 return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, 6113 _MM_FROUND_CUR_DIRECTION); 6114 } 6115 #endif 6116 6117 #define _mm_cvtt_roundss_u32(A, R) \ 6118 (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)) 6119 6120 static __inline__ unsigned __DEFAULT_FN_ATTRS128 6121 _mm_cvttss_u32 (__m128 __A) 6122 { 6123 return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, 6124 _MM_FROUND_CUR_DIRECTION); 6125 } 6126 6127 #ifdef __x86_64__ 6128 #define _mm_cvtt_roundss_u64(A, R) \ 6129 (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ 6130 (int)(R)) 6131 6132 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 6133 _mm_cvttss_u64 (__m128 __A) 6134 { 6135 return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) 6136 __A, 6137 _MM_FROUND_CUR_DIRECTION); 6138 } 6139 #endif 6140 6141 #define _mm512_permute_pd(X, C) \ 6142 (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)) 6143 6144 #define _mm512_mask_permute_pd(W, U, X, C) \ 6145 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6146 (__v8df)_mm512_permute_pd((X), (C)), \ 6147 (__v8df)(__m512d)(W)) 6148 6149 #define _mm512_maskz_permute_pd(U, X, C) \ 6150 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6151 (__v8df)_mm512_permute_pd((X), (C)), \ 6152 (__v8df)_mm512_setzero_pd()) 6153 6154 #define _mm512_permute_ps(X, C) \ 6155 (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)) 6156 6157 #define _mm512_mask_permute_ps(W, U, X, C) \ 6158 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6159 
(__v16sf)_mm512_permute_ps((X), (C)), \ 6160 (__v16sf)(__m512)(W)) 6161 6162 #define _mm512_maskz_permute_ps(U, X, C) \ 6163 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6164 (__v16sf)_mm512_permute_ps((X), (C)), \ 6165 (__v16sf)_mm512_setzero_ps()) 6166 6167 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6168 _mm512_permutevar_pd(__m512d __A, __m512i __C) 6169 { 6170 return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); 6171 } 6172 6173 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6174 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) 6175 { 6176 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 6177 (__v8df)_mm512_permutevar_pd(__A, __C), 6178 (__v8df)__W); 6179 } 6180 6181 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6182 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) 6183 { 6184 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 6185 (__v8df)_mm512_permutevar_pd(__A, __C), 6186 (__v8df)_mm512_setzero_pd()); 6187 } 6188 6189 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6190 _mm512_permutevar_ps(__m512 __A, __m512i __C) 6191 { 6192 return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); 6193 } 6194 6195 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6196 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) 6197 { 6198 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 6199 (__v16sf)_mm512_permutevar_ps(__A, __C), 6200 (__v16sf)__W); 6201 } 6202 6203 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6204 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) 6205 { 6206 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 6207 (__v16sf)_mm512_permutevar_ps(__A, __C), 6208 (__v16sf)_mm512_setzero_ps()); 6209 } 6210 6211 static __inline __m512d __DEFAULT_FN_ATTRS512 6212 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) 6213 { 6214 return 
(__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, 6215 (__v8df)__B); 6216 } 6217 6218 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6219 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) 6220 { 6221 return (__m512d)__builtin_ia32_selectpd_512(__U, 6222 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6223 (__v8df)__A); 6224 } 6225 6226 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6227 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, 6228 __m512d __B) 6229 { 6230 return (__m512d)__builtin_ia32_selectpd_512(__U, 6231 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6232 (__v8df)(__m512d)__I); 6233 } 6234 6235 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6236 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, 6237 __m512d __B) 6238 { 6239 return (__m512d)__builtin_ia32_selectpd_512(__U, 6240 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6241 (__v8df)_mm512_setzero_pd()); 6242 } 6243 6244 static __inline __m512 __DEFAULT_FN_ATTRS512 6245 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) 6246 { 6247 return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, 6248 (__v16sf) __B); 6249 } 6250 6251 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6252 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) 6253 { 6254 return (__m512)__builtin_ia32_selectps_512(__U, 6255 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6256 (__v16sf)__A); 6257 } 6258 6259 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6260 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) 6261 { 6262 return (__m512)__builtin_ia32_selectps_512(__U, 6263 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6264 (__v16sf)(__m512)__I); 6265 } 6266 6267 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6268 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) 6269 { 6270 return (__m512)__builtin_ia32_selectps_512(__U, 6271 
(__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6272 (__v16sf)_mm512_setzero_ps()); 6273 } 6274 6275 6276 #define _mm512_cvtt_roundpd_epu32(A, R) \ 6277 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6278 (__v8si)_mm256_undefined_si256(), \ 6279 (__mmask8)-1, (int)(R)) 6280 6281 #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ 6282 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6283 (__v8si)(__m256i)(W), \ 6284 (__mmask8)(U), (int)(R)) 6285 6286 #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ 6287 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6288 (__v8si)_mm256_setzero_si256(), \ 6289 (__mmask8)(U), (int)(R)) 6290 6291 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6292 _mm512_cvttpd_epu32 (__m512d __A) 6293 { 6294 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6295 (__v8si) 6296 _mm256_undefined_si256 (), 6297 (__mmask8) -1, 6298 _MM_FROUND_CUR_DIRECTION); 6299 } 6300 6301 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6302 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) 6303 { 6304 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6305 (__v8si) __W, 6306 (__mmask8) __U, 6307 _MM_FROUND_CUR_DIRECTION); 6308 } 6309 6310 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6311 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) 6312 { 6313 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6314 (__v8si) 6315 _mm256_setzero_si256 (), 6316 (__mmask8) __U, 6317 _MM_FROUND_CUR_DIRECTION); 6318 } 6319 6320 #define _mm_roundscale_round_sd(A, B, imm, R) \ 6321 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6322 (__v2df)(__m128d)(B), \ 6323 (__v2df)_mm_setzero_pd(), \ 6324 (__mmask8)-1, (int)(imm), \ 6325 (int)(R)) 6326 6327 #define _mm_roundscale_sd(A, B, imm) \ 6328 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6329 (__v2df)(__m128d)(B), \ 6330 (__v2df)_mm_setzero_pd(), \ 6331 (__mmask8)-1, 
(int)(imm), \ 6332 _MM_FROUND_CUR_DIRECTION) 6333 6334 #define _mm_mask_roundscale_sd(W, U, A, B, imm) \ 6335 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6336 (__v2df)(__m128d)(B), \ 6337 (__v2df)(__m128d)(W), \ 6338 (__mmask8)(U), (int)(imm), \ 6339 _MM_FROUND_CUR_DIRECTION) 6340 6341 #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ 6342 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6343 (__v2df)(__m128d)(B), \ 6344 (__v2df)(__m128d)(W), \ 6345 (__mmask8)(U), (int)(I), \ 6346 (int)(R)) 6347 6348 #define _mm_maskz_roundscale_sd(U, A, B, I) \ 6349 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6350 (__v2df)(__m128d)(B), \ 6351 (__v2df)_mm_setzero_pd(), \ 6352 (__mmask8)(U), (int)(I), \ 6353 _MM_FROUND_CUR_DIRECTION) 6354 6355 #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ 6356 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6357 (__v2df)(__m128d)(B), \ 6358 (__v2df)_mm_setzero_pd(), \ 6359 (__mmask8)(U), (int)(I), \ 6360 (int)(R)) 6361 6362 #define _mm_roundscale_round_ss(A, B, imm, R) \ 6363 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6364 (__v4sf)(__m128)(B), \ 6365 (__v4sf)_mm_setzero_ps(), \ 6366 (__mmask8)-1, (int)(imm), \ 6367 (int)(R)) 6368 6369 #define _mm_roundscale_ss(A, B, imm) \ 6370 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6371 (__v4sf)(__m128)(B), \ 6372 (__v4sf)_mm_setzero_ps(), \ 6373 (__mmask8)-1, (int)(imm), \ 6374 _MM_FROUND_CUR_DIRECTION) 6375 6376 #define _mm_mask_roundscale_ss(W, U, A, B, I) \ 6377 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6378 (__v4sf)(__m128)(B), \ 6379 (__v4sf)(__m128)(W), \ 6380 (__mmask8)(U), (int)(I), \ 6381 _MM_FROUND_CUR_DIRECTION) 6382 6383 #define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ 6384 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6385 (__v4sf)(__m128)(B), \ 6386 (__v4sf)(__m128)(W), \ 6387 
(__mmask8)(U), (int)(I), \ 6388 (int)(R)) 6389 6390 #define _mm_maskz_roundscale_ss(U, A, B, I) \ 6391 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6392 (__v4sf)(__m128)(B), \ 6393 (__v4sf)_mm_setzero_ps(), \ 6394 (__mmask8)(U), (int)(I), \ 6395 _MM_FROUND_CUR_DIRECTION) 6396 6397 #define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ 6398 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6399 (__v4sf)(__m128)(B), \ 6400 (__v4sf)_mm_setzero_ps(), \ 6401 (__mmask8)(U), (int)(I), \ 6402 (int)(R)) 6403 6404 #define _mm512_scalef_round_pd(A, B, R) \ 6405 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6406 (__v8df)(__m512d)(B), \ 6407 (__v8df)_mm512_undefined_pd(), \ 6408 (__mmask8)-1, (int)(R)) 6409 6410 #define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ 6411 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6412 (__v8df)(__m512d)(B), \ 6413 (__v8df)(__m512d)(W), \ 6414 (__mmask8)(U), (int)(R)) 6415 6416 #define _mm512_maskz_scalef_round_pd(U, A, B, R) \ 6417 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6418 (__v8df)(__m512d)(B), \ 6419 (__v8df)_mm512_setzero_pd(), \ 6420 (__mmask8)(U), (int)(R)) 6421 6422 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6423 _mm512_scalef_pd (__m512d __A, __m512d __B) 6424 { 6425 return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, 6426 (__v8df) __B, 6427 (__v8df) 6428 _mm512_undefined_pd (), 6429 (__mmask8) -1, 6430 _MM_FROUND_CUR_DIRECTION); 6431 } 6432 6433 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6434 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 6435 { 6436 return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, 6437 (__v8df) __B, 6438 (__v8df) __W, 6439 (__mmask8) __U, 6440 _MM_FROUND_CUR_DIRECTION); 6441 } 6442 6443 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6444 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) 6445 { 6446 return (__m512d) 
__builtin_ia32_scalefpd512_mask ((__v8df) __A, 6447 (__v8df) __B, 6448 (__v8df) 6449 _mm512_setzero_pd (), 6450 (__mmask8) __U, 6451 _MM_FROUND_CUR_DIRECTION); 6452 } 6453 6454 #define _mm512_scalef_round_ps(A, B, R) \ 6455 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6456 (__v16sf)(__m512)(B), \ 6457 (__v16sf)_mm512_undefined_ps(), \ 6458 (__mmask16)-1, (int)(R)) 6459 6460 #define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ 6461 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6462 (__v16sf)(__m512)(B), \ 6463 (__v16sf)(__m512)(W), \ 6464 (__mmask16)(U), (int)(R)) 6465 6466 #define _mm512_maskz_scalef_round_ps(U, A, B, R) \ 6467 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6468 (__v16sf)(__m512)(B), \ 6469 (__v16sf)_mm512_setzero_ps(), \ 6470 (__mmask16)(U), (int)(R)) 6471 6472 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6473 _mm512_scalef_ps (__m512 __A, __m512 __B) 6474 { 6475 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6476 (__v16sf) __B, 6477 (__v16sf) 6478 _mm512_undefined_ps (), 6479 (__mmask16) -1, 6480 _MM_FROUND_CUR_DIRECTION); 6481 } 6482 6483 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6484 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 6485 { 6486 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6487 (__v16sf) __B, 6488 (__v16sf) __W, 6489 (__mmask16) __U, 6490 _MM_FROUND_CUR_DIRECTION); 6491 } 6492 6493 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6494 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) 6495 { 6496 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6497 (__v16sf) __B, 6498 (__v16sf) 6499 _mm512_setzero_ps (), 6500 (__mmask16) __U, 6501 _MM_FROUND_CUR_DIRECTION); 6502 } 6503 6504 #define _mm_scalef_round_sd(A, B, R) \ 6505 (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ 6506 (__v2df)(__m128d)(B), \ 6507 (__v2df)_mm_setzero_pd(), \ 6508 (__mmask8)-1, (int)(R)) 6509 6510 
/* Scalar scalef: low element = __A[0] * 2^floor(__B[0]); upper element of
 * __A passes through.  Mask/maskz variants follow the file-wide convention:
 * blend with __W / zero when the low mask bit of __U is clear.  */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_scalef_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
              (__mmask8) -1,
              _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)(__m128d)(W), \
                                              (__mmask8)(U), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                 (__v2df) _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)_mm_setzero_pd(), \
                                              (__mmask8)(U), (int)(R))

#define _mm_scalef_round_ss(A, B, R) \
  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)-1, (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_scalef_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
             (__mmask8) -1,
             _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)(__m128)(W), \
                                             (__mmask8)(U), (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)(U), \
                                             (int)(R))

/* Arithmetic right shifts by an immediate count __B (VPSRAD/VPSRAQ).  The
   count parameter is `unsigned int` so these can be real functions; the
   builtin handles counts >= element width per the instruction's semantics.  */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srai_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
                        unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srai_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srai_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srai_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

/* 128-bit-granular shuffles (VSHUFF32X4/VSHUFF64X2/VSHUFI32X4/VSHUFI64X2):
   `imm` selects whole 128-bit chunks from A (low half of result) and B
   (high half).  Macros because `imm` must be an immediate.  */
#define _mm512_shuffle_f32x4(A, B, imm) \
  (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(imm))

#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                      (__v16sf)(__m512)(W))

#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                      (__v16sf)_mm512_setzero_ps())

#define _mm512_shuffle_f64x2(A, B, imm) \
  (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(imm))

#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                       (__v8df)(__m512d)(W))

#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                       (__v8df)_mm512_setzero_pd())

#define _mm512_shuffle_i32x4(A, B, imm) \
  (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
                                     (__v16si)(__m512i)(B), (int)(imm))

#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                      (__v16si)(__m512i)(W))

#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                      (__v16si)_mm512_setzero_si512())

#define _mm512_shuffle_i64x2(A, B, imm) \
  (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
                                     (__v8di)(__m512i)(B), (int)(imm))

#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                      (__v8di)(__m512i)(W))

#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                      (__v8di)_mm512_setzero_si512())

/* Element-granular shuffles (VSHUFPD/VSHUFPS) with immediate selector M.  */
#define _mm512_shuffle_pd(A, B, M) \
  (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(M))

#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                       (__v8df)(__m512d)(W))

#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                       (__v8df)_mm512_setzero_pd())

#define _mm512_shuffle_ps(A, B, M) \
  (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(M))

#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                      (__v16sf)(__m512)(W))

#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                      (__v16sf)_mm512_setzero_ps())

/* Masked scalar square roots (VSQRTSD/VSQRTSS): sqrt of the low element of
   __B, upper elements of __A pass through.  Only the mask/maskz/round forms
   are defined here; the plain scalar sqrt lives in earlier SSE headers.  */
#define _mm_sqrt_round_sd(A, B, R) \
  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                 (__v2df) __W,
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
                 (__v2df) __B,
                 (__v2df) _mm_setzero_pd (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R))

#define _mm_sqrt_round_ss(A, B, R) \
  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R))

/* Sub-vector broadcasts, expressed as generic shufflevector so the middle
   end can reason about them: repeat the 128/256-bit source across all 512
   bits.  */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcast_f32x4(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                         0, 1, 2, 3, 0, 1, 2, 3,
                                         0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
                                             (__v16sf)_mm512_broadcast_f32x4(__A),
                                             (__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
                                             (__v16sf)_mm512_broadcast_f32x4(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcast_f64x4(__m256d __A)
{
  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
                                              (__v8df)_mm512_broadcast_f64x4(__A),
                                              (__v8df)__O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
                                              (__v8df)_mm512_broadcast_f64x4(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i32x4(__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_broadcast_i32x4(__A),
                                             (__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_broadcast_i32x4(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i64x4(__m256i __A)
{
  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_broadcast_i64x4(__A),
                                             (__v8di)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_broadcast_i64x4(__A),
                                             (__v8di)_mm512_setzero_si512());
}

/* Masked forms of the scalar broadcasts (the unmasked _mm512_broadcastsd_pd /
 * _mm512_broadcastss_ps are defined elsewhere in this header).  */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
                                              (__v8df) _mm512_broadcastsd_pd(__A),
                                              (__v8df) __O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
                                              (__v8df) _mm512_broadcastsd_pd(__A),
                                              (__v8df) _mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
                                             (__v16sf) _mm512_broadcastss_ps(__A),
                                             (__v16sf) __O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
                                             (__v16sf) _mm512_broadcastss_ps(__A),
                                             (__v16sf) _mm512_setzero_ps());
}

/* Signed-saturating narrowing conversions (VPMOVS*): these builtins take the
 * passthrough and mask directly.  The `storeu` forms write the narrowed
 * result straight to unaligned memory under mask.  */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) _mm_undefined_si128 (),
               (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) _mm256_undefined_si256 (),
               (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) _mm256_setzero_si256 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi8 (__m512i __A)
{
  /* Only 8 result bytes are defined; the destination is still a full
     __m128i, matching the VPMOVSQB register form.  */
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) _mm_undefined_si128 (),
               (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) _mm256_undefined_si256 (),
               (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) _mm256_setzero_si256 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) _mm_undefined_si128 (),
               (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

/* Unsigned-saturating narrowing conversions (VPMOVUS*).  */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
                (__v16qi) _mm_undefined_si128 (),
                (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
                (__v16qi) __O,
                __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
{
  /* NOTE(review): this definition continues beyond the end of this chunk.  */
  return
(__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7095 (__v16qi) _mm_setzero_si128 (), 7096 __M); 7097 } 7098 7099 static __inline__ void __DEFAULT_FN_ATTRS512 7100 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 7101 { 7102 __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 7103 } 7104 7105 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7106 _mm512_cvtusepi32_epi16 (__m512i __A) 7107 { 7108 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7109 (__v16hi) _mm256_undefined_si256 (), 7110 (__mmask16) -1); 7111 } 7112 7113 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7114 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 7115 { 7116 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7117 (__v16hi) __O, 7118 __M); 7119 } 7120 7121 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7122 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) 7123 { 7124 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7125 (__v16hi) _mm256_setzero_si256 (), 7126 __M); 7127 } 7128 7129 static __inline__ void __DEFAULT_FN_ATTRS512 7130 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) 7131 { 7132 __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); 7133 } 7134 7135 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7136 _mm512_cvtusepi64_epi8 (__m512i __A) 7137 { 7138 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7139 (__v16qi) _mm_undefined_si128 (), 7140 (__mmask8) -1); 7141 } 7142 7143 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7144 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7145 { 7146 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7147 (__v16qi) __O, 7148 __M); 7149 } 7150 7151 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7152 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) 7153 { 7154 return (__m128i) __builtin_ia32_pmovusqb512_mask 
((__v8di) __A, 7155 (__v16qi) _mm_setzero_si128 (), 7156 __M); 7157 } 7158 7159 static __inline__ void __DEFAULT_FN_ATTRS512 7160 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7161 { 7162 __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7163 } 7164 7165 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7166 _mm512_cvtusepi64_epi32 (__m512i __A) 7167 { 7168 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7169 (__v8si) _mm256_undefined_si256 (), 7170 (__mmask8) -1); 7171 } 7172 7173 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7174 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7175 { 7176 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7177 (__v8si) __O, __M); 7178 } 7179 7180 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7181 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) 7182 { 7183 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7184 (__v8si) _mm256_setzero_si256 (), 7185 __M); 7186 } 7187 7188 static __inline__ void __DEFAULT_FN_ATTRS512 7189 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) 7190 { 7191 __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); 7192 } 7193 7194 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7195 _mm512_cvtusepi64_epi16 (__m512i __A) 7196 { 7197 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7198 (__v8hi) _mm_undefined_si128 (), 7199 (__mmask8) -1); 7200 } 7201 7202 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7203 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7204 { 7205 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7206 (__v8hi) __O, __M); 7207 } 7208 7209 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7210 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) 7211 { 7212 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7213 (__v8hi) _mm_setzero_si128 (), 7214 __M); 7215 } 

/* Masked memory form of VPMOVUSQW: store the unsigned-saturated i16
   results of __A directly to unaligned memory at __P. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
}

/* VPMOVDB: 16 x i32 -> 16 x i8 by plain truncation (no saturation). */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

/* VPMOVDW: 16 x i32 -> 16 x i16 by truncation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_undefined_si256 (),
              (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_setzero_si256 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
}

/* VPMOVQB: 8 x i64 -> i8 by truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

/* VPMOVQD: 8 x i64 -> 8 x i32 by truncation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_undefined_si256 (),
              (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_setzero_si256 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

/* VPMOVQW: 8 x i64 -> 8 x i16 by truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

/* Extract the 128-bit lane selected by imm (0..3) from a 512-bit integer
   vector (VEXTRACTI32X4), with merge- and zero-masked variants.  These
   are macros because imm must be a compile-time constant. */
#define _mm512_extracti32x4_epi32(A, imm) \
  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                            (__v4si)_mm_undefined_si128(), \
                                            (__mmask8)-1)

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                            (__v4si)(__m128i)(W), \
                                            (__mmask8)(U))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                            (__v4si)_mm_setzero_si128(), \
                                            (__mmask8)(U))

/* Extract the 256-bit half selected by imm (0..1) (VEXTRACTI64X4). */
#define _mm512_extracti64x4_epi64(A, imm) \
  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                            (__v4di)_mm256_undefined_si256(), \
                                            (__mmask8)-1)

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                            (__v4di)(__m256i)(W), \
                                            (__mmask8)(U))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                            (__v4di)_mm256_setzero_si256(), \
                                            (__mmask8)(U))

/* Insert a 256-bit/128-bit vector B into A at the position selected by
   imm (VINSERTF64X4 / VINSERTI64X4 / VINSERTF32X4 / VINSERTI32X4).  The
   masked forms blend the insert result with W (or zero) per element. */
#define _mm512_insertf64x4(A, B, imm) \
  (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
                                      (__v4df)(__m256d)(B), (int)(imm))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                 (__v8df)(__m512d)(W))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                 (__v8df)_mm512_setzero_pd())

#define _mm512_inserti64x4(A, B, imm) \
  (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
                                      (__v4di)(__m256i)(B), (int)(imm))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                 (__v8di)(__m512i)(W))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                 (__v8di)_mm512_setzero_si512())

#define _mm512_insertf32x4(A, B, imm) \
  (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
                                     (__v4sf)(__m128)(B), (int)(imm))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                 (__v16sf)(__m512)(W))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                 (__v16sf)_mm512_setzero_ps())

#define _mm512_inserti32x4(A, B, imm) \
  (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
                                      (__v4si)(__m128i)(B), (int)(imm))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                 (__v16si)(__m512i)(W))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                 (__v16si)_mm512_setzero_si512())

/* VGETMANTPD/PS: extract the mantissa of each element.  B selects the
   normalization interval and C the sign control; the hardware immediate
   packs them as (C << 2) | B.  The _round_ forms take an explicit
   rounding/SAE mode R, the others use the current direction. */
#define _mm512_getmant_round_pd(A, B, C, R) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R))

#define _mm512_getmant_pd(A, B, C) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)-1, \
                                            _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), \
                                            _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), \
                                            _MM_FROUND_CUR_DIRECTION)

#define _mm512_getmant_round_ps(A, B, C, R) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2) | (B)), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2) | (B)), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2) | (B)), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R))

#define _mm512_getmant_ps(A, B, C) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2)|(B)), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, \
                                           _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2)|(B)), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), \
                                           _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                           (int)(((C)<<2)|(B)), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), \
                                           _MM_FROUND_CUR_DIRECTION)

/* VGETEXPPD: extract the (biased-removed) exponent of each double as a
   double.  _round_ forms take an explicit SAE mode R. */
#define _mm512_getexp_round_pd(A, R) \
  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)(__m512d)(W), \
                                           (__mmask8)(U), (int)(R))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(U), (int)(R))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_getexp_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
              (__v8df) _mm512_undefined_pd (),
              (__mmask8) -1,
              _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
              (__v8df) __W,
              (__mmask8) __U,
              _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
              (__v8df) _mm512_setzero_pd (),
              (__mmask8) __U,
              _MM_FROUND_CUR_DIRECTION);
}

/* VGETEXPPS: single-precision counterpart of the above. */
#define _mm512_getexp_round_ps(A, R) \
  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, (int)(R))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)(__m512)(W), \
                                          (__mmask16)(U), (int)(R))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(U), (int)(R))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_getexp_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
             (__v16sf) _mm512_undefined_ps (),
             (__mmask16) -1,
             _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
             (__v16sf) __W,
             (__mmask16) __U,
             _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
             (__v16sf) _mm512_setzero_ps (),
             (__mmask16) __U,
             _MM_FROUND_CUR_DIRECTION);
}

/* Gathers with 64-bit indices.  Despite the "16sf"/"16si" builtin names,
   the 64-bit-index forms load only 8 floats/i32s (one per qword index);
   scale must be a compile-time 1, 2, 4 or 8. */
#define _mm512_i64gather_ps(index, addr, scale) \
  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
                                       (int)(scale))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), \
                                       (__mmask8)(mask), (int)(scale))

#define _mm512_i64gather_epi32(index, addr, scale) \
  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)-1, (int)(scale))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale))

#define _mm512_i64gather_pd(index, addr, scale) \
  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
                                       (int)(scale))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), \
                                       (__mmask8)(mask), (int)(scale))

#define _mm512_i64gather_epi64(index, addr, scale) \
  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
                                       (int)(scale))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                       (void const *)(addr), \
                                       (__v8di)(__m512i)(index), \
                                       (__mmask8)(mask), (int)(scale))

/* Gathers with 32-bit indices (VGATHERDPS etc.).
   NOTE(review): the index operand of the *_i32gather_ps forms is cast
   through (__m512) rather than (__m512i); this is a same-size vector
   bitcast so behavior is unchanged — confirm against upstream, which
   uses (__m512i) here. */
#define _mm512_i32gather_ps(index, addr, scale) \
  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                       (void const *)(addr), \
                                       (__v16si)(__m512)(index), \
                                       (__mmask16)-1, (int)(scale))

#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                       (void const *)(addr), \
                                       (__v16si)(__m512)(index), \
                                       (__mmask16)(mask), (int)(scale))

#define _mm512_i32gather_epi32(index, addr, scale) \
  (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)-1, (int)(scale))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)(mask), (int)(scale))

/* 8 doubles / 8 qwords gathered through 8 dword indices in a __m256i. */
#define _mm512_i32gather_pd(index, addr, scale) \
  (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                       (void const *)(addr), \
                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
                                       (int)(scale))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                       (void const *)(addr), \
                                       (__v8si)(__m256i)(index), \
                                       (__mmask8)(mask), (int)(scale))

#define _mm512_i32gather_epi64(index, addr, scale) \
  (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                       (void const *)(addr), \
                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
                                       (int)(scale))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                       (void const *)(addr), \
                                       (__v8si)(__m256i)(index), \
                                       (__mmask8)(mask), (int)(scale))

/* Scatters with 64-bit indices: store each element of v1 to
   addr + index[i]*scale; masked forms store only elements whose mask
   bit is set.  As with the gathers, the div16sf/div16si builtins move
   8 elements. */
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

/* Scatters with 32-bit indices (VSCATTERDPS etc.). */
#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

/* Masked 32-bit-index scatters: only elements with a set mask bit are
   stored to addr + index[i]*scale. */
#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

/* 8 doubles / 8 qwords scattered through 8 dword indices in a __m256i. */
#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

/* Masked scalar single-precision FMA (VFMADD*SS): lane 0 of the result
   is __W*__A + __B when bit 0 of __U is set, otherwise lane 0 of __W;
   upper lanes come from __W.  The fmsub/fnmadd/fnmsub variants below are
   all expressed by negating operands of the fmadd builtins. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* _round_ macro forms take an explicit embedded-rounding mode R. */
#define _mm_fmadd_round_ss(A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(C), (__mmask8)-1, \
                                        (int)(R))

#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                        (__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
                                        (int)(R))

/* Zero-masking: lane 0 is zeroed when bit 0 of __U is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
                                         (int)(R))

/* mask3: the third operand (__Y) is also the passthrough source. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(X), \
                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                         (int)(R))

/* fmsub = fmadd with the addend negated: __W*__A - __B. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_ss(A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), \
                                        -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                        (int)(R))

#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                        (__v4sf)(__m128)(A), \
                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                        (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                         (int)(R))

/* mask3 fmsub uses the dedicated vfmsub builtin because the passthrough
   operand __Y is also the subtrahend and so cannot simply be negated. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(X), \
                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                         (int)(R))

/* fnmadd = fmadd with one multiplicand negated: -(__W*__A) + __B. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_ss(A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                        -(__v4sf)(__m128)(B), \
                                        (__v4sf)(__m128)(C), (__mmask8)-1, \
                                        (int)(R))

#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                        -(__v4sf)(__m128)(A), \
                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
                                        (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
                                         (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(X), \
                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                         (int)(R))

/* fnmsub = fmadd with both the multiplicand and the addend negated:
   -(__W*__A) - __B. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_ss(A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                        -(__v4sf)(__m128)(B), \
                                        -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                        (int)(R))

#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                        -(__v4sf)(__m128)(A), \
                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                        (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                         (int)(R))

/* mask3 fnmsub: vfmsub(W, -X, Y) = W*(-X) - Y = -(W*X) - Y, keeping __Y
   as the passthrough operand. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(X), \
                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                         (int)(R))

/* Masked scalar double-precision FMA (VFMADD*SD); semantics mirror the
   single-precision versions above. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sd(A, B, C, R) \
  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), \
                                         (__v2df)(__m128d)(C), (__mmask8)-1, \
                                         (int)(R))

#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                         (__v2df)(__m128d)(A), \
                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
                                         (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
                                          (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
(__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8058 (__v2df)(__m128d)(X), \ 8059 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8060 (int)(R)) 8061 8062 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8063 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8064 { 8065 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8066 (__v2df)__A, 8067 -(__v2df)__B, 8068 (__mmask8)__U, 8069 _MM_FROUND_CUR_DIRECTION); 8070 } 8071 8072 #define _mm_fmsub_round_sd(A, B, C, R) \ 8073 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8074 (__v2df)(__m128d)(B), \ 8075 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8076 (int)(R)) 8077 8078 #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ 8079 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8080 (__v2df)(__m128d)(A), \ 8081 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8082 (int)(R)) 8083 8084 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8085 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8086 { 8087 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8088 (__v2df)__B, 8089 -(__v2df)__C, 8090 (__mmask8)__U, 8091 _MM_FROUND_CUR_DIRECTION); 8092 } 8093 8094 #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ 8095 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8096 (__v2df)(__m128d)(B), \ 8097 -(__v2df)(__m128d)(C), \ 8098 (__mmask8)(U), (int)(R)) 8099 8100 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8101 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8102 { 8103 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8104 (__v2df)__X, 8105 (__v2df)__Y, 8106 (__mmask8)__U, 8107 _MM_FROUND_CUR_DIRECTION); 8108 } 8109 8110 #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ 8111 (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8112 (__v2df)(__m128d)(X), \ 8113 (__v2df)(__m128d)(Y), \ 8114 (__mmask8)(U), (int)(R)) 8115 8116 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8117 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 
__U, __m128d __A, __m128d __B) 8118 { 8119 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8120 -(__v2df)__A, 8121 (__v2df)__B, 8122 (__mmask8)__U, 8123 _MM_FROUND_CUR_DIRECTION); 8124 } 8125 8126 #define _mm_fnmadd_round_sd(A, B, C, R) \ 8127 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8128 -(__v2df)(__m128d)(B), \ 8129 (__v2df)(__m128d)(C), (__mmask8)-1, \ 8130 (int)(R)) 8131 8132 #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ 8133 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8134 -(__v2df)(__m128d)(A), \ 8135 (__v2df)(__m128d)(B), (__mmask8)(U), \ 8136 (int)(R)) 8137 8138 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8139 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8140 { 8141 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8142 -(__v2df)__B, 8143 (__v2df)__C, 8144 (__mmask8)__U, 8145 _MM_FROUND_CUR_DIRECTION); 8146 } 8147 8148 #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ 8149 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8150 -(__v2df)(__m128d)(B), \ 8151 (__v2df)(__m128d)(C), (__mmask8)(U), \ 8152 (int)(R)) 8153 8154 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8155 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8156 { 8157 return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, 8158 -(__v2df)__X, 8159 (__v2df)__Y, 8160 (__mmask8)__U, 8161 _MM_FROUND_CUR_DIRECTION); 8162 } 8163 8164 #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ 8165 (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8166 -(__v2df)(__m128d)(X), \ 8167 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8168 (int)(R)) 8169 8170 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8171 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8172 { 8173 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8174 -(__v2df)__A, 8175 -(__v2df)__B, 8176 (__mmask8)__U, 8177 _MM_FROUND_CUR_DIRECTION); 8178 } 8179 8180 #define _mm_fnmsub_round_sd(A, B, C, R) \ 8181 
(__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8182 -(__v2df)(__m128d)(B), \ 8183 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8184 (int)(R)) 8185 8186 #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ 8187 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8188 -(__v2df)(__m128d)(A), \ 8189 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8190 (int)(R)) 8191 8192 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8193 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8194 { 8195 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8196 -(__v2df)__B, 8197 -(__v2df)__C, 8198 (__mmask8)__U, 8199 _MM_FROUND_CUR_DIRECTION); 8200 } 8201 8202 #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ 8203 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8204 -(__v2df)(__m128d)(B), \ 8205 -(__v2df)(__m128d)(C), \ 8206 (__mmask8)(U), \ 8207 (int)(R)) 8208 8209 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8210 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8211 { 8212 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8213 -(__v2df)__X, 8214 (__v2df)__Y, 8215 (__mmask8)__U, 8216 _MM_FROUND_CUR_DIRECTION); 8217 } 8218 8219 #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ 8220 (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8221 -(__v2df)(__m128d)(X), \ 8222 (__v2df)(__m128d)(Y), \ 8223 (__mmask8)(U), (int)(R)) 8224 8225 #define _mm512_permutex_pd(X, C) \ 8226 (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)) 8227 8228 #define _mm512_mask_permutex_pd(W, U, X, C) \ 8229 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8230 (__v8df)_mm512_permutex_pd((X), (C)), \ 8231 (__v8df)(__m512d)(W)) 8232 8233 #define _mm512_maskz_permutex_pd(U, X, C) \ 8234 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8235 (__v8df)_mm512_permutex_pd((X), (C)), \ 8236 (__v8df)_mm512_setzero_pd()) 8237 8238 #define _mm512_permutex_epi64(X, C) \ 8239 
(__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)) 8240 8241 #define _mm512_mask_permutex_epi64(W, U, X, C) \ 8242 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8243 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8244 (__v8di)(__m512i)(W)) 8245 8246 #define _mm512_maskz_permutex_epi64(U, X, C) \ 8247 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8248 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8249 (__v8di)_mm512_setzero_si512()) 8250 8251 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8252 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) 8253 { 8254 return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); 8255 } 8256 8257 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8258 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) 8259 { 8260 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8261 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8262 (__v8df)__W); 8263 } 8264 8265 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8266 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) 8267 { 8268 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8269 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8270 (__v8df)_mm512_setzero_pd()); 8271 } 8272 8273 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8274 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) 8275 { 8276 return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); 8277 } 8278 8279 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8280 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) 8281 { 8282 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8283 (__v8di)_mm512_permutexvar_epi64(__X, __Y), 8284 (__v8di)_mm512_setzero_si512()); 8285 } 8286 8287 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8288 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, 8289 __m512i __Y) 8290 { 8291 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8292 
(__v8di)_mm512_permutexvar_epi64(__X, __Y), 8293 (__v8di)__W); 8294 } 8295 8296 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8297 _mm512_permutexvar_ps (__m512i __X, __m512 __Y) 8298 { 8299 return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); 8300 } 8301 8302 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8303 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) 8304 { 8305 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8306 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8307 (__v16sf)__W); 8308 } 8309 8310 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8311 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) 8312 { 8313 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8314 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8315 (__v16sf)_mm512_setzero_ps()); 8316 } 8317 8318 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8319 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) 8320 { 8321 return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); 8322 } 8323 8324 #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 8325 8326 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8327 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) 8328 { 8329 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8330 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8331 (__v16si)_mm512_setzero_si512()); 8332 } 8333 8334 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8335 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, 8336 __m512i __Y) 8337 { 8338 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8339 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8340 (__v16si)__W); 8341 } 8342 8343 #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 8344 8345 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8346 _mm512_kand (__mmask16 __A, __mmask16 __B) 8347 { 8348 return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); 
8349 } 8350 8351 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8352 _mm512_kandn (__mmask16 __A, __mmask16 __B) 8353 { 8354 return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); 8355 } 8356 8357 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8358 _mm512_kor (__mmask16 __A, __mmask16 __B) 8359 { 8360 return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); 8361 } 8362 8363 static __inline__ int __DEFAULT_FN_ATTRS 8364 _mm512_kortestc (__mmask16 __A, __mmask16 __B) 8365 { 8366 return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); 8367 } 8368 8369 static __inline__ int __DEFAULT_FN_ATTRS 8370 _mm512_kortestz (__mmask16 __A, __mmask16 __B) 8371 { 8372 return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); 8373 } 8374 8375 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8376 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) 8377 { 8378 return (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8379 } 8380 8381 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8382 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) 8383 { 8384 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8385 } 8386 8387 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8388 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { 8389 *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8390 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8391 } 8392 8393 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8394 _mm512_kunpackb (__mmask16 __A, __mmask16 __B) 8395 { 8396 return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); 8397 } 8398 8399 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8400 _mm512_kxnor (__mmask16 __A, __mmask16 __B) 8401 { 8402 return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); 8403 } 8404 8405 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8406 _mm512_kxor (__mmask16 __A, __mmask16 __B) 8407 { 8408 return (__mmask16) 
__builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); 8409 } 8410 8411 #define _kand_mask16 _mm512_kand 8412 #define _kandn_mask16 _mm512_kandn 8413 #define _knot_mask16 _mm512_knot 8414 #define _kor_mask16 _mm512_kor 8415 #define _kxnor_mask16 _mm512_kxnor 8416 #define _kxor_mask16 _mm512_kxor 8417 8418 #define _kshiftli_mask16(A, I) \ 8419 (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)) 8420 8421 #define _kshiftri_mask16(A, I) \ 8422 (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)) 8423 8424 static __inline__ unsigned int __DEFAULT_FN_ATTRS 8425 _cvtmask16_u32(__mmask16 __A) { 8426 return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); 8427 } 8428 8429 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8430 _cvtu32_mask16(unsigned int __A) { 8431 return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); 8432 } 8433 8434 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8435 _load_mask16(__mmask16 *__A) { 8436 return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); 8437 } 8438 8439 static __inline__ void __DEFAULT_FN_ATTRS 8440 _store_mask16(__mmask16 *__A, __mmask16 __B) { 8441 *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); 8442 } 8443 8444 static __inline__ void __DEFAULT_FN_ATTRS512 8445 _mm512_stream_si512 (void * __P, __m512i __A) 8446 { 8447 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8448 __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); 8449 } 8450 8451 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8452 _mm512_stream_load_si512 (void const *__P) 8453 { 8454 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8455 return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); 8456 } 8457 8458 static __inline__ void __DEFAULT_FN_ATTRS512 8459 _mm512_stream_pd (void *__P, __m512d __A) 8460 { 8461 typedef __v8df __v8df_aligned __attribute__((aligned(64))); 8462 __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); 8463 } 8464 
8465 static __inline__ void __DEFAULT_FN_ATTRS512 8466 _mm512_stream_ps (void *__P, __m512 __A) 8467 { 8468 typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); 8469 __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); 8470 } 8471 8472 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8473 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) 8474 { 8475 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8476 (__v8df) __W, 8477 (__mmask8) __U); 8478 } 8479 8480 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8481 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) 8482 { 8483 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8484 (__v8df) 8485 _mm512_setzero_pd (), 8486 (__mmask8) __U); 8487 } 8488 8489 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8490 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8491 { 8492 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8493 (__v8di) __W, 8494 (__mmask8) __U); 8495 } 8496 8497 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8498 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) 8499 { 8500 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8501 (__v8di) 8502 _mm512_setzero_si512 (), 8503 (__mmask8) __U); 8504 } 8505 8506 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8507 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) 8508 { 8509 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8510 (__v16sf) __W, 8511 (__mmask16) __U); 8512 } 8513 8514 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8515 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) 8516 { 8517 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8518 (__v16sf) 8519 _mm512_setzero_ps (), 8520 (__mmask16) __U); 8521 } 8522 8523 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8524 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 8525 { 8526 return (__m512i) 
__builtin_ia32_compresssi512_mask ((__v16si) __A, 8527 (__v16si) __W, 8528 (__mmask16) __U); 8529 } 8530 8531 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8532 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) 8533 { 8534 return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, 8535 (__v16si) 8536 _mm512_setzero_si512 (), 8537 (__mmask16) __U); 8538 } 8539 8540 #define _mm_cmp_round_ss_mask(X, Y, P, R) \ 8541 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8542 (__v4sf)(__m128)(Y), (int)(P), \ 8543 (__mmask8)-1, (int)(R)) 8544 8545 #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ 8546 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8547 (__v4sf)(__m128)(Y), (int)(P), \ 8548 (__mmask8)(M), (int)(R)) 8549 8550 #define _mm_cmp_ss_mask(X, Y, P) \ 8551 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8552 (__v4sf)(__m128)(Y), (int)(P), \ 8553 (__mmask8)-1, \ 8554 _MM_FROUND_CUR_DIRECTION) 8555 8556 #define _mm_mask_cmp_ss_mask(M, X, Y, P) \ 8557 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8558 (__v4sf)(__m128)(Y), (int)(P), \ 8559 (__mmask8)(M), \ 8560 _MM_FROUND_CUR_DIRECTION) 8561 8562 #define _mm_cmp_round_sd_mask(X, Y, P, R) \ 8563 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8564 (__v2df)(__m128d)(Y), (int)(P), \ 8565 (__mmask8)-1, (int)(R)) 8566 8567 #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ 8568 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8569 (__v2df)(__m128d)(Y), (int)(P), \ 8570 (__mmask8)(M), (int)(R)) 8571 8572 #define _mm_cmp_sd_mask(X, Y, P) \ 8573 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8574 (__v2df)(__m128d)(Y), (int)(P), \ 8575 (__mmask8)-1, \ 8576 _MM_FROUND_CUR_DIRECTION) 8577 8578 #define _mm_mask_cmp_sd_mask(M, X, Y, P) \ 8579 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8580 (__v2df)(__m128d)(Y), (int)(P), \ 8581 (__mmask8)(M), \ 8582 _MM_FROUND_CUR_DIRECTION) 8583 8584 /* Bit Test */ 8585 8586 static 
__inline __mmask16 __DEFAULT_FN_ATTRS512 8587 _mm512_test_epi32_mask (__m512i __A, __m512i __B) 8588 { 8589 return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), 8590 _mm512_setzero_si512()); 8591 } 8592 8593 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8594 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8595 { 8596 return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8597 _mm512_setzero_si512()); 8598 } 8599 8600 static __inline __mmask8 __DEFAULT_FN_ATTRS512 8601 _mm512_test_epi64_mask (__m512i __A, __m512i __B) 8602 { 8603 return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), 8604 _mm512_setzero_si512()); 8605 } 8606 8607 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8608 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) 8609 { 8610 return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8611 _mm512_setzero_si512()); 8612 } 8613 8614 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8615 _mm512_testn_epi32_mask (__m512i __A, __m512i __B) 8616 { 8617 return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), 8618 _mm512_setzero_si512()); 8619 } 8620 8621 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8622 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8623 { 8624 return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8625 _mm512_setzero_si512()); 8626 } 8627 8628 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8629 _mm512_testn_epi64_mask (__m512i __A, __m512i __B) 8630 { 8631 return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), 8632 _mm512_setzero_si512()); 8633 } 8634 8635 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8636 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) 8637 { 8638 return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8639 _mm512_setzero_si512()); 8640 } 8641 8642 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8643 _mm512_movehdup_ps (__m512 __A) 8644 
{ 8645 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8646 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); 8647 } 8648 8649 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8650 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8651 { 8652 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8653 (__v16sf)_mm512_movehdup_ps(__A), 8654 (__v16sf)__W); 8655 } 8656 8657 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8658 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) 8659 { 8660 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8661 (__v16sf)_mm512_movehdup_ps(__A), 8662 (__v16sf)_mm512_setzero_ps()); 8663 } 8664 8665 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8666 _mm512_moveldup_ps (__m512 __A) 8667 { 8668 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8669 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); 8670 } 8671 8672 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8673 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8674 { 8675 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8676 (__v16sf)_mm512_moveldup_ps(__A), 8677 (__v16sf)__W); 8678 } 8679 8680 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8681 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) 8682 { 8683 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8684 (__v16sf)_mm512_moveldup_ps(__A), 8685 (__v16sf)_mm512_setzero_ps()); 8686 } 8687 8688 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8689 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 8690 { 8691 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); 8692 } 8693 8694 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8695 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) 8696 { 8697 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), 8698 _mm_setzero_ps()); 8699 } 8700 8701 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8702 _mm_mask_move_sd (__m128d __W, __mmask8 __U, 
__m128d __A, __m128d __B) 8703 { 8704 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); 8705 } 8706 8707 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8708 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) 8709 { 8710 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), 8711 _mm_setzero_pd()); 8712 } 8713 8714 static __inline__ void __DEFAULT_FN_ATTRS128 8715 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) 8716 { 8717 __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); 8718 } 8719 8720 static __inline__ void __DEFAULT_FN_ATTRS128 8721 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) 8722 { 8723 __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); 8724 } 8725 8726 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8727 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) 8728 { 8729 __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, 8730 (__v4sf)_mm_setzero_ps(), 8731 0, 4, 4, 4); 8732 8733 return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1); 8734 } 8735 8736 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8737 _mm_maskz_load_ss (__mmask8 __U, const float* __A) 8738 { 8739 return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A, 8740 (__v4sf) _mm_setzero_ps(), 8741 __U & 1); 8742 } 8743 8744 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8745 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) 8746 { 8747 __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, 8748 (__v2df)_mm_setzero_pd(), 8749 0, 2); 8750 8751 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1); 8752 } 8753 8754 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8755 _mm_maskz_load_sd (__mmask8 __U, const double* __A) 8756 { 8757 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, 8758 (__v2df) _mm_setzero_pd(), 8759 __U & 1); 8760 } 8761 8762 #define _mm512_shuffle_epi32(A, I) \ 8763 
(__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)) 8764 8765 #define _mm512_mask_shuffle_epi32(W, U, A, I) \ 8766 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8767 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8768 (__v16si)(__m512i)(W)) 8769 8770 #define _mm512_maskz_shuffle_epi32(U, A, I) \ 8771 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8772 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8773 (__v16si)_mm512_setzero_si512()) 8774 8775 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8776 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) 8777 { 8778 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8779 (__v8df) __W, 8780 (__mmask8) __U); 8781 } 8782 8783 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8784 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) 8785 { 8786 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8787 (__v8df) _mm512_setzero_pd (), 8788 (__mmask8) __U); 8789 } 8790 8791 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8792 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8793 { 8794 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8795 (__v8di) __W, 8796 (__mmask8) __U); 8797 } 8798 8799 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8800 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) 8801 { 8802 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8803 (__v8di) _mm512_setzero_si512 (), 8804 (__mmask8) __U); 8805 } 8806 8807 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8808 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) 8809 { 8810 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8811 (__v8df) __W, 8812 (__mmask8) __U); 8813 } 8814 8815 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8816 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) 8817 { 8818 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8819 (__v8df) _mm512_setzero_pd(), 8820 
                                                     (__mmask8) __U);
}

/* Expand-load intrinsics: read consecutive elements from the (possibly
 * unaligned) address __P and expand them into the destination lanes selected
 * by mask __U.  "mask" forms take unselected lanes from __W; "maskz" forms
 * zero them. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
                                                        (__v8di) __W,
                                                        (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
                                                        (__v8di) _mm512_setzero_si512(),
                                                        (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                                                       (__v16sf) __W,
                                                       (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                                                       (__v16sf) _mm512_setzero_ps(),
                                                       (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
                                                        (__v16si) __W,
                                                        (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
                                                        (__v16si) _mm512_setzero_si512(),
                                                        (__mmask16) __U);
}

/* Register-to-register expand: distribute the low elements of __A into the
 * destination lanes selected by __U (VEXPANDPS / VPEXPANDD). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
                                                   (__v16sf) __W,
                                                   (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
                                                   (__v16sf) _mm512_setzero_ps(),
                                                   (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                                                    (__v16si) __W,
                                                    (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                                                    (__v16si) _mm512_setzero_si512(),
                                                    (__mmask16) __U);
}

/* Convert 8 packed single-precision elements to double precision with an
 * explicit rounding mode R (one of the _MM_FROUND_* values). */
#define _mm512_cvt_roundps_pd(A, R) \
  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R))

#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                           (__v8df)(__m512d)(W), \
                                           (__mmask8)(U), (int)(R))

#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(U), (int)(R))

/* Default-rounding variant: expressed as a generic vector conversion so the
 * optimizer can reason about it; masked forms wrap it in a lane select. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtps_pd (__m256 __A)
{
  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

/* Convert the low 8 floats of a 512-bit vector to 8 doubles. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtpslo_pd (__m512 __A)
{
  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
{
  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
}

/* Masked lane moves: per-lane select of __A where the mask bit is set,
 * otherwise __W ("mask") or zero ("maskz"). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) __W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) _mm512_setzero_pd ());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) __W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) _mm512_setzero_ps ());
}

/* Compress-store intrinsics: pack the lanes of __A selected by __U and
 * store them contiguously to (possibly unaligned) memory at __P. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
                                          (__mmask16) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
                                          (__mmask16) __U);
}

/* Scalar double -> float conversion in the low element with explicit
 * rounding mode R; the upper elements of the result come from A. */
#define _mm_cvt_roundsd_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v4sf)_mm_undefined_ps(), \
                                             (__mmask8)-1, (int)(R))

#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v4sf)(__m128)(W), \
                                             (__mmask8)(U), (int)(R))

#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)(U), (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)__W,
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)_mm_setzero_ps(),
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Alternate (ICC-compatible) names for the scalar conversion intrinsics. */
#define _mm_cvtss_i32 _mm_cvtss_si32
#define _mm_cvtsd_i32 _mm_cvtsd_si32
#define _mm_cvti32_sd _mm_cvtsi32_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
#endif

/* Signed integer -> scalar float/double with explicit rounding mode.
 * The 64-bit forms require x86-64. */
#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) \
  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                     (int)(R))

#define _mm_cvt_roundsi64_sd(A, B, R) \
  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                     (int)(R))
#endif

#define _mm_cvt_roundsi32_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))

#define _mm_cvt_roundi32_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))

#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                    (int)(R))

#define _mm_cvt_roundi64_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                    (int)(R))
#endif

/* Scalar float -> double conversion in the low element with explicit
 * rounding mode R; upper element of the result comes from A. */
#define _mm_cvt_roundss_sd(A, B, R) \
  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v2df)_mm_undefined_pd(), \
                                              (__mmask8)-1, (int)(R))

#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v2df)(__m128d)(W), \
                                              (__mmask8)(U), (int)(R))

#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v2df)_mm_setzero_pd(), \
                                              (__mmask8)(U), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)__W,
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)_mm_setzero_pd(),
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Unsigned integer -> scalar double/float: the low-element store is written
 * directly on the vector; clang lowers it to VCVTUSI2SD/VCVTUSI2SS. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu32_sd (__m128d __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) \
  (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
                                      (unsigned long long)(B), (int)(R))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

#define _mm_cvt_roundu32_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
                                     (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu32_ss (__m128 __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) \
  (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
                                     (unsigned long long)(B), (int)(R))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

/* Masked broadcast of a scalar into the lanes selected by __M; unselected
 * lanes come from __O. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_selectd_512(__M,
                                              (__v16si) _mm512_set1_epi32(__A),
                                              (__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
  return (__m512i) __builtin_ia32_selectq_512(__M,
                                              (__v8di) _mm512_set1_epi64(__A),
                                              (__v8di) __O);
}

/* Build a 512-bit vector from 64 bytes.  Arguments are given most-significant
 * element first (__e63) but the vector literal is written in memory order,
 * hence the reversal in the body. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
    char __e4, char __e3, char __e2, char __e1, char __e0) {

  return __extension__ (__m512i)(__v64qi)
   {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
    __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
    __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
    __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
    __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
    __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
    __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
    __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
}

/* Build a 512-bit vector from 32 16-bit elements, most significant first. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
    short __e27, short __e26, short __e25, short __e24, short __e23,
    short __e22, short __e21, short __e20, short __e19, short __e18,
    short __e17, short __e16, short __e15, short __e14, short __e13,
    short __e12, short __e11, short __e10, short __e9, short __e8,
    short __e7, short __e6, short __e5, short __e4, short __e3,
    short __e2, short __e1, short __e0) {
  return __extension__ (__m512i)(__v32hi)
   {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
    __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
    __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
    __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
}
/* Build vectors from individual elements.  _mm512_set_* takes the most
 * significant element first; the _mm512_setr_* macros take elements in
 * memory (little-endian) order and simply reverse into the set form. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H,
                  int __I, int __J, int __K, int __L,
                  int __M, int __N, int __O, int __P)
{
  return __extension__ (__m512i)(__v16si)
  { __P, __O, __N, __M, __L, __K, __J, __I,
    __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,                         \
                          e8,e9,e10,e11,e12,e13,e14,e15)                   \
  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
                   (e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi64 (long long __A, long long __B, long long __C,
                  long long __D, long long __E, long long __F,
                  long long __G, long long __H)
{
  return __extension__ (__m512i) (__v8di)
  { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_set_pd (double __A, double __B, double __C, double __D,
               double __E, double __F, double __G, double __H)
{
  return __extension__ (__m512d)
  { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H,
               float __I, float __J, float __K, float __L,
               float __M, float __N, float __O, float __P)
{
  return __extension__ (__m512)
  { __P, __O, __N, __M, __L, __K, __J, __I,
    __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
                (e4),(e3),(e2),(e1),(e0))

/* Absolute value of packed floats/doubles: clear the sign bit with a
 * bitwise AND against 0x7FFF... reinterpreted as integers. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_abs_ps(__m512 __A)
{
  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_abs_pd(__m512d __A)
{
  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
{
  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
}

/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars
 * as outputs.  This class of vector operation forms the basis of many
 * scientific computations.  In vector-reduction arithmetic, the evaluation
 * is independent of the order of the input elements of V.

 * Used bisection method. At each step, we partition the vector with previous
 * step in half, and the operation is performed on its two halves.
 * This takes log2(n) steps where n is the number of elements in the vector.
 */

/* 64-bit integer add/mul/and/or reduction; unsigned element vectors are used
 * so that the repeated halving never performs signed overflow (UB). */
#define _mm512_mask_reduce_operator(op) \
  __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
  __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
  __m256i __t3 = (__m256i)(__t1 op __t2); \
  __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
  __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
  __v2du __t6 = __t4 op __t5; \
  __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
  __v2du __t8 = __t6 op __t7; \
  return __t8[0]

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
  _mm512_mask_reduce_operator(+);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
  _mm512_mask_reduce_operator(*);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
  _mm512_mask_reduce_operator(&);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
  _mm512_mask_reduce_operator(|);
}

/* Masked forms first force unselected lanes to the operation's identity
 * (0 for + and |, 1 for *, all-ones for &) so they cannot affect the result. */
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  _mm512_mask_reduce_operator(+);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
  _mm512_mask_reduce_operator(*);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
  _mm512_mask_reduce_operator(&);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  _mm512_mask_reduce_operator(|);
}
#undef _mm512_mask_reduce_operator

/* double add/mul reduction via the same bisection scheme. */
#define _mm512_mask_reduce_operator(op) \
  __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
  __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
  __m256d __t3 = __t1 op __t2; \
  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
  __m128d __t6 = __t4 op __t5; \
  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
  __m128d __t8 = __t6 op __t7; \
  return __t8[0]

static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
  _mm512_mask_reduce_operator(+);
}

static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
  _mm512_mask_reduce_operator(*);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_maskz_mov_pd(__M, __W);
  _mm512_mask_reduce_operator(+);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
  _mm512_mask_reduce_operator(*);
}
#undef _mm512_mask_reduce_operator

/* 32-bit integer add/mul/and/or reduction (unsigned lanes, as above). */
#define _mm512_mask_reduce_operator(op) \
  __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
  __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
  __m256i __t3 = (__m256i)(__t1 op __t2); \
  __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
  __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
  __v4su __t6 = __t4 op __t5; \
  __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
  __v4su __t8 = __t6 op __t7; \
  __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
  __v4su __t10 = __t8 op __t9; \
  return __t10[0]

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
  _mm512_mask_reduce_operator(+);
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_epi32(__m512i __W) {
  _mm512_mask_reduce_operator(*);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_and_epi32(__m512i __W) {
  _mm512_mask_reduce_operator(&);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_or_epi32(__m512i __W) {
  _mm512_mask_reduce_operator(|);
}

/* Masked forms force unselected lanes to the operation's identity value
 * before reducing (0 for + and |, 1 for *, all-ones for &). */
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  _mm512_mask_reduce_operator(+);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
  _mm512_mask_reduce_operator(*);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
  _mm512_mask_reduce_operator(&);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  _mm512_mask_reduce_operator(|);
}
#undef _mm512_mask_reduce_operator

/* float add/mul reduction; extractf64x4 is used only to split the register,
 * the halves are reinterpreted back to packed floats. */
#define _mm512_mask_reduce_operator(op) \
  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
  __m256 __t3 = __t1 op __t2; \
  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
  __m128 __t6 = __t4 op __t5; \
  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
  __m128 __t8 = __t6 op __t7; \
  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
  __m128 __t10 = __t8 op __t9; \
  return __t10[0]

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ps(__m512 __W) {
  _mm512_mask_reduce_operator(+);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ps(__m512 __W) {
  _mm512_mask_reduce_operator(*);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_maskz_mov_ps(__M, __W);
  _mm512_mask_reduce_operator(+);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
  _mm512_mask_reduce_operator(*);
}
#undef _mm512_mask_reduce_operator

/* 64-bit min/max reduction: bisection entirely within the 512-bit register
 * via shuffles and the corresponding _mm512_{min,max}_* intrinsic. */
#define _mm512_mask_reduce_operator(op) \
  __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
  __m512i __t2 = _mm512_##op(__V, __t1); \
  __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
  __m512i __t4 = _mm512_##op(__t2, __t3); \
  __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
  __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
  return __t6[0]

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi64(__m512i __V) {
  _mm512_mask_reduce_operator(max_epi64);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu64(__m512i __V) {
  _mm512_mask_reduce_operator(max_epu64);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi64(__m512i __V) {
  _mm512_mask_reduce_operator(min_epi64);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu64(__m512i __V) {
  _mm512_mask_reduce_operator(min_epu64);
}

/* Masked forms force unselected lanes to the identity of the comparison
 * (type minimum for max, type maximum for min) before reducing. */
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
  _mm512_mask_reduce_operator(max_epi64);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi64(__M, __V);
  _mm512_mask_reduce_operator(max_epu64);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
  _mm512_mask_reduce_operator(min_epi64);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
  _mm512_mask_reduce_operator(min_epu64);
}
#undef _mm512_mask_reduce_operator

/* 32-bit min/max reduction: extract halves, then shuffle+op down to one lane. */
#define _mm512_mask_reduce_operator(op) \
  __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
  __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
  __m256i __t3 = _mm256_##op(__t1, __t2); \
  __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
  __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
  __m128i __t6 = _mm_##op(__t4, __t5); \
  __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
  __m128i __t8 = _mm_##op(__t6, __t7); \
  __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
  __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
  return __t10[0]

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi32(__m512i __V) {
  _mm512_mask_reduce_operator(max_epi32);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu32(__m512i __V) {
  _mm512_mask_reduce_operator(max_epu32);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi32(__m512i __V) {
  _mm512_mask_reduce_operator(min_epi32);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu32(__m512i __V) {
  _mm512_mask_reduce_operator(min_epu32);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
  _mm512_mask_reduce_operator(max_epi32);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi32(__M, __V);
  _mm512_mask_reduce_operator(max_epu32);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
  _mm512_mask_reduce_operator(min_epi32);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
  _mm512_mask_reduce_operator(min_epu32);
}
#undef _mm512_mask_reduce_operator

/* double min/max reduction. */
#define _mm512_mask_reduce_operator(op) \
  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
  __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
  __m256d __t3 = _mm256_##op(__t1, __t2); \
  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
  __m128d __t6 = _mm_##op(__t4, __t5); \
  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
  __m128d __t8 = _mm_##op(__t6, __t7); \
  return __t8[0]

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
  _mm512_mask_reduce_operator(max_pd);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
  _mm512_mask_reduce_operator(min_pd);
}

/* Masked forms use -inf / +inf as the identity for max / min. */
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
  _mm512_mask_reduce_operator(max_pd);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
  _mm512_mask_reduce_operator(min_pd);
}
#undef _mm512_mask_reduce_operator

/* float min/max reduction. */
#define _mm512_mask_reduce_operator(op) \
  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
  __m256 __t3 = _mm256_##op(__t1, __t2); \
  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
  __m128 __t6 = _mm_##op(__t4, __t5); \
  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
  __m128 __t8 = _mm_##op(__t6, __t7); \
  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
  __m128 __t10 = _mm_##op(__t8, __t9); \
  return __t10[0]

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
  _mm512_mask_reduce_operator(max_ps);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
  _mm512_mask_reduce_operator(min_ps);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
  _mm512_mask_reduce_operator(max_ps);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
  _mm512_mask_reduce_operator(min_ps);
}
#undef _mm512_mask_reduce_operator

/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __A
///    A vector of [16 x i32]. The least significant 32 bits are moved to the
///    destination.
/// \returns A 32-bit signed integer containing the moved value.
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_cvtsi512_si32(__m512i __A) {
  __v16si __b = (__v16si)__A;
  return __b[0];
}

#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS

#endif /* __AVX512FINTRIN_H */