1 /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifndef __AVX512FINTRIN_H 14 #define __AVX512FINTRIN_H 15 16 typedef char __v64qi __attribute__((__vector_size__(64))); 17 typedef short __v32hi __attribute__((__vector_size__(64))); 18 typedef double __v8df __attribute__((__vector_size__(64))); 19 typedef float __v16sf __attribute__((__vector_size__(64))); 20 typedef long long __v8di __attribute__((__vector_size__(64))); 21 typedef int __v16si __attribute__((__vector_size__(64))); 22 23 /* Unsigned types */ 24 typedef unsigned char __v64qu __attribute__((__vector_size__(64))); 25 typedef unsigned short __v32hu __attribute__((__vector_size__(64))); 26 typedef unsigned long long __v8du __attribute__((__vector_size__(64))); 27 typedef unsigned int __v16su __attribute__((__vector_size__(64))); 28 29 typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); 30 typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); 31 typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); 32 33 typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); 34 typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); 35 typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); 36 37 typedef unsigned char __mmask8; 38 typedef unsigned short __mmask16; 39 40 /* Rounding mode macros. */ 41 #define _MM_FROUND_TO_NEAREST_INT 0x00 42 #define _MM_FROUND_TO_NEG_INF 0x01 43 #define _MM_FROUND_TO_POS_INF 0x02 44 #define _MM_FROUND_TO_ZERO 0x03 45 #define _MM_FROUND_CUR_DIRECTION 0x04 46 47 /* Constants for integer comparison predicates */ 48 typedef enum { 49 _MM_CMPINT_EQ, /* Equal */ 50 _MM_CMPINT_LT, /* Less than */ 51 _MM_CMPINT_LE, /* Less than or Equal */ 52 _MM_CMPINT_UNUSED, 53 _MM_CMPINT_NE, /* Not Equal */ 54 _MM_CMPINT_NLT, /* Not Less than */ 55 #define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ 56 _MM_CMPINT_NLE /* Not Less than or Equal */ 57 #define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ 58 } _MM_CMPINT_ENUM; 59 60 typedef enum 61 { 62 _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, 63 _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, 64 _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, 65 _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, 66 _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, 67 _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, 68 _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, 69 _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, 70 _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, 71 _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, 72 _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, 73 _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, 74 _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, 75 _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, 76 _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, 77 _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, 78 _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, 79 _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, 80 _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, 81 _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, 82 _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, 83 _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, 84 _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, 85 _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, 86 _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, 87 _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, 88 _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, 89 _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, 90 _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, 91 _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, 92 _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, 93 _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, 94 _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, 95 _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, 96 _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, 97 _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, 98 _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, 99 _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, 100 _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, 101 _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, 102 _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, 103 _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, 104 _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, 105 _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, 106 _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, 107 _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, 108 _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, 109 _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, 110 _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, 111 _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, 112 _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, 113 _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, 114 _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, 115 _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, 116 _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, 117 _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, 118 _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, 119 _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, 120 _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, 121 _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, 122 _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, 123 _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, 124 _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, 125 _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, 126 _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, 127 _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, 128 _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, 129 _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, 130 _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, 131 _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, 132 _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, 133 _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, 134 _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, 135 _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, 136 _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, 137 _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, 138 _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, 139 _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, 140 _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, 141 _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, 142 _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, 143 _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, 144 _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, 145 _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, 146 _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, 147 _MM_PERM_DDDD = 0xFF 148 } _MM_PERM_ENUM; 149 150 typedef enum 151 { 152 _MM_MANT_NORM_1_2, /* interval [1, 2) */ 153 _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ 154 _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ 155 _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ 156 } _MM_MANTISSA_NORM_ENUM; 157 158 typedef enum 159 { 160 _MM_MANT_SIGN_src, /* sign = sign(SRC) */ 161 _MM_MANT_SIGN_zero, /* sign = 0 */ 162 _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ 163 } _MM_MANTISSA_SIGN_ENUM; 164 165 /* Define the default attributes for the functions in this file. */ 166 #define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) 167 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) 168 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) 169 170 /* Create vectors with repeated elements */ 171 172 static __inline __m512i __DEFAULT_FN_ATTRS512 173 _mm512_setzero_si512(void) 174 { 175 return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; 176 } 177 178 #define _mm512_setzero_epi32 _mm512_setzero_si512 179 180 static __inline__ __m512d __DEFAULT_FN_ATTRS512 181 _mm512_undefined_pd(void) 182 { 183 return (__m512d)__builtin_ia32_undef512(); 184 } 185 186 static __inline__ __m512 __DEFAULT_FN_ATTRS512 187 _mm512_undefined(void) 188 { 189 return (__m512)__builtin_ia32_undef512(); 190 } 191 192 static __inline__ __m512 __DEFAULT_FN_ATTRS512 193 _mm512_undefined_ps(void) 194 { 195 return (__m512)__builtin_ia32_undef512(); 196 } 197 198 static __inline__ __m512i __DEFAULT_FN_ATTRS512 199 _mm512_undefined_epi32(void) 200 { 201 return (__m512i)__builtin_ia32_undef512(); 202 } 203 204 static __inline__ __m512i __DEFAULT_FN_ATTRS512 205 _mm512_broadcastd_epi32 (__m128i __A) 206 { 207 return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, 208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 209 } 210 211 static __inline__ __m512i __DEFAULT_FN_ATTRS512 212 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) 213 { 214 return (__m512i)__builtin_ia32_selectd_512(__M, 215 (__v16si) _mm512_broadcastd_epi32(__A), 216 (__v16si) __O); 217 } 218 219 static __inline__ __m512i __DEFAULT_FN_ATTRS512 220 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) 221 { 222 return (__m512i)__builtin_ia32_selectd_512(__M, 223 (__v16si) _mm512_broadcastd_epi32(__A), 224 (__v16si) _mm512_setzero_si512()); 225 } 226 227 static __inline__ __m512i __DEFAULT_FN_ATTRS512 228 _mm512_broadcastq_epi64 (__m128i __A) 229 { 230 return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, 231 0, 0, 0, 0, 0, 0, 0, 0); 232 } 233 234 static __inline__ __m512i __DEFAULT_FN_ATTRS512 235 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) 236 { 237 return (__m512i)__builtin_ia32_selectq_512(__M, 238 (__v8di) _mm512_broadcastq_epi64(__A), 239 (__v8di) __O); 240 241 } 242 243 static __inline__ __m512i __DEFAULT_FN_ATTRS512 244 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) 245 { 246 return (__m512i)__builtin_ia32_selectq_512(__M, 247 (__v8di) _mm512_broadcastq_epi64(__A), 248 (__v8di) _mm512_setzero_si512()); 249 } 250 251 252 static __inline __m512 __DEFAULT_FN_ATTRS512 253 _mm512_setzero_ps(void) 254 { 255 return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 256 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; 257 } 258 259 #define _mm512_setzero _mm512_setzero_ps 260 261 static __inline __m512d __DEFAULT_FN_ATTRS512 262 _mm512_setzero_pd(void) 263 { 264 return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; 265 } 266 267 static __inline __m512 __DEFAULT_FN_ATTRS512 268 _mm512_set1_ps(float __w) 269 { 270 return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, 271 __w, __w, __w, __w, __w, __w, __w, __w }; 272 } 273 274 static __inline __m512d __DEFAULT_FN_ATTRS512 275 _mm512_set1_pd(double __w) 276 { 277 return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; 278 } 279 280 static __inline __m512i __DEFAULT_FN_ATTRS512 281 _mm512_set1_epi8(char __w) 282 { 283 return __extension__ (__m512i)(__v64qi){ 284 __w, __w, __w, __w, __w, __w, __w, __w, 285 __w, __w, __w, __w, __w, __w, __w, __w, 286 __w, __w, __w, __w, __w, __w, __w, __w, 287 __w, __w, __w, __w, __w, __w, __w, __w, 288 __w, __w, __w, __w, __w, __w, __w, __w, 289 __w, __w, __w, __w, __w, __w, __w, __w, 290 __w, __w, __w, __w, __w, __w, __w, __w, 291 __w, __w, __w, __w, __w, __w, __w, __w }; 292 } 293 294 static __inline __m512i __DEFAULT_FN_ATTRS512 295 _mm512_set1_epi16(short __w) 296 { 297 return __extension__ (__m512i)(__v32hi){ 298 __w, __w, __w, __w, __w, __w, __w, __w, 299 __w, __w, __w, __w, __w, __w, __w, __w, 300 __w, __w, __w, __w, __w, __w, __w, __w, 301 __w, __w, __w, __w, __w, __w, __w, __w }; 302 } 303 304 static __inline __m512i __DEFAULT_FN_ATTRS512 305 _mm512_set1_epi32(int __s) 306 { 307 return __extension__ (__m512i)(__v16si){ 308 __s, __s, __s, __s, __s, __s, __s, __s, 309 __s, __s, __s, __s, __s, __s, __s, __s }; 310 } 311 312 static __inline __m512i __DEFAULT_FN_ATTRS512 313 _mm512_maskz_set1_epi32(__mmask16 __M, int __A) 314 { 315 return (__m512i)__builtin_ia32_selectd_512(__M, 316 (__v16si)_mm512_set1_epi32(__A), 317 (__v16si)_mm512_setzero_si512()); 318 } 319 320 static __inline __m512i __DEFAULT_FN_ATTRS512 321 _mm512_set1_epi64(long long __d) 322 { 323 return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; 324 } 325 326 static __inline __m512i __DEFAULT_FN_ATTRS512 327 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) 328 { 329 return (__m512i)__builtin_ia32_selectq_512(__M, 330 (__v8di)_mm512_set1_epi64(__A), 331 (__v8di)_mm512_setzero_si512()); 332 } 333 334 static __inline__ __m512 __DEFAULT_FN_ATTRS512 335 _mm512_broadcastss_ps(__m128 __A) 336 { 337 return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, 338 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 339 } 340 341 static __inline __m512i __DEFAULT_FN_ATTRS512 342 _mm512_set4_epi32 (int __A, int __B, int __C, int __D) 343 { 344 return __extension__ (__m512i)(__v16si) 345 { __D, __C, __B, __A, __D, __C, __B, __A, 346 __D, __C, __B, __A, __D, __C, __B, __A }; 347 } 348 349 static __inline __m512i __DEFAULT_FN_ATTRS512 350 _mm512_set4_epi64 (long long __A, long long __B, long long __C, 351 long long __D) 352 { 353 return __extension__ (__m512i) (__v8di) 354 { __D, __C, __B, __A, __D, __C, __B, __A }; 355 } 356 357 static __inline __m512d __DEFAULT_FN_ATTRS512 358 _mm512_set4_pd (double __A, double __B, double __C, double __D) 359 { 360 return __extension__ (__m512d) 361 { __D, __C, __B, __A, __D, __C, __B, __A }; 362 } 363 364 static __inline __m512 __DEFAULT_FN_ATTRS512 365 _mm512_set4_ps (float __A, float __B, float __C, float __D) 366 { 367 return __extension__ (__m512) 368 { __D, __C, __B, __A, __D, __C, __B, __A, 369 __D, __C, __B, __A, __D, __C, __B, __A }; 370 } 371 372 #define _mm512_setr4_epi32(e0,e1,e2,e3) \ 373 _mm512_set4_epi32((e3),(e2),(e1),(e0)) 374 375 #define _mm512_setr4_epi64(e0,e1,e2,e3) \ 376 _mm512_set4_epi64((e3),(e2),(e1),(e0)) 377 378 #define _mm512_setr4_pd(e0,e1,e2,e3) \ 379 _mm512_set4_pd((e3),(e2),(e1),(e0)) 380 381 #define _mm512_setr4_ps(e0,e1,e2,e3) \ 382 _mm512_set4_ps((e3),(e2),(e1),(e0)) 383 384 static __inline__ __m512d __DEFAULT_FN_ATTRS512 385 _mm512_broadcastsd_pd(__m128d __A) 386 { 387 return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, 388 0, 0, 0, 0, 0, 0, 0, 0); 389 } 390 391 /* Cast between vector types */ 392 393 static __inline __m512d __DEFAULT_FN_ATTRS512 394 _mm512_castpd256_pd512(__m256d __a) 395 { 396 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); 397 } 398 399 static __inline __m512 __DEFAULT_FN_ATTRS512 400 _mm512_castps256_ps512(__m256 __a) 401 { 402 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 403 -1, -1, -1, -1, -1, -1, -1, -1); 404 } 405 406 static __inline __m128d __DEFAULT_FN_ATTRS512 407 _mm512_castpd512_pd128(__m512d __a) 408 { 409 return __builtin_shufflevector(__a, __a, 0, 1); 410 } 411 412 static __inline __m256d __DEFAULT_FN_ATTRS512 413 _mm512_castpd512_pd256 (__m512d __A) 414 { 415 return __builtin_shufflevector(__A, __A, 0, 1, 2, 3); 416 } 417 418 static __inline __m128 __DEFAULT_FN_ATTRS512 419 _mm512_castps512_ps128(__m512 __a) 420 { 421 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); 422 } 423 424 static __inline __m256 __DEFAULT_FN_ATTRS512 425 _mm512_castps512_ps256 (__m512 __A) 426 { 427 return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); 428 } 429 430 static __inline __m512 __DEFAULT_FN_ATTRS512 431 _mm512_castpd_ps (__m512d __A) 432 { 433 return (__m512) (__A); 434 } 435 436 static __inline __m512i __DEFAULT_FN_ATTRS512 437 _mm512_castpd_si512 (__m512d __A) 438 { 439 return (__m512i) (__A); 440 } 441 442 static __inline__ __m512d __DEFAULT_FN_ATTRS512 443 _mm512_castpd128_pd512 (__m128d __A) 444 { 445 return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); 446 } 447 448 static __inline __m512d __DEFAULT_FN_ATTRS512 449 _mm512_castps_pd (__m512 __A) 450 { 451 return (__m512d) (__A); 452 } 453 454 static __inline __m512i __DEFAULT_FN_ATTRS512 455 _mm512_castps_si512 (__m512 __A) 456 { 457 return (__m512i) (__A); 458 } 459 460 static __inline__ __m512 __DEFAULT_FN_ATTRS512 461 _mm512_castps128_ps512 (__m128 __A) 462 { 463 return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 464 } 465 466 static __inline__ __m512i __DEFAULT_FN_ATTRS512 467 _mm512_castsi128_si512 (__m128i __A) 468 { 469 return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); 470 } 471 472 static __inline__ __m512i __DEFAULT_FN_ATTRS512 473 _mm512_castsi256_si512 (__m256i __A) 474 { 475 return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1); 476 } 477 478 static __inline __m512 __DEFAULT_FN_ATTRS512 479 _mm512_castsi512_ps (__m512i __A) 480 { 481 return (__m512) (__A); 482 } 483 484 static __inline __m512d __DEFAULT_FN_ATTRS512 485 _mm512_castsi512_pd (__m512i __A) 486 { 487 return (__m512d) (__A); 488 } 489 490 static __inline __m128i __DEFAULT_FN_ATTRS512 491 _mm512_castsi512_si128 (__m512i __A) 492 { 493 return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); 494 } 495 496 static __inline __m256i __DEFAULT_FN_ATTRS512 497 _mm512_castsi512_si256 (__m512i __A) 498 { 499 return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); 500 } 501 502 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 503 _mm512_int2mask(int __a) 504 { 505 return (__mmask16)__a; 506 } 507 508 static __inline__ int __DEFAULT_FN_ATTRS 509 _mm512_mask2int(__mmask16 __a) 510 { 511 return (int)__a; 512 } 513 514 /// Constructs a 512-bit floating-point vector of [8 x double] from a 515 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 516 /// contain the value of the source vector. The upper 384 bits are set 517 /// to zero. 518 /// 519 /// \headerfile <x86intrin.h> 520 /// 521 /// This intrinsic has no corresponding instruction. 522 /// 523 /// \param __a 524 /// A 128-bit vector of [2 x double]. 525 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits 526 /// contain the value of the parameter. The upper 384 bits are set to zero. 527 static __inline __m512d __DEFAULT_FN_ATTRS512 528 _mm512_zextpd128_pd512(__m128d __a) 529 { 530 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); 531 } 532 533 /// Constructs a 512-bit floating-point vector of [8 x double] from a 534 /// 256-bit floating-point vector of [4 x double]. The lower 256 bits 535 /// contain the value of the source vector. The upper 256 bits are set 536 /// to zero. 537 /// 538 /// \headerfile <x86intrin.h> 539 /// 540 /// This intrinsic has no corresponding instruction. 541 /// 542 /// \param __a 543 /// A 256-bit vector of [4 x double]. 544 /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits 545 /// contain the value of the parameter. The upper 256 bits are set to zero. 546 static __inline __m512d __DEFAULT_FN_ATTRS512 547 _mm512_zextpd256_pd512(__m256d __a) 548 { 549 return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); 550 } 551 552 /// Constructs a 512-bit floating-point vector of [16 x float] from a 553 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 554 /// the value of the source vector. The upper 384 bits are set to zero. 555 /// 556 /// \headerfile <x86intrin.h> 557 /// 558 /// This intrinsic has no corresponding instruction. 559 /// 560 /// \param __a 561 /// A 128-bit vector of [4 x float]. 562 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits 563 /// contain the value of the parameter. The upper 384 bits are set to zero. 564 static __inline __m512 __DEFAULT_FN_ATTRS512 565 _mm512_zextps128_ps512(__m128 __a) 566 { 567 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); 568 } 569 570 /// Constructs a 512-bit floating-point vector of [16 x float] from a 571 /// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain 572 /// the value of the source vector. The upper 256 bits are set to zero. 573 /// 574 /// \headerfile <x86intrin.h> 575 /// 576 /// This intrinsic has no corresponding instruction. 577 /// 578 /// \param __a 579 /// A 256-bit vector of [8 x float]. 580 /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits 581 /// contain the value of the parameter. The upper 256 bits are set to zero. 582 static __inline __m512 __DEFAULT_FN_ATTRS512 583 _mm512_zextps256_ps512(__m256 __a) 584 { 585 return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 586 } 587 588 /// Constructs a 512-bit integer vector from a 128-bit integer vector. 589 /// The lower 128 bits contain the value of the source vector. The upper 590 /// 384 bits are set to zero. 591 /// 592 /// \headerfile <x86intrin.h> 593 /// 594 /// This intrinsic has no corresponding instruction. 595 /// 596 /// \param __a 597 /// A 128-bit integer vector. 598 /// \returns A 512-bit integer vector. The lower 128 bits contain the value of 599 /// the parameter. The upper 384 bits are set to zero. 600 static __inline __m512i __DEFAULT_FN_ATTRS512 601 _mm512_zextsi128_si512(__m128i __a) 602 { 603 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); 604 } 605 606 /// Constructs a 512-bit integer vector from a 256-bit integer vector. 607 /// The lower 256 bits contain the value of the source vector. The upper 608 /// 256 bits are set to zero. 609 /// 610 /// \headerfile <x86intrin.h> 611 /// 612 /// This intrinsic has no corresponding instruction. 613 /// 614 /// \param __a 615 /// A 256-bit integer vector. 616 /// \returns A 512-bit integer vector. The lower 256 bits contain the value of 617 /// the parameter. The upper 256 bits are set to zero. 618 static __inline __m512i __DEFAULT_FN_ATTRS512 619 _mm512_zextsi256_si512(__m256i __a) 620 { 621 return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); 622 } 623 624 /* Bitwise operators */ 625 static __inline__ __m512i __DEFAULT_FN_ATTRS512 626 _mm512_and_epi32(__m512i __a, __m512i __b) 627 { 628 return (__m512i)((__v16su)__a & (__v16su)__b); 629 } 630 631 static __inline__ __m512i __DEFAULT_FN_ATTRS512 632 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 633 { 634 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 635 (__v16si) _mm512_and_epi32(__a, __b), 636 (__v16si) __src); 637 } 638 639 static __inline__ __m512i __DEFAULT_FN_ATTRS512 640 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) 641 { 642 return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (), 643 __k, __a, __b); 644 } 645 646 static __inline__ __m512i __DEFAULT_FN_ATTRS512 647 _mm512_and_epi64(__m512i __a, __m512i __b) 648 { 649 return (__m512i)((__v8du)__a & (__v8du)__b); 650 } 651 652 static __inline__ __m512i __DEFAULT_FN_ATTRS512 653 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) 654 { 655 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, 656 (__v8di) _mm512_and_epi64(__a, __b), 657 (__v8di) __src); 658 } 659 660 static __inline__ __m512i __DEFAULT_FN_ATTRS512 661 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) 662 { 663 return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (), 664 __k, __a, __b); 665 } 666 667 static __inline__ __m512i __DEFAULT_FN_ATTRS512 668 _mm512_andnot_si512 (__m512i __A, __m512i __B) 669 { 670 return (__m512i)(~(__v8du)__A & (__v8du)__B); 671 } 672 673 static __inline__ __m512i __DEFAULT_FN_ATTRS512 674 _mm512_andnot_epi32 (__m512i __A, __m512i __B) 675 { 676 return (__m512i)(~(__v16su)__A & (__v16su)__B); 677 } 678 679 static __inline__ __m512i __DEFAULT_FN_ATTRS512 680 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 681 { 682 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 683 (__v16si)_mm512_andnot_epi32(__A, __B), 684 (__v16si)__W); 685 } 686 687 static __inline__ __m512i __DEFAULT_FN_ATTRS512 688 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) 689 { 690 return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(), 691 __U, __A, __B); 692 } 693 694 static __inline__ __m512i __DEFAULT_FN_ATTRS512 695 _mm512_andnot_epi64(__m512i __A, __m512i __B) 696 { 697 return (__m512i)(~(__v8du)__A & (__v8du)__B); 698 } 699 700 static __inline__ __m512i __DEFAULT_FN_ATTRS512 701 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 702 { 703 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 704 (__v8di)_mm512_andnot_epi64(__A, __B), 705 (__v8di)__W); 706 } 707 708 static __inline__ __m512i __DEFAULT_FN_ATTRS512 709 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) 710 { 711 return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(), 712 __U, __A, __B); 713 } 714 715 static __inline__ __m512i __DEFAULT_FN_ATTRS512 716 _mm512_or_epi32(__m512i __a, __m512i __b) 717 { 718 return (__m512i)((__v16su)__a | (__v16su)__b); 719 } 720 721 static __inline__ __m512i __DEFAULT_FN_ATTRS512 722 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 723 { 724 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 725 (__v16si)_mm512_or_epi32(__a, __b), 726 (__v16si)__src); 727 } 728 729 static __inline__ __m512i __DEFAULT_FN_ATTRS512 730 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) 731 { 732 return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); 733 } 734 735 static __inline__ __m512i __DEFAULT_FN_ATTRS512 736 _mm512_or_epi64(__m512i __a, __m512i __b) 737 { 738 return (__m512i)((__v8du)__a | (__v8du)__b); 739 } 740 741 static __inline__ __m512i __DEFAULT_FN_ATTRS512 742 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) 743 { 744 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, 745 (__v8di)_mm512_or_epi64(__a, __b), 746 (__v8di)__src); 747 } 748 749 static __inline__ __m512i __DEFAULT_FN_ATTRS512 750 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) 751 { 752 return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); 753 } 754 755 static __inline__ __m512i __DEFAULT_FN_ATTRS512 756 _mm512_xor_epi32(__m512i __a, __m512i __b) 757 { 758 return (__m512i)((__v16su)__a ^ (__v16su)__b); 759 } 760 761 static __inline__ __m512i __DEFAULT_FN_ATTRS512 762 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) 763 { 764 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, 765 (__v16si)_mm512_xor_epi32(__a, __b), 766 (__v16si)__src); 767 } 768 769 static __inline__ __m512i __DEFAULT_FN_ATTRS512 770 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) 771 { 772 return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); 773 } 774 775 static __inline__ __m512i __DEFAULT_FN_ATTRS512 776 _mm512_xor_epi64(__m512i __a, __m512i __b) 777 { 778 return (__m512i)((__v8du)__a ^ (__v8du)__b); 779 } 780 781 static __inline__ __m512i __DEFAULT_FN_ATTRS512 782 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) 783 { 784 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, 785 (__v8di)_mm512_xor_epi64(__a, __b), 786 (__v8di)__src); 787 } 788 789 static __inline__ __m512i __DEFAULT_FN_ATTRS512 790 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) 791 { 792 return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); 793 } 794 795 static __inline__ __m512i __DEFAULT_FN_ATTRS512 796 _mm512_and_si512(__m512i __a, __m512i __b) 797 { 798 return (__m512i)((__v8du)__a & (__v8du)__b); 799 } 800 801 static __inline__ __m512i __DEFAULT_FN_ATTRS512 802 _mm512_or_si512(__m512i __a, __m512i __b) 803 { 804 return (__m512i)((__v8du)__a | (__v8du)__b); 805 } 806 807 static __inline__ __m512i __DEFAULT_FN_ATTRS512 808 _mm512_xor_si512(__m512i __a, __m512i __b) 809 { 810 return (__m512i)((__v8du)__a ^ (__v8du)__b); 811 } 812 813 /* Arithmetic */ 814 815 static __inline __m512d __DEFAULT_FN_ATTRS512 816 _mm512_add_pd(__m512d __a, __m512d __b) 817 { 818 return (__m512d)((__v8df)__a + (__v8df)__b); 819 } 820 821 static __inline __m512 __DEFAULT_FN_ATTRS512 822 _mm512_add_ps(__m512 __a, __m512 __b) 823 { 824 return (__m512)((__v16sf)__a + (__v16sf)__b); 825 } 826 827 static __inline __m512d __DEFAULT_FN_ATTRS512 828 _mm512_mul_pd(__m512d __a, __m512d __b) 829 { 830 return (__m512d)((__v8df)__a * (__v8df)__b); 831 } 832 833 static __inline __m512 __DEFAULT_FN_ATTRS512 834 _mm512_mul_ps(__m512 __a, __m512 __b) 835 { 836 return (__m512)((__v16sf)__a * (__v16sf)__b); 837 } 838 839 static __inline __m512d __DEFAULT_FN_ATTRS512 840 _mm512_sub_pd(__m512d __a, __m512d __b) 841 { 842 return (__m512d)((__v8df)__a - (__v8df)__b); 843 } 844 845 static __inline __m512 __DEFAULT_FN_ATTRS512 846 _mm512_sub_ps(__m512 __a, __m512 __b) 847 { 848 return (__m512)((__v16sf)__a - (__v16sf)__b); 849 } 850 851 static __inline__ __m512i __DEFAULT_FN_ATTRS512 852 _mm512_add_epi64 (__m512i __A, __m512i __B) 853 { 854 return (__m512i) ((__v8du) __A + (__v8du) __B); 855 } 856 857 static __inline__ __m512i __DEFAULT_FN_ATTRS512 858 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 859 { 860 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 861 (__v8di)_mm512_add_epi64(__A, __B), 862 (__v8di)__W); 863 } 864 865 static __inline__ __m512i __DEFAULT_FN_ATTRS512 866 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) 867 { 868 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 869 (__v8di)_mm512_add_epi64(__A, __B), 870 (__v8di)_mm512_setzero_si512()); 871 } 872 873 static __inline__ __m512i __DEFAULT_FN_ATTRS512 874 _mm512_sub_epi64 (__m512i __A, __m512i __B) 875 { 876 return (__m512i) ((__v8du) __A - (__v8du) __B); 877 } 878 879 static __inline__ __m512i __DEFAULT_FN_ATTRS512 880 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 881 { 882 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 883 (__v8di)_mm512_sub_epi64(__A, __B), 884 (__v8di)__W); 885 } 886 887 static __inline__ __m512i __DEFAULT_FN_ATTRS512 888 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) 889 { 890 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 891 (__v8di)_mm512_sub_epi64(__A, __B), 892 (__v8di)_mm512_setzero_si512()); 893 } 894 895 static __inline__ __m512i __DEFAULT_FN_ATTRS512 896 _mm512_add_epi32 (__m512i __A, __m512i __B) 897 { 898 return (__m512i) ((__v16su) __A + (__v16su) __B); 899 } 900 901 static __inline__ __m512i __DEFAULT_FN_ATTRS512 902 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 903 { 904 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 905 (__v16si)_mm512_add_epi32(__A, __B), 906 (__v16si)__W); 907 } 908 909 static __inline__ __m512i __DEFAULT_FN_ATTRS512 910 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 911 { 912 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 913 (__v16si)_mm512_add_epi32(__A, __B), 914 (__v16si)_mm512_setzero_si512()); 915 } 916 917 static __inline__ __m512i __DEFAULT_FN_ATTRS512 918 _mm512_sub_epi32 (__m512i __A, __m512i __B) 919 { 920 return (__m512i) ((__v16su) __A - (__v16su) __B); 921 } 922 923 static __inline__ __m512i __DEFAULT_FN_ATTRS512 924 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 925 { 926 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 927 (__v16si)_mm512_sub_epi32(__A, __B), 928 (__v16si)__W); 929 } 930 931 static __inline__ __m512i __DEFAULT_FN_ATTRS512 932 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) 933 { 934 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 935 (__v16si)_mm512_sub_epi32(__A, __B), 936 (__v16si)_mm512_setzero_si512()); 937 } 938 939 #define _mm512_max_round_pd(A, B, R) \ 940 (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ 941 (__v8df)(__m512d)(B), (int)(R)) 942 943 #define _mm512_mask_max_round_pd(W, U, A, B, R) \ 944 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 945 (__v8df)_mm512_max_round_pd((A), (B), (R)), \ 946 (__v8df)(W)) 947 948 #define _mm512_maskz_max_round_pd(U, A, B, R) \ 949 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 950 (__v8df)_mm512_max_round_pd((A), (B), (R)), \ 951 (__v8df)_mm512_setzero_pd()) 952 953 static __inline__ __m512d __DEFAULT_FN_ATTRS512 954 _mm512_max_pd(__m512d __A, __m512d __B) 955 { 956 return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B, 957 _MM_FROUND_CUR_DIRECTION); 958 } 959 960 static __inline__ __m512d __DEFAULT_FN_ATTRS512 961 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 962 { 963 return (__m512d)__builtin_ia32_selectpd_512(__U, 964 (__v8df)_mm512_max_pd(__A, __B), 965 (__v8df)__W); 966 } 967 968 static __inline__ __m512d __DEFAULT_FN_ATTRS512 969 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) 970 { 971 return (__m512d)__builtin_ia32_selectpd_512(__U, 972 (__v8df)_mm512_max_pd(__A, __B), 973 (__v8df)_mm512_setzero_pd()); 974 } 975 976 #define _mm512_max_round_ps(A, B, R) \ 977 (__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ 978 (__v16sf)(__m512)(B), (int)(R)) 979 980 #define _mm512_mask_max_round_ps(W, U, A, B, R) \ 981 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 982 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ 983 (__v16sf)(W)) 984 985 #define _mm512_maskz_max_round_ps(U, A, B, R) \ 986 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 987 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ 988 (__v16sf)_mm512_setzero_ps()) 989 990 static __inline__ __m512 __DEFAULT_FN_ATTRS512 991 _mm512_max_ps(__m512 __A, __m512 __B) 992 { 993 return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B, 994 _MM_FROUND_CUR_DIRECTION); 995 } 996 997 static __inline__ __m512 __DEFAULT_FN_ATTRS512 998 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 999 { 1000 return (__m512)__builtin_ia32_selectps_512(__U, 1001 (__v16sf)_mm512_max_ps(__A, __B), 1002 (__v16sf)__W); 1003 } 1004 1005 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1006 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) 1007 { 1008 return (__m512)__builtin_ia32_selectps_512(__U, 1009 (__v16sf)_mm512_max_ps(__A, __B), 1010 (__v16sf)_mm512_setzero_ps()); 1011 } 1012 1013 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1014 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1015 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, 1016 (__v4sf) __B, 1017 (__v4sf) __W, 1018 (__mmask8) __U, 1019 _MM_FROUND_CUR_DIRECTION); 1020 } 1021 1022 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1023 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1024 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, 1025 (__v4sf) __B, 1026 (__v4sf) _mm_setzero_ps (), 1027 (__mmask8) __U, 1028 _MM_FROUND_CUR_DIRECTION); 1029 } 1030 1031 #define _mm_max_round_ss(A, B, R) \ 1032 (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1033 (__v4sf)(__m128)(B), \ 1034 (__v4sf)_mm_setzero_ps(), \ 1035 (__mmask8)-1, (int)(R)) 1036 1037 #define _mm_mask_max_round_ss(W, U, A, B, R) \ 1038 (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1039 (__v4sf)(__m128)(B), \ 1040 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1041 (int)(R)) 1042 1043 #define _mm_maskz_max_round_ss(U, A, B, R) \ 1044 (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ 1045 (__v4sf)(__m128)(B), \ 1046 (__v4sf)_mm_setzero_ps(), \ 1047 (__mmask8)(U), (int)(R)) 1048 1049 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1050 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1051 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, 1052 (__v2df) __B, 1053 (__v2df) __W, 1054 (__mmask8) __U, 1055 _MM_FROUND_CUR_DIRECTION); 1056 } 1057 1058 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1059 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1060 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, 1061 (__v2df) __B, 1062 (__v2df) _mm_setzero_pd (), 1063 (__mmask8) __U, 1064 _MM_FROUND_CUR_DIRECTION); 1065 } 1066 1067 #define _mm_max_round_sd(A, B, R) \ 1068 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1069 (__v2df)(__m128d)(B), \ 1070 (__v2df)_mm_setzero_pd(), \ 1071 (__mmask8)-1, (int)(R)) 1072 1073 #define _mm_mask_max_round_sd(W, U, A, B, R) \ 1074 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1075 (__v2df)(__m128d)(B), \ 1076 (__v2df)(__m128d)(W), \ 1077 (__mmask8)(U), (int)(R)) 1078 1079 #define _mm_maskz_max_round_sd(U, A, B, R) \ 1080 (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ 1081 (__v2df)(__m128d)(B), \ 1082 (__v2df)_mm_setzero_pd(), \ 1083 (__mmask8)(U), (int)(R)) 1084 1085 static __inline __m512i 1086 __DEFAULT_FN_ATTRS512 1087 _mm512_max_epi32(__m512i __A, __m512i __B) 1088 { 1089 return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B); 1090 } 1091 1092 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1093 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1094 { 1095 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1096 (__v16si)_mm512_max_epi32(__A, __B), 1097 (__v16si)__W); 1098 } 1099 1100 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1101 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1102 { 1103 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1104 (__v16si)_mm512_max_epi32(__A, __B), 1105 (__v16si)_mm512_setzero_si512()); 1106 } 1107 1108 static __inline __m512i __DEFAULT_FN_ATTRS512 1109 _mm512_max_epu32(__m512i __A, __m512i __B) 1110 { 1111 return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B); 1112 } 1113 1114 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1115 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1116 { 1117 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1118 (__v16si)_mm512_max_epu32(__A, __B), 1119 (__v16si)__W); 1120 } 1121 1122 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1123 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1124 { 1125 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1126 (__v16si)_mm512_max_epu32(__A, __B), 1127 (__v16si)_mm512_setzero_si512()); 1128 } 1129 1130 static __inline __m512i __DEFAULT_FN_ATTRS512 1131 _mm512_max_epi64(__m512i __A, __m512i __B) 1132 { 1133 return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B); 1134 } 1135 1136 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1137 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1138 { 1139 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1140 (__v8di)_mm512_max_epi64(__A, __B), 1141 (__v8di)__W); 1142 } 1143 1144 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1145 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1146 { 1147 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1148 (__v8di)_mm512_max_epi64(__A, __B), 1149 (__v8di)_mm512_setzero_si512()); 1150 } 1151 1152 static __inline __m512i __DEFAULT_FN_ATTRS512 1153 _mm512_max_epu64(__m512i __A, __m512i __B) 1154 { 1155 return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B); 1156 } 1157 1158 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1159 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1160 { 1161 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1162 (__v8di)_mm512_max_epu64(__A, __B), 1163 (__v8di)__W); 1164 } 1165 1166 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1167 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1168 { 1169 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1170 (__v8di)_mm512_max_epu64(__A, __B), 1171 (__v8di)_mm512_setzero_si512()); 1172 } 1173 1174 #define _mm512_min_round_pd(A, B, R) \ 1175 (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ 1176 (__v8df)(__m512d)(B), (int)(R)) 1177 1178 #define _mm512_mask_min_round_pd(W, U, A, B, R) \ 1179 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1180 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1181 (__v8df)(W)) 1182 1183 #define _mm512_maskz_min_round_pd(U, A, B, R) \ 1184 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1185 (__v8df)_mm512_min_round_pd((A), (B), (R)), \ 1186 (__v8df)_mm512_setzero_pd()) 1187 1188 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1189 _mm512_min_pd(__m512d __A, __m512d __B) 1190 { 1191 return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, 1192 _MM_FROUND_CUR_DIRECTION); 1193 } 1194 1195 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1196 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 1197 { 1198 return (__m512d)__builtin_ia32_selectpd_512(__U, 1199 (__v8df)_mm512_min_pd(__A, __B), 1200 (__v8df)__W); 1201 } 1202 1203 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1204 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) 1205 { 1206 return (__m512d)__builtin_ia32_selectpd_512(__U, 1207 (__v8df)_mm512_min_pd(__A, __B), 1208 (__v8df)_mm512_setzero_pd()); 1209 } 1210 1211 #define _mm512_min_round_ps(A, B, R) \ 1212 (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ 1213 (__v16sf)(__m512)(B), (int)(R)) 1214 1215 #define _mm512_mask_min_round_ps(W, U, A, B, R) \ 1216 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1217 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1218 (__v16sf)(W)) 1219 1220 #define _mm512_maskz_min_round_ps(U, A, B, R) \ 1221 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1222 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ 1223 (__v16sf)_mm512_setzero_ps()) 1224 1225 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1226 _mm512_min_ps(__m512 __A, __m512 __B) 1227 { 1228 return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, 1229 _MM_FROUND_CUR_DIRECTION); 1230 } 1231 1232 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1233 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 1234 { 1235 return (__m512)__builtin_ia32_selectps_512(__U, 1236 (__v16sf)_mm512_min_ps(__A, __B), 1237 (__v16sf)__W); 1238 } 1239 1240 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1241 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) 1242 { 1243 return (__m512)__builtin_ia32_selectps_512(__U, 1244 (__v16sf)_mm512_min_ps(__A, __B), 1245 (__v16sf)_mm512_setzero_ps()); 1246 } 1247 1248 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1249 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1250 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1251 (__v4sf) __B, 1252 (__v4sf) __W, 1253 (__mmask8) __U, 1254 _MM_FROUND_CUR_DIRECTION); 1255 } 1256 1257 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1258 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1259 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, 1260 (__v4sf) __B, 1261 (__v4sf) _mm_setzero_ps (), 1262 (__mmask8) __U, 1263 _MM_FROUND_CUR_DIRECTION); 1264 } 1265 1266 #define _mm_min_round_ss(A, B, R) \ 1267 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1268 (__v4sf)(__m128)(B), \ 1269 (__v4sf)_mm_setzero_ps(), \ 1270 (__mmask8)-1, (int)(R)) 1271 1272 #define _mm_mask_min_round_ss(W, U, A, B, R) \ 1273 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1274 (__v4sf)(__m128)(B), \ 1275 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1276 (int)(R)) 1277 1278 #define _mm_maskz_min_round_ss(U, A, B, R) \ 1279 (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ 1280 (__v4sf)(__m128)(B), \ 1281 (__v4sf)_mm_setzero_ps(), \ 1282 (__mmask8)(U), (int)(R)) 1283 1284 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1285 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1286 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1287 (__v2df) __B, 1288 (__v2df) __W, 1289 (__mmask8) __U, 1290 _MM_FROUND_CUR_DIRECTION); 1291 } 1292 1293 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1294 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1295 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, 1296 (__v2df) __B, 1297 (__v2df) _mm_setzero_pd (), 1298 (__mmask8) __U, 1299 _MM_FROUND_CUR_DIRECTION); 1300 } 1301 1302 #define _mm_min_round_sd(A, B, R) \ 1303 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1304 (__v2df)(__m128d)(B), \ 1305 (__v2df)_mm_setzero_pd(), \ 1306 (__mmask8)-1, (int)(R)) 1307 1308 #define _mm_mask_min_round_sd(W, U, A, B, R) \ 1309 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1310 (__v2df)(__m128d)(B), \ 1311 (__v2df)(__m128d)(W), \ 1312 (__mmask8)(U), (int)(R)) 1313 1314 #define _mm_maskz_min_round_sd(U, A, B, R) \ 1315 (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ 1316 (__v2df)(__m128d)(B), \ 1317 (__v2df)_mm_setzero_pd(), \ 1318 (__mmask8)(U), (int)(R)) 1319 1320 static __inline __m512i 1321 __DEFAULT_FN_ATTRS512 1322 _mm512_min_epi32(__m512i __A, __m512i __B) 1323 { 1324 return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B); 1325 } 1326 1327 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1328 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1329 { 1330 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1331 (__v16si)_mm512_min_epi32(__A, __B), 1332 (__v16si)__W); 1333 } 1334 1335 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1336 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) 1337 { 1338 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1339 (__v16si)_mm512_min_epi32(__A, __B), 1340 (__v16si)_mm512_setzero_si512()); 1341 } 1342 1343 static __inline __m512i __DEFAULT_FN_ATTRS512 1344 _mm512_min_epu32(__m512i __A, __m512i __B) 1345 { 1346 return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B); 1347 } 1348 1349 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1350 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1351 { 1352 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1353 (__v16si)_mm512_min_epu32(__A, __B), 1354 (__v16si)__W); 1355 } 1356 1357 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1358 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) 1359 { 1360 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1361 (__v16si)_mm512_min_epu32(__A, __B), 1362 (__v16si)_mm512_setzero_si512()); 1363 } 1364 1365 static __inline __m512i __DEFAULT_FN_ATTRS512 1366 _mm512_min_epi64(__m512i __A, __m512i __B) 1367 { 1368 return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B); 1369 } 1370 1371 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1372 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1373 { 1374 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1375 (__v8di)_mm512_min_epi64(__A, __B), 1376 (__v8di)__W); 1377 } 1378 1379 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1380 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) 1381 { 1382 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1383 (__v8di)_mm512_min_epi64(__A, __B), 1384 (__v8di)_mm512_setzero_si512()); 1385 } 1386 1387 static __inline __m512i __DEFAULT_FN_ATTRS512 1388 _mm512_min_epu64(__m512i __A, __m512i __B) 1389 { 1390 return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B); 1391 } 1392 1393 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1394 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) 1395 { 1396 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1397 (__v8di)_mm512_min_epu64(__A, __B), 1398 (__v8di)__W); 1399 } 1400 1401 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1402 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) 1403 { 1404 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1405 (__v8di)_mm512_min_epu64(__A, __B), 1406 (__v8di)_mm512_setzero_si512()); 1407 } 1408 1409 static __inline __m512i __DEFAULT_FN_ATTRS512 1410 _mm512_mul_epi32(__m512i __X, __m512i __Y) 1411 { 1412 return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); 1413 } 1414 1415 static __inline __m512i __DEFAULT_FN_ATTRS512 1416 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1417 { 1418 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1419 (__v8di)_mm512_mul_epi32(__X, __Y), 1420 (__v8di)__W); 1421 } 1422 1423 static __inline __m512i __DEFAULT_FN_ATTRS512 1424 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) 1425 { 1426 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1427 (__v8di)_mm512_mul_epi32(__X, __Y), 1428 (__v8di)_mm512_setzero_si512 ()); 1429 } 1430 1431 static __inline __m512i __DEFAULT_FN_ATTRS512 1432 _mm512_mul_epu32(__m512i __X, __m512i __Y) 1433 { 1434 return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); 1435 } 1436 1437 static __inline __m512i __DEFAULT_FN_ATTRS512 1438 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) 1439 { 1440 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1441 (__v8di)_mm512_mul_epu32(__X, __Y), 1442 (__v8di)__W); 1443 } 1444 1445 static __inline __m512i __DEFAULT_FN_ATTRS512 1446 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) 1447 { 1448 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1449 (__v8di)_mm512_mul_epu32(__X, __Y), 1450 (__v8di)_mm512_setzero_si512 ()); 1451 } 1452 1453 static __inline __m512i __DEFAULT_FN_ATTRS512 1454 _mm512_mullo_epi32 (__m512i __A, __m512i __B) 1455 { 1456 return (__m512i) ((__v16su) __A * (__v16su) __B); 1457 } 1458 1459 static __inline __m512i __DEFAULT_FN_ATTRS512 1460 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) 1461 { 1462 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1463 (__v16si)_mm512_mullo_epi32(__A, __B), 1464 (__v16si)_mm512_setzero_si512()); 1465 } 1466 1467 static __inline __m512i __DEFAULT_FN_ATTRS512 1468 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) 1469 { 1470 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1471 (__v16si)_mm512_mullo_epi32(__A, __B), 1472 (__v16si)__W); 1473 } 1474 1475 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1476 _mm512_mullox_epi64 (__m512i __A, __m512i __B) { 1477 return (__m512i) ((__v8du) __A * (__v8du) __B); 1478 } 1479 1480 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1481 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { 1482 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1483 (__v8di)_mm512_mullox_epi64(__A, __B), 1484 (__v8di)__W); 1485 } 1486 1487 #define _mm512_sqrt_round_pd(A, R) \ 1488 (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)) 1489 1490 #define _mm512_mask_sqrt_round_pd(W, U, A, R) \ 1491 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1492 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1493 (__v8df)(__m512d)(W)) 1494 1495 #define _mm512_maskz_sqrt_round_pd(U, A, R) \ 1496 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1497 (__v8df)_mm512_sqrt_round_pd((A), (R)), \ 1498 (__v8df)_mm512_setzero_pd()) 1499 1500 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1501 _mm512_sqrt_pd(__m512d __A) 1502 { 1503 return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, 1504 _MM_FROUND_CUR_DIRECTION); 1505 } 1506 1507 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1508 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) 1509 { 1510 return (__m512d)__builtin_ia32_selectpd_512(__U, 1511 (__v8df)_mm512_sqrt_pd(__A), 1512 (__v8df)__W); 1513 } 1514 1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1516 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) 1517 { 1518 return (__m512d)__builtin_ia32_selectpd_512(__U, 1519 (__v8df)_mm512_sqrt_pd(__A), 1520 (__v8df)_mm512_setzero_pd()); 1521 } 1522 1523 #define _mm512_sqrt_round_ps(A, R) \ 1524 (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)) 1525 1526 #define _mm512_mask_sqrt_round_ps(W, U, A, R) \ 1527 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1528 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1529 (__v16sf)(__m512)(W)) 1530 1531 #define _mm512_maskz_sqrt_round_ps(U, A, R) \ 1532 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1533 (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ 1534 (__v16sf)_mm512_setzero_ps()) 1535 1536 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1537 _mm512_sqrt_ps(__m512 __A) 1538 { 1539 return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, 1540 _MM_FROUND_CUR_DIRECTION); 1541 } 1542 1543 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1544 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) 1545 { 1546 return (__m512)__builtin_ia32_selectps_512(__U, 1547 (__v16sf)_mm512_sqrt_ps(__A), 1548 (__v16sf)__W); 1549 } 1550 1551 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1552 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) 1553 { 1554 return (__m512)__builtin_ia32_selectps_512(__U, 1555 (__v16sf)_mm512_sqrt_ps(__A), 1556 (__v16sf)_mm512_setzero_ps()); 1557 } 1558 1559 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1560 _mm512_rsqrt14_pd(__m512d __A) 1561 { 1562 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1563 (__v8df) 1564 _mm512_setzero_pd (), 1565 (__mmask8) -1);} 1566 1567 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1568 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1569 { 1570 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1571 (__v8df) __W, 1572 (__mmask8) __U); 1573 } 1574 1575 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1576 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) 1577 { 1578 return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, 1579 (__v8df) 1580 _mm512_setzero_pd (), 1581 (__mmask8) __U); 1582 } 1583 1584 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1585 _mm512_rsqrt14_ps(__m512 __A) 1586 { 1587 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1588 (__v16sf) 1589 _mm512_setzero_ps (), 1590 (__mmask16) -1); 1591 } 1592 1593 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1594 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1595 { 1596 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1597 (__v16sf) __W, 1598 (__mmask16) __U); 1599 } 1600 1601 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1602 _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) 1603 { 1604 return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, 1605 (__v16sf) 1606 _mm512_setzero_ps (), 1607 (__mmask16) __U); 1608 } 1609 1610 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1611 _mm_rsqrt14_ss(__m128 __A, __m128 __B) 1612 { 1613 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1614 (__v4sf) __B, 1615 (__v4sf) 1616 _mm_setzero_ps (), 1617 (__mmask8) -1); 1618 } 1619 1620 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1621 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1622 { 1623 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1624 (__v4sf) __B, 1625 (__v4sf) __W, 1626 (__mmask8) __U); 1627 } 1628 1629 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1630 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1631 { 1632 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, 1633 (__v4sf) __B, 1634 (__v4sf) _mm_setzero_ps (), 1635 (__mmask8) __U); 1636 } 1637 1638 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1639 _mm_rsqrt14_sd(__m128d __A, __m128d __B) 1640 { 1641 return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, 1642 (__v2df) __B, 1643 (__v2df) 1644 _mm_setzero_pd (), 1645 (__mmask8) -1); 1646 } 1647 1648 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1649 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1650 { 1651 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1652 (__v2df) __B, 1653 (__v2df) __W, 1654 (__mmask8) __U); 1655 } 1656 1657 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1658 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1659 { 1660 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, 1661 (__v2df) __B, 1662 (__v2df) _mm_setzero_pd (), 1663 (__mmask8) __U); 1664 } 1665 1666 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1667 _mm512_rcp14_pd(__m512d __A) 1668 { 1669 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1670 (__v8df) 1671 _mm512_setzero_pd (), 1672 (__mmask8) -1); 1673 } 1674 1675 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1676 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1677 { 1678 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1679 (__v8df) __W, 1680 (__mmask8) __U); 1681 } 1682 1683 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1684 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) 1685 { 1686 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1687 (__v8df) 1688 _mm512_setzero_pd (), 1689 (__mmask8) __U); 1690 } 1691 1692 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1693 _mm512_rcp14_ps(__m512 __A) 1694 { 1695 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1696 (__v16sf) 1697 _mm512_setzero_ps (), 1698 (__mmask16) -1); 1699 } 1700 1701 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1702 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1703 { 1704 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1705 (__v16sf) __W, 1706 (__mmask16) __U); 1707 } 1708 1709 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1710 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) 1711 { 1712 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1713 (__v16sf) 1714 _mm512_setzero_ps (), 1715 (__mmask16) __U); 1716 } 1717 1718 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1719 _mm_rcp14_ss(__m128 __A, __m128 __B) 1720 { 1721 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1722 (__v4sf) __B, 1723 (__v4sf) 1724 _mm_setzero_ps (), 1725 (__mmask8) -1); 1726 } 1727 1728 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1729 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1730 { 1731 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1732 (__v4sf) __B, 1733 (__v4sf) __W, 1734 (__mmask8) __U); 1735 } 1736 1737 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1738 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1739 { 1740 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1741 (__v4sf) __B, 1742 (__v4sf) _mm_setzero_ps (), 1743 (__mmask8) __U); 1744 } 1745 1746 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1747 _mm_rcp14_sd(__m128d __A, __m128d __B) 1748 { 1749 return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, 1750 (__v2df) __B, 1751 (__v2df) 1752 _mm_setzero_pd (), 1753 (__mmask8) -1); 1754 } 1755 1756 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1757 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1758 { 1759 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1760 (__v2df) __B, 1761 (__v2df) __W, 1762 (__mmask8) __U); 1763 } 1764 1765 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1766 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1767 { 1768 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1769 (__v2df) __B, 1770 (__v2df) _mm_setzero_pd (), 1771 (__mmask8) __U); 1772 } 1773 1774 static __inline __m512 __DEFAULT_FN_ATTRS512 1775 _mm512_floor_ps(__m512 __A) 1776 { 1777 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1778 _MM_FROUND_FLOOR, 1779 (__v16sf) __A, -1, 1780 _MM_FROUND_CUR_DIRECTION); 1781 } 1782 1783 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1784 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) 1785 { 1786 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1787 _MM_FROUND_FLOOR, 1788 (__v16sf) __W, __U, 1789 _MM_FROUND_CUR_DIRECTION); 1790 } 1791 1792 static __inline __m512d __DEFAULT_FN_ATTRS512 1793 _mm512_floor_pd(__m512d __A) 1794 { 1795 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1796 _MM_FROUND_FLOOR, 1797 (__v8df) __A, -1, 1798 _MM_FROUND_CUR_DIRECTION); 1799 } 1800 1801 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1802 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) 1803 { 1804 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1805 _MM_FROUND_FLOOR, 1806 (__v8df) __W, __U, 1807 _MM_FROUND_CUR_DIRECTION); 1808 } 1809 1810 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1811 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) 1812 { 1813 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1814 _MM_FROUND_CEIL, 1815 (__v16sf) __W, __U, 1816 _MM_FROUND_CUR_DIRECTION); 1817 } 1818 1819 static __inline __m512 __DEFAULT_FN_ATTRS512 1820 _mm512_ceil_ps(__m512 __A) 1821 { 1822 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1823 _MM_FROUND_CEIL, 1824 (__v16sf) __A, -1, 1825 _MM_FROUND_CUR_DIRECTION); 1826 } 1827 1828 static __inline __m512d __DEFAULT_FN_ATTRS512 1829 _mm512_ceil_pd(__m512d __A) 1830 { 1831 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1832 _MM_FROUND_CEIL, 1833 (__v8df) __A, -1, 1834 _MM_FROUND_CUR_DIRECTION); 1835 } 1836 1837 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1838 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) 1839 { 1840 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1841 _MM_FROUND_CEIL, 1842 (__v8df) __W, __U, 1843 _MM_FROUND_CUR_DIRECTION); 1844 } 1845 1846 static __inline __m512i __DEFAULT_FN_ATTRS512 1847 _mm512_abs_epi64(__m512i __A) 1848 { 1849 return (__m512i)__builtin_ia32_pabsq512((__v8di)__A); 1850 } 1851 1852 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1853 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 1854 { 1855 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1856 (__v8di)_mm512_abs_epi64(__A), 1857 (__v8di)__W); 1858 } 1859 1860 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1861 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) 1862 { 1863 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1864 (__v8di)_mm512_abs_epi64(__A), 1865 (__v8di)_mm512_setzero_si512()); 1866 } 1867 1868 static __inline __m512i __DEFAULT_FN_ATTRS512 1869 _mm512_abs_epi32(__m512i __A) 1870 { 1871 return (__m512i)__builtin_ia32_pabsd512((__v16si) __A); 1872 } 1873 1874 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1875 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 1876 { 1877 return (__m512i)__builtin_ia32_selectd_512(__U, 1878 (__v16si)_mm512_abs_epi32(__A), 1879 (__v16si)__W); 1880 } 1881 1882 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1883 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) 1884 { 1885 return (__m512i)__builtin_ia32_selectd_512(__U, 1886 (__v16si)_mm512_abs_epi32(__A), 1887 (__v16si)_mm512_setzero_si512()); 1888 } 1889 1890 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1891 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1892 __A = _mm_add_ss(__A, __B); 1893 return __builtin_ia32_selectss_128(__U, __A, __W); 1894 } 1895 1896 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1897 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1898 __A = _mm_add_ss(__A, __B); 1899 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 1900 } 1901 1902 #define _mm_add_round_ss(A, B, R) \ 1903 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1904 (__v4sf)(__m128)(B), \ 1905 (__v4sf)_mm_setzero_ps(), \ 1906 (__mmask8)-1, (int)(R)) 1907 1908 #define _mm_mask_add_round_ss(W, U, A, B, R) \ 1909 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1910 (__v4sf)(__m128)(B), \ 1911 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1912 (int)(R)) 1913 1914 #define _mm_maskz_add_round_ss(U, A, B, R) \ 1915 (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1916 (__v4sf)(__m128)(B), \ 1917 (__v4sf)_mm_setzero_ps(), \ 1918 (__mmask8)(U), (int)(R)) 1919 1920 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1921 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1922 __A = _mm_add_sd(__A, __B); 1923 return __builtin_ia32_selectsd_128(__U, __A, __W); 1924 } 1925 1926 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1927 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1928 __A = _mm_add_sd(__A, __B); 1929 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 1930 } 1931 #define _mm_add_round_sd(A, B, R) \ 1932 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1933 (__v2df)(__m128d)(B), \ 1934 (__v2df)_mm_setzero_pd(), \ 1935 (__mmask8)-1, (int)(R)) 1936 1937 #define _mm_mask_add_round_sd(W, U, A, B, R) \ 1938 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1939 (__v2df)(__m128d)(B), \ 1940 (__v2df)(__m128d)(W), \ 1941 (__mmask8)(U), (int)(R)) 1942 1943 #define _mm_maskz_add_round_sd(U, A, B, R) \ 1944 (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1945 (__v2df)(__m128d)(B), \ 1946 (__v2df)_mm_setzero_pd(), \ 1947 (__mmask8)(U), (int)(R)) 1948 1949 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1950 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 1951 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1952 (__v8df)_mm512_add_pd(__A, __B), 1953 (__v8df)__W); 1954 } 1955 1956 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1957 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { 1958 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1959 (__v8df)_mm512_add_pd(__A, __B), 1960 (__v8df)_mm512_setzero_pd()); 1961 } 1962 1963 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1964 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 1965 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1966 (__v16sf)_mm512_add_ps(__A, __B), 1967 (__v16sf)__W); 1968 } 1969 1970 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1971 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { 1972 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1973 (__v16sf)_mm512_add_ps(__A, __B), 1974 (__v16sf)_mm512_setzero_ps()); 1975 } 1976 1977 #define _mm512_add_round_pd(A, B, R) \ 1978 (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ 1979 (__v8df)(__m512d)(B), (int)(R)) 1980 1981 #define _mm512_mask_add_round_pd(W, U, A, B, R) \ 1982 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1983 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1984 (__v8df)(__m512d)(W)) 1985 1986 #define _mm512_maskz_add_round_pd(U, A, B, R) \ 1987 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1988 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1989 (__v8df)_mm512_setzero_pd()) 1990 1991 #define _mm512_add_round_ps(A, B, R) \ 1992 (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ 1993 (__v16sf)(__m512)(B), (int)(R)) 1994 1995 #define _mm512_mask_add_round_ps(W, U, A, B, R) \ 1996 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1997 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 1998 (__v16sf)(__m512)(W)) 1999 2000 #define _mm512_maskz_add_round_ps(U, A, B, R) \ 2001 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2002 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2003 (__v16sf)_mm512_setzero_ps()) 2004 2005 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2006 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2007 __A = _mm_sub_ss(__A, __B); 2008 return __builtin_ia32_selectss_128(__U, __A, __W); 2009 } 2010 2011 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2012 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2013 __A = _mm_sub_ss(__A, __B); 2014 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2015 } 2016 #define _mm_sub_round_ss(A, B, R) \ 2017 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2018 (__v4sf)(__m128)(B), \ 2019 (__v4sf)_mm_setzero_ps(), \ 2020 (__mmask8)-1, (int)(R)) 2021 2022 #define _mm_mask_sub_round_ss(W, U, A, B, R) \ 2023 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2024 (__v4sf)(__m128)(B), \ 2025 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2026 (int)(R)) 2027 2028 #define _mm_maskz_sub_round_ss(U, A, B, R) \ 2029 (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2030 (__v4sf)(__m128)(B), \ 2031 (__v4sf)_mm_setzero_ps(), \ 2032 (__mmask8)(U), (int)(R)) 2033 2034 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2035 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2036 __A = _mm_sub_sd(__A, __B); 2037 return __builtin_ia32_selectsd_128(__U, __A, __W); 2038 } 2039 2040 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2041 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2042 __A = _mm_sub_sd(__A, __B); 2043 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2044 } 2045 2046 #define _mm_sub_round_sd(A, B, R) \ 2047 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2048 (__v2df)(__m128d)(B), \ 2049 (__v2df)_mm_setzero_pd(), \ 2050 (__mmask8)-1, (int)(R)) 2051 2052 #define _mm_mask_sub_round_sd(W, U, A, B, R) \ 2053 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2054 (__v2df)(__m128d)(B), \ 2055 (__v2df)(__m128d)(W), \ 2056 (__mmask8)(U), (int)(R)) 2057 2058 #define _mm_maskz_sub_round_sd(U, A, B, R) \ 2059 (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2060 (__v2df)(__m128d)(B), \ 2061 (__v2df)_mm_setzero_pd(), \ 2062 (__mmask8)(U), (int)(R)) 2063 2064 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2065 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2066 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2067 (__v8df)_mm512_sub_pd(__A, __B), 2068 (__v8df)__W); 2069 } 2070 2071 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2072 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2073 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2074 (__v8df)_mm512_sub_pd(__A, __B), 2075 (__v8df)_mm512_setzero_pd()); 2076 } 2077 2078 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2079 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2080 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2081 (__v16sf)_mm512_sub_ps(__A, __B), 2082 (__v16sf)__W); 2083 } 2084 2085 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2086 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2087 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2088 (__v16sf)_mm512_sub_ps(__A, __B), 2089 (__v16sf)_mm512_setzero_ps()); 2090 } 2091 2092 #define _mm512_sub_round_pd(A, B, R) \ 2093 (__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ 2094 (__v8df)(__m512d)(B), (int)(R)) 2095 2096 #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ 2097 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2098 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2099 (__v8df)(__m512d)(W)) 2100 2101 #define _mm512_maskz_sub_round_pd(U, A, B, R) \ 2102 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2103 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2104 (__v8df)_mm512_setzero_pd()) 2105 2106 #define _mm512_sub_round_ps(A, B, R) \ 2107 (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ 2108 (__v16sf)(__m512)(B), (int)(R)) 2109 2110 #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ 2111 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2112 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2113 (__v16sf)(__m512)(W)) 2114 2115 #define _mm512_maskz_sub_round_ps(U, A, B, R) \ 2116 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2117 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2118 (__v16sf)_mm512_setzero_ps()) 2119 2120 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2121 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2122 __A = _mm_mul_ss(__A, __B); 2123 return __builtin_ia32_selectss_128(__U, __A, __W); 2124 } 2125 2126 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2127 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2128 __A = _mm_mul_ss(__A, __B); 2129 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2130 } 2131 #define _mm_mul_round_ss(A, B, R) \ 2132 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2133 (__v4sf)(__m128)(B), \ 2134 (__v4sf)_mm_setzero_ps(), \ 2135 (__mmask8)-1, (int)(R)) 2136 2137 #define _mm_mask_mul_round_ss(W, U, A, B, R) \ 2138 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2139 (__v4sf)(__m128)(B), \ 2140 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2141 (int)(R)) 2142 2143 #define _mm_maskz_mul_round_ss(U, A, B, R) \ 2144 (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2145 (__v4sf)(__m128)(B), \ 2146 (__v4sf)_mm_setzero_ps(), \ 2147 (__mmask8)(U), (int)(R)) 2148 2149 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2150 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2151 __A = _mm_mul_sd(__A, __B); 2152 return __builtin_ia32_selectsd_128(__U, __A, __W); 2153 } 2154 2155 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2156 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2157 __A = _mm_mul_sd(__A, __B); 2158 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2159 } 2160 2161 #define _mm_mul_round_sd(A, B, R) \ 2162 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2163 (__v2df)(__m128d)(B), \ 2164 (__v2df)_mm_setzero_pd(), \ 2165 (__mmask8)-1, (int)(R)) 2166 2167 #define _mm_mask_mul_round_sd(W, U, A, B, R) \ 2168 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2169 (__v2df)(__m128d)(B), \ 2170 (__v2df)(__m128d)(W), \ 2171 (__mmask8)(U), (int)(R)) 2172 2173 #define _mm_maskz_mul_round_sd(U, A, B, R) \ 2174 (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2175 (__v2df)(__m128d)(B), \ 2176 (__v2df)_mm_setzero_pd(), \ 2177 (__mmask8)(U), (int)(R)) 2178 2179 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2180 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2181 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2182 (__v8df)_mm512_mul_pd(__A, __B), 2183 (__v8df)__W); 2184 } 2185 2186 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2187 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2188 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2189 (__v8df)_mm512_mul_pd(__A, __B), 2190 (__v8df)_mm512_setzero_pd()); 2191 } 2192 2193 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2194 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2195 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2196 (__v16sf)_mm512_mul_ps(__A, __B), 2197 (__v16sf)__W); 2198 } 2199 2200 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2201 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2202 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2203 (__v16sf)_mm512_mul_ps(__A, __B), 2204 (__v16sf)_mm512_setzero_ps()); 2205 } 2206 2207 #define _mm512_mul_round_pd(A, B, R) \ 2208 (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ 2209 (__v8df)(__m512d)(B), (int)(R)) 2210 2211 #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ 2212 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2213 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2214 (__v8df)(__m512d)(W)) 2215 2216 #define _mm512_maskz_mul_round_pd(U, A, B, R) \ 2217 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2218 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2219 (__v8df)_mm512_setzero_pd()) 2220 2221 #define _mm512_mul_round_ps(A, B, R) \ 2222 (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ 2223 (__v16sf)(__m512)(B), (int)(R)) 2224 2225 #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ 2226 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2227 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2228 (__v16sf)(__m512)(W)) 2229 2230 #define _mm512_maskz_mul_round_ps(U, A, B, R) \ 2231 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2232 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2233 (__v16sf)_mm512_setzero_ps()) 2234 2235 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2236 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2237 __A = _mm_div_ss(__A, __B); 2238 return __builtin_ia32_selectss_128(__U, __A, __W); 2239 } 2240 2241 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2242 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2243 __A = _mm_div_ss(__A, __B); 2244 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2245 } 2246 2247 #define _mm_div_round_ss(A, B, R) \ 2248 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2249 (__v4sf)(__m128)(B), \ 2250 (__v4sf)_mm_setzero_ps(), \ 2251 (__mmask8)-1, (int)(R)) 2252 2253 #define _mm_mask_div_round_ss(W, U, A, B, R) \ 2254 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2255 (__v4sf)(__m128)(B), \ 2256 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2257 (int)(R)) 2258 2259 #define _mm_maskz_div_round_ss(U, A, B, R) \ 2260 (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2261 (__v4sf)(__m128)(B), \ 2262 (__v4sf)_mm_setzero_ps(), \ 2263 (__mmask8)(U), (int)(R)) 2264 2265 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2266 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2267 __A = _mm_div_sd(__A, __B); 2268 return __builtin_ia32_selectsd_128(__U, __A, __W); 2269 } 2270 2271 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2272 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2273 __A = _mm_div_sd(__A, __B); 2274 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2275 } 2276 2277 #define _mm_div_round_sd(A, B, R) \ 2278 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2279 (__v2df)(__m128d)(B), \ 2280 (__v2df)_mm_setzero_pd(), \ 2281 (__mmask8)-1, (int)(R)) 2282 2283 #define _mm_mask_div_round_sd(W, U, A, B, R) \ 2284 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2285 (__v2df)(__m128d)(B), \ 2286 (__v2df)(__m128d)(W), \ 2287 (__mmask8)(U), (int)(R)) 2288 2289 #define _mm_maskz_div_round_sd(U, A, B, R) \ 2290 (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2291 (__v2df)(__m128d)(B), \ 2292 (__v2df)_mm_setzero_pd(), \ 2293 (__mmask8)(U), (int)(R)) 2294 2295 static __inline __m512d __DEFAULT_FN_ATTRS512 2296 _mm512_div_pd(__m512d __a, __m512d __b) 2297 { 2298 return (__m512d)((__v8df)__a/(__v8df)__b); 2299 } 2300 2301 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2302 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2303 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2304 (__v8df)_mm512_div_pd(__A, __B), 2305 (__v8df)__W); 2306 } 2307 2308 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2309 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2310 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2311 (__v8df)_mm512_div_pd(__A, __B), 2312 (__v8df)_mm512_setzero_pd()); 2313 } 2314 2315 static __inline __m512 __DEFAULT_FN_ATTRS512 2316 _mm512_div_ps(__m512 __a, __m512 __b) 2317 { 2318 return (__m512)((__v16sf)__a/(__v16sf)__b); 2319 } 2320 2321 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2322 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2323 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2324 (__v16sf)_mm512_div_ps(__A, __B), 2325 (__v16sf)__W); 2326 } 2327 2328 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2329 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2330 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2331 (__v16sf)_mm512_div_ps(__A, __B), 2332 (__v16sf)_mm512_setzero_ps()); 2333 } 2334 2335 #define _mm512_div_round_pd(A, B, R) \ 2336 (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ 2337 (__v8df)(__m512d)(B), (int)(R)) 2338 2339 #define _mm512_mask_div_round_pd(W, U, A, B, R) \ 2340 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2341 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2342 (__v8df)(__m512d)(W)) 2343 2344 #define _mm512_maskz_div_round_pd(U, A, B, R) \ 2345 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2346 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2347 (__v8df)_mm512_setzero_pd()) 2348 2349 #define _mm512_div_round_ps(A, B, R) \ 2350 (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ 2351 (__v16sf)(__m512)(B), (int)(R)) 2352 2353 #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 2354 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2355 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2356 (__v16sf)(__m512)(W)) 2357 2358 #define _mm512_maskz_div_round_ps(U, A, B, R) \ 2359 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2360 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2361 (__v16sf)_mm512_setzero_ps()) 2362 2363 #define _mm512_roundscale_ps(A, B) \ 2364 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ 2365 (__v16sf)_mm512_undefined_ps(), \ 2366 (__mmask16)-1, \ 2367 _MM_FROUND_CUR_DIRECTION) 2368 2369 #define _mm512_mask_roundscale_ps(A, B, C, imm) \ 2370 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2371 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2372 _MM_FROUND_CUR_DIRECTION) 2373 2374 #define _mm512_maskz_roundscale_ps(A, B, imm) \ 2375 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2376 (__v16sf)_mm512_setzero_ps(), \ 2377 (__mmask16)(A), \ 2378 _MM_FROUND_CUR_DIRECTION) 2379 2380 #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ 2381 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2382 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2383 (int)(R)) 2384 2385 #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ 2386 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2387 (__v16sf)_mm512_setzero_ps(), \ 2388 (__mmask16)(A), (int)(R)) 2389 2390 #define _mm512_roundscale_round_ps(A, imm, R) \ 2391 (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ 2392 (__v16sf)_mm512_undefined_ps(), \ 2393 (__mmask16)-1, (int)(R)) 2394 2395 #define _mm512_roundscale_pd(A, B) \ 2396 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ 2397 (__v8df)_mm512_undefined_pd(), \ 2398 (__mmask8)-1, \ 2399 _MM_FROUND_CUR_DIRECTION) 2400 2401 #define _mm512_mask_roundscale_pd(A, B, C, imm) \ 2402 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2403 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2404 _MM_FROUND_CUR_DIRECTION) 2405 2406 #define _mm512_maskz_roundscale_pd(A, B, imm) \ 2407 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2408 (__v8df)_mm512_setzero_pd(), \ 2409 (__mmask8)(A), \ 2410 _MM_FROUND_CUR_DIRECTION) 2411 2412 #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ 2413 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2414 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2415 (int)(R)) 2416 2417 #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ 2418 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2419 (__v8df)_mm512_setzero_pd(), \ 2420 (__mmask8)(A), (int)(R)) 2421 2422 #define _mm512_roundscale_round_pd(A, imm, R) \ 2423 (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ 2424 (__v8df)_mm512_undefined_pd(), \ 2425 (__mmask8)-1, (int)(R)) 2426 2427 #define _mm512_fmadd_round_pd(A, B, C, R) \ 2428 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2429 (__v8df)(__m512d)(B), \ 2430 (__v8df)(__m512d)(C), \ 2431 (__mmask8)-1, (int)(R)) 2432 2433 2434 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ 2435 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2436 (__v8df)(__m512d)(B), \ 2437 (__v8df)(__m512d)(C), \ 2438 (__mmask8)(U), (int)(R)) 2439 2440 2441 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ 2442 (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ 2443 (__v8df)(__m512d)(B), \ 2444 (__v8df)(__m512d)(C), \ 2445 (__mmask8)(U), (int)(R)) 2446 2447 2448 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ 2449 (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2450 (__v8df)(__m512d)(B), \ 2451 (__v8df)(__m512d)(C), \ 2452 (__mmask8)(U), (int)(R)) 2453 2454 2455 #define _mm512_fmsub_round_pd(A, B, C, R) \ 2456 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2457 (__v8df)(__m512d)(B), \ 2458 -(__v8df)(__m512d)(C), \ 2459 (__mmask8)-1, (int)(R)) 2460 2461 2462 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ 2463 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2464 (__v8df)(__m512d)(B), \ 2465 -(__v8df)(__m512d)(C), \ 2466 (__mmask8)(U), (int)(R)) 2467 2468 2469 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ 2470 (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2471 (__v8df)(__m512d)(B), \ 2472 -(__v8df)(__m512d)(C), \ 2473 (__mmask8)(U), (int)(R)) 2474 2475 2476 #define _mm512_fnmadd_round_pd(A, B, C, R) \ 2477 (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2478 (__v8df)(__m512d)(B), \ 2479 (__v8df)(__m512d)(C), \ 2480 (__mmask8)-1, (int)(R)) 2481 2482 2483 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ 2484 (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ 2485 (__v8df)(__m512d)(B), \ 2486 (__v8df)(__m512d)(C), \ 2487 (__mmask8)(U), (int)(R)) 2488 2489 2490 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ 2491 (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2492 (__v8df)(__m512d)(B), \ 2493 (__v8df)(__m512d)(C), \ 2494 (__mmask8)(U), (int)(R)) 2495 2496 2497 #define _mm512_fnmsub_round_pd(A, B, C, R) \ 2498 (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2499 (__v8df)(__m512d)(B), \ 2500 -(__v8df)(__m512d)(C), \ 2501 (__mmask8)-1, (int)(R)) 2502 2503 2504 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ 2505 (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2506 (__v8df)(__m512d)(B), \ 2507 -(__v8df)(__m512d)(C), \ 2508 (__mmask8)(U), (int)(R)) 2509 2510 2511 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2512 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2513 { 2514 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2515 (__v8df) __B, 2516 (__v8df) __C, 2517 (__mmask8) -1, 2518 _MM_FROUND_CUR_DIRECTION); 2519 } 2520 2521 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2522 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2523 { 2524 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2525 (__v8df) __B, 2526 (__v8df) __C, 2527 (__mmask8) __U, 2528 _MM_FROUND_CUR_DIRECTION); 2529 } 2530 2531 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2532 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2533 { 2534 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, 2535 (__v8df) __B, 2536 (__v8df) __C, 2537 (__mmask8) __U, 2538 _MM_FROUND_CUR_DIRECTION); 2539 } 2540 2541 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2542 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2543 { 2544 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2545 (__v8df) __B, 2546 (__v8df) __C, 2547 (__mmask8) __U, 2548 _MM_FROUND_CUR_DIRECTION); 2549 } 2550 2551 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2552 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2553 { 2554 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2555 (__v8df) __B, 2556 -(__v8df) __C, 2557 (__mmask8) -1, 2558 _MM_FROUND_CUR_DIRECTION); 2559 } 2560 2561 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2562 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2563 { 2564 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2565 (__v8df) __B, 2566 -(__v8df) __C, 2567 (__mmask8) __U, 2568 _MM_FROUND_CUR_DIRECTION); 2569 } 2570 2571 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2572 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2573 { 2574 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2575 (__v8df) __B, 2576 -(__v8df) __C, 2577 (__mmask8) __U, 2578 _MM_FROUND_CUR_DIRECTION); 2579 } 2580 2581 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2582 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2583 { 2584 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2585 -(__v8df) __B, 2586 (__v8df) __C, 2587 (__mmask8) -1, 2588 _MM_FROUND_CUR_DIRECTION); 2589 } 2590 2591 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2592 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2593 { 2594 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, 2595 (__v8df) __B, 2596 (__v8df) __C, 2597 (__mmask8) __U, 2598 _MM_FROUND_CUR_DIRECTION); 2599 } 2600 2601 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2602 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2603 { 2604 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2605 (__v8df) __B, 2606 (__v8df) __C, 2607 (__mmask8) __U, 2608 _MM_FROUND_CUR_DIRECTION); 2609 } 2610 2611 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2612 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2613 { 2614 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2615 -(__v8df) __B, 2616 -(__v8df) __C, 2617 (__mmask8) -1, 2618 _MM_FROUND_CUR_DIRECTION); 2619 } 2620 2621 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2622 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2623 { 2624 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2625 (__v8df) __B, 2626 -(__v8df) __C, 2627 (__mmask8) __U, 2628 _MM_FROUND_CUR_DIRECTION); 2629 } 2630 2631 #define _mm512_fmadd_round_ps(A, B, C, R) \ 2632 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2633 (__v16sf)(__m512)(B), \ 2634 (__v16sf)(__m512)(C), \ 2635 (__mmask16)-1, (int)(R)) 2636 2637 2638 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ 2639 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2640 (__v16sf)(__m512)(B), \ 2641 (__v16sf)(__m512)(C), \ 2642 (__mmask16)(U), (int)(R)) 2643 2644 2645 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ 2646 (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ 2647 (__v16sf)(__m512)(B), \ 2648 (__v16sf)(__m512)(C), \ 2649 (__mmask16)(U), (int)(R)) 2650 2651 2652 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ 2653 (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2654 (__v16sf)(__m512)(B), \ 2655 (__v16sf)(__m512)(C), \ 2656 (__mmask16)(U), (int)(R)) 2657 2658 2659 #define _mm512_fmsub_round_ps(A, B, C, R) \ 2660 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2661 (__v16sf)(__m512)(B), \ 2662 -(__v16sf)(__m512)(C), \ 2663 (__mmask16)-1, (int)(R)) 2664 2665 2666 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ 2667 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2668 (__v16sf)(__m512)(B), \ 2669 -(__v16sf)(__m512)(C), \ 2670 (__mmask16)(U), (int)(R)) 2671 2672 2673 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ 2674 (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2675 (__v16sf)(__m512)(B), \ 2676 -(__v16sf)(__m512)(C), \ 2677 (__mmask16)(U), (int)(R)) 2678 2679 2680 #define _mm512_fnmadd_round_ps(A, B, C, R) \ 2681 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2682 -(__v16sf)(__m512)(B), \ 2683 (__v16sf)(__m512)(C), \ 2684 (__mmask16)-1, (int)(R)) 2685 2686 2687 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ 2688 (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ 2689 (__v16sf)(__m512)(B), \ 2690 (__v16sf)(__m512)(C), \ 2691 (__mmask16)(U), (int)(R)) 2692 2693 2694 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ 2695 (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2696 (__v16sf)(__m512)(B), \ 2697 (__v16sf)(__m512)(C), \ 2698 (__mmask16)(U), (int)(R)) 2699 2700 2701 #define _mm512_fnmsub_round_ps(A, B, C, R) \ 2702 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2703 -(__v16sf)(__m512)(B), \ 2704 -(__v16sf)(__m512)(C), \ 2705 (__mmask16)-1, (int)(R)) 2706 2707 2708 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ 2709 (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2710 (__v16sf)(__m512)(B), \ 2711 -(__v16sf)(__m512)(C), \ 2712 (__mmask16)(U), (int)(R)) 2713 2714 2715 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2716 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2717 { 2718 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2719 (__v16sf) __B, 2720 (__v16sf) __C, 2721 (__mmask16) -1, 2722 _MM_FROUND_CUR_DIRECTION); 2723 } 2724 2725 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2726 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2727 { 2728 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2729 (__v16sf) __B, 2730 (__v16sf) __C, 2731 (__mmask16) __U, 2732 _MM_FROUND_CUR_DIRECTION); 2733 } 2734 2735 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2736 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2737 { 2738 return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, 2739 (__v16sf) __B, 2740 (__v16sf) __C, 2741 (__mmask16) __U, 2742 _MM_FROUND_CUR_DIRECTION); 2743 } 2744 2745 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2746 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2747 { 2748 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2749 (__v16sf) __B, 2750 (__v16sf) __C, 2751 (__mmask16) __U, 2752 _MM_FROUND_CUR_DIRECTION); 2753 } 2754 2755 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2756 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2757 { 2758 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2759 (__v16sf) __B, 2760 -(__v16sf) __C, 2761 (__mmask16) -1, 2762 _MM_FROUND_CUR_DIRECTION); 2763 } 2764 2765 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2766 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2767 { 2768 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2769 (__v16sf) __B, 2770 -(__v16sf) __C, 2771 (__mmask16) __U, 2772 _MM_FROUND_CUR_DIRECTION); 2773 } 2774 2775 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2776 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2777 { 2778 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2779 (__v16sf) __B, 2780 -(__v16sf) __C, 2781 (__mmask16) __U, 2782 _MM_FROUND_CUR_DIRECTION); 2783 } 2784 2785 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2786 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2787 { 2788 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2789 -(__v16sf) __B, 2790 (__v16sf) __C, 2791 (__mmask16) -1, 2792 _MM_FROUND_CUR_DIRECTION); 2793 } 2794 2795 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2796 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2797 { 2798 return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, 2799 (__v16sf) __B, 2800 (__v16sf) __C, 2801 (__mmask16) __U, 2802 _MM_FROUND_CUR_DIRECTION); 2803 } 2804 2805 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2806 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2807 { 2808 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2809 (__v16sf) __B, 2810 (__v16sf) __C, 2811 (__mmask16) __U, 2812 _MM_FROUND_CUR_DIRECTION); 2813 } 2814 2815 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2816 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2817 { 2818 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2819 -(__v16sf) __B, 2820 -(__v16sf) __C, 2821 (__mmask16) -1, 2822 _MM_FROUND_CUR_DIRECTION); 2823 } 2824 2825 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2826 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2827 { 2828 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2829 (__v16sf) __B, 2830 -(__v16sf) __C, 2831 (__mmask16) __U, 2832 _MM_FROUND_CUR_DIRECTION); 2833 } 2834 2835 #define _mm512_fmaddsub_round_pd(A, B, C, R) \ 2836 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2837 (__v8df)(__m512d)(B), \ 2838 (__v8df)(__m512d)(C), \ 2839 (__mmask8)-1, (int)(R)) 2840 2841 2842 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ 2843 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2844 (__v8df)(__m512d)(B), \ 2845 (__v8df)(__m512d)(C), \ 2846 (__mmask8)(U), (int)(R)) 2847 2848 2849 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ 2850 (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ 2851 (__v8df)(__m512d)(B), \ 2852 (__v8df)(__m512d)(C), \ 2853 (__mmask8)(U), (int)(R)) 2854 2855 2856 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ 2857 (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2858 (__v8df)(__m512d)(B), \ 2859 (__v8df)(__m512d)(C), \ 2860 (__mmask8)(U), (int)(R)) 2861 2862 2863 #define _mm512_fmsubadd_round_pd(A, B, C, R) \ 2864 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2865 (__v8df)(__m512d)(B), \ 2866 -(__v8df)(__m512d)(C), \ 2867 (__mmask8)-1, (int)(R)) 2868 2869 2870 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ 2871 (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2872 (__v8df)(__m512d)(B), \ 2873 -(__v8df)(__m512d)(C), \ 2874 (__mmask8)(U), (int)(R)) 2875 2876 2877 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ 2878 (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2879 (__v8df)(__m512d)(B), \ 2880 -(__v8df)(__m512d)(C), \ 2881 (__mmask8)(U), (int)(R)) 2882 2883 2884 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2885 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) 2886 { 2887 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2888 (__v8df) __B, 2889 (__v8df) __C, 2890 (__mmask8) -1, 2891 _MM_FROUND_CUR_DIRECTION); 2892 } 2893 2894 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2895 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2896 { 2897 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2898 (__v8df) __B, 2899 (__v8df) __C, 2900 (__mmask8) __U, 2901 _MM_FROUND_CUR_DIRECTION); 2902 } 2903 2904 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2905 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2906 { 2907 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, 2908 (__v8df) __B, 2909 (__v8df) __C, 2910 (__mmask8) __U, 2911 _MM_FROUND_CUR_DIRECTION); 2912 } 2913 2914 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2915 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2916 { 2917 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2918 (__v8df) __B, 2919 (__v8df) __C, 2920 (__mmask8) __U, 2921 _MM_FROUND_CUR_DIRECTION); 2922 } 2923 2924 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2925 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) 2926 { 2927 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2928 (__v8df) __B, 2929 -(__v8df) __C, 2930 (__mmask8) -1, 2931 _MM_FROUND_CUR_DIRECTION); 2932 } 2933 2934 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2935 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2936 { 2937 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2938 (__v8df) __B, 2939 -(__v8df) __C, 2940 (__mmask8) __U, 2941 _MM_FROUND_CUR_DIRECTION); 2942 } 2943 2944 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2945 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2946 { 2947 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2948 (__v8df) __B, 2949 -(__v8df) __C, 2950 (__mmask8) __U, 2951 _MM_FROUND_CUR_DIRECTION); 2952 } 2953 2954 #define _mm512_fmaddsub_round_ps(A, B, C, R) \ 2955 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2956 (__v16sf)(__m512)(B), \ 2957 (__v16sf)(__m512)(C), \ 2958 (__mmask16)-1, (int)(R)) 2959 2960 2961 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ 2962 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2963 (__v16sf)(__m512)(B), \ 2964 (__v16sf)(__m512)(C), \ 2965 (__mmask16)(U), (int)(R)) 2966 2967 2968 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ 2969 (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ 2970 (__v16sf)(__m512)(B), \ 2971 (__v16sf)(__m512)(C), \ 2972 (__mmask16)(U), (int)(R)) 2973 2974 2975 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ 2976 (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2977 (__v16sf)(__m512)(B), \ 2978 (__v16sf)(__m512)(C), \ 2979 (__mmask16)(U), (int)(R)) 2980 2981 2982 #define _mm512_fmsubadd_round_ps(A, B, C, R) \ 2983 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2984 (__v16sf)(__m512)(B), \ 2985 -(__v16sf)(__m512)(C), \ 2986 (__mmask16)-1, (int)(R)) 2987 2988 2989 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ 2990 (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2991 (__v16sf)(__m512)(B), \ 2992 -(__v16sf)(__m512)(C), \ 2993 (__mmask16)(U), (int)(R)) 2994 2995 2996 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ 2997 (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2998 (__v16sf)(__m512)(B), \ 2999 -(__v16sf)(__m512)(C), \ 3000 (__mmask16)(U), (int)(R)) 3001 3002 3003 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3004 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) 3005 { 3006 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3007 (__v16sf) __B, 3008 (__v16sf) __C, 3009 (__mmask16) -1, 3010 _MM_FROUND_CUR_DIRECTION); 3011 } 3012 3013 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3014 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3015 { 3016 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3017 (__v16sf) __B, 3018 (__v16sf) __C, 3019 (__mmask16) __U, 3020 _MM_FROUND_CUR_DIRECTION); 3021 } 3022 3023 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3024 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3025 { 3026 return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, 3027 (__v16sf) __B, 3028 (__v16sf) __C, 3029 (__mmask16) __U, 3030 _MM_FROUND_CUR_DIRECTION); 3031 } 3032 3033 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3034 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3035 { 3036 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3037 (__v16sf) __B, 3038 (__v16sf) __C, 3039 (__mmask16) __U, 3040 _MM_FROUND_CUR_DIRECTION); 3041 } 3042 3043 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3044 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) 3045 { 3046 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3047 (__v16sf) __B, 3048 -(__v16sf) __C, 3049 (__mmask16) -1, 3050 _MM_FROUND_CUR_DIRECTION); 3051 } 3052 3053 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3054 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3055 { 3056 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3057 (__v16sf) __B, 3058 -(__v16sf) __C, 3059 (__mmask16) __U, 3060 _MM_FROUND_CUR_DIRECTION); 3061 } 3062 3063 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3064 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3065 { 3066 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3067 (__v16sf) __B, 3068 -(__v16sf) __C, 3069 (__mmask16) __U, 3070 _MM_FROUND_CUR_DIRECTION); 3071 } 3072 3073 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ 3074 (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ 3075 (__v8df)(__m512d)(B), \ 3076 (__v8df)(__m512d)(C), \ 3077 (__mmask8)(U), (int)(R)) 3078 3079 3080 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3081 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3082 { 3083 return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, 3084 (__v8df) __B, 3085 (__v8df) __C, 3086 (__mmask8) __U, 3087 _MM_FROUND_CUR_DIRECTION); 3088 } 3089 3090 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ 3091 (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ 3092 (__v16sf)(__m512)(B), \ 3093 (__v16sf)(__m512)(C), \ 3094 (__mmask16)(U), (int)(R)) 3095 3096 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3097 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3098 { 3099 return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, 3100 (__v16sf) __B, 3101 (__v16sf) __C, 3102 (__mmask16) __U, 3103 _MM_FROUND_CUR_DIRECTION); 3104 } 3105 3106 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ 3107 (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ 3108 (__v8df)(__m512d)(B), \ 3109 (__v8df)(__m512d)(C), \ 3110 (__mmask8)(U), (int)(R)) 3111 3112 3113 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3114 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3115 { 3116 return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, 3117 (__v8df) __B, 3118 (__v8df) __C, 3119 (__mmask8) __U, 3120 _MM_FROUND_CUR_DIRECTION); 3121 } 3122 3123 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ 3124 (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ 3125 (__v16sf)(__m512)(B), \ 3126 (__v16sf)(__m512)(C), \ 3127 (__mmask16)(U), (int)(R)) 3128 3129 3130 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3131 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3132 { 3133 return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, 3134 (__v16sf) __B, 3135 (__v16sf) __C, 3136 (__mmask16) __U, 3137 _MM_FROUND_CUR_DIRECTION); 3138 } 3139 3140 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ 3141 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3142 -(__v8df)(__m512d)(B), \ 3143 (__v8df)(__m512d)(C), \ 3144 (__mmask8)(U), (int)(R)) 3145 3146 3147 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3148 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3149 { 3150 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3151 -(__v8df) __B, 3152 (__v8df) __C, 3153 (__mmask8) __U, 3154 _MM_FROUND_CUR_DIRECTION); 3155 } 3156 3157 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ 3158 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3159 -(__v16sf)(__m512)(B), \ 3160 (__v16sf)(__m512)(C), \ 3161 (__mmask16)(U), (int)(R)) 3162 3163 3164 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3165 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3166 { 3167 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3168 -(__v16sf) __B, 3169 (__v16sf) __C, 3170 (__mmask16) __U, 3171 _MM_FROUND_CUR_DIRECTION); 3172 } 3173 3174 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ 3175 (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3176 -(__v8df)(__m512d)(B), \ 3177 -(__v8df)(__m512d)(C), \ 3178 (__mmask8)(U), (int)(R)) 3179 3180 3181 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ 3182 (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ 3183 (__v8df)(__m512d)(B), \ 3184 (__v8df)(__m512d)(C), \ 3185 (__mmask8)(U), (int)(R)) 3186 3187 3188 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3189 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3190 { 3191 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3192 -(__v8df) __B, 3193 -(__v8df) __C, 3194 (__mmask8) __U, 3195 _MM_FROUND_CUR_DIRECTION); 3196 } 3197 3198 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3199 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3200 { 3201 return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, 3202 (__v8df) __B, 3203 (__v8df) __C, 3204 (__mmask8) __U, 3205 _MM_FROUND_CUR_DIRECTION); 3206 } 3207 3208 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ 3209 (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3210 -(__v16sf)(__m512)(B), \ 3211 -(__v16sf)(__m512)(C), \ 3212 (__mmask16)(U), (int)(R)) 3213 3214 3215 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ 3216 (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ 3217 (__v16sf)(__m512)(B), \ 3218 (__v16sf)(__m512)(C), \ 3219 (__mmask16)(U), (int)(R)) 3220 3221 3222 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3223 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3224 { 3225 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3226 -(__v16sf) __B, 3227 -(__v16sf) __C, 3228 (__mmask16) __U, 3229 _MM_FROUND_CUR_DIRECTION); 3230 } 3231 3232 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3233 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3234 { 3235 return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, 3236 (__v16sf) __B, 3237 (__v16sf) __C, 3238 (__mmask16) __U, 3239 _MM_FROUND_CUR_DIRECTION); 3240 } 3241 3242 3243 3244 /* Vector permutations */ 3245 3246 static __inline __m512i __DEFAULT_FN_ATTRS512 3247 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) 3248 { 3249 return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, 3250 (__v16si) __B); 3251 } 3252 3253 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3254 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, 3255 __m512i __B) 3256 { 3257 return (__m512i)__builtin_ia32_selectd_512(__U, 3258 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3259 (__v16si)__A); 3260 } 3261 3262 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3263 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, 3264 __m512i __B) 3265 { 3266 return (__m512i)__builtin_ia32_selectd_512(__U, 3267 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3268 (__v16si)__I); 3269 } 3270 3271 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3272 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, 3273 __m512i __B) 3274 { 3275 return (__m512i)__builtin_ia32_selectd_512(__U, 3276 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3277 (__v16si)_mm512_setzero_si512()); 3278 } 3279 3280 static __inline __m512i __DEFAULT_FN_ATTRS512 3281 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) 3282 { 3283 return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, 3284 (__v8di) __B); 3285 } 3286 3287 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3288 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, 3289 __m512i __B) 3290 { 3291 return (__m512i)__builtin_ia32_selectq_512(__U, 3292 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3293 (__v8di)__A); 3294 } 3295 3296 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3297 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, 3298 __m512i __B) 3299 { 3300 return (__m512i)__builtin_ia32_selectq_512(__U, 3301 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3302 (__v8di)__I); 3303 } 3304 3305 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3306 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, 3307 __m512i __B) 3308 { 3309 return (__m512i)__builtin_ia32_selectq_512(__U, 3310 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3311 (__v8di)_mm512_setzero_si512()); 3312 } 3313 3314 #define _mm512_alignr_epi64(A, B, I) \ 3315 (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ 3316 (__v8di)(__m512i)(B), (int)(I)) 3317 3318 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ 3319 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3320 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3321 (__v8di)(__m512i)(W)) 3322 3323 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ 3324 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3325 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3326 (__v8di)_mm512_setzero_si512()) 3327 3328 #define _mm512_alignr_epi32(A, B, I) \ 3329 (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ 3330 (__v16si)(__m512i)(B), (int)(I)) 3331 3332 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ 3333 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3334 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3335 (__v16si)(__m512i)(W)) 3336 3337 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ 3338 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3339 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3340 (__v16si)_mm512_setzero_si512()) 3341 /* Vector Extract */ 3342 3343 #define _mm512_extractf64x4_pd(A, I) \ 3344 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ 3345 (__v4df)_mm256_undefined_pd(), \ 3346 (__mmask8)-1) 3347 3348 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ 3349 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3350 (__v4df)(__m256d)(W), \ 3351 (__mmask8)(U)) 3352 3353 #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ 3354 (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3355 (__v4df)_mm256_setzero_pd(), \ 3356 (__mmask8)(U)) 3357 3358 #define _mm512_extractf32x4_ps(A, I) \ 3359 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ 3360 (__v4sf)_mm_undefined_ps(), \ 3361 (__mmask8)-1) 3362 3363 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ 3364 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3365 (__v4sf)(__m128)(W), \ 3366 (__mmask8)(U)) 3367 3368 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ 3369 (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3370 (__v4sf)_mm_setzero_ps(), \ 3371 (__mmask8)(U)) 3372 3373 /* Vector Blend */ 3374 3375 static __inline __m512d __DEFAULT_FN_ATTRS512 3376 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) 3377 { 3378 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 3379 (__v8df) __W, 3380 (__v8df) __A); 3381 } 3382 3383 static __inline __m512 __DEFAULT_FN_ATTRS512 3384 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) 3385 { 3386 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 3387 (__v16sf) __W, 3388 (__v16sf) __A); 3389 } 3390 3391 static __inline __m512i __DEFAULT_FN_ATTRS512 3392 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) 3393 { 3394 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 3395 (__v8di) __W, 3396 (__v8di) __A); 3397 } 3398 3399 static __inline __m512i __DEFAULT_FN_ATTRS512 3400 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) 3401 { 3402 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 3403 (__v16si) __W, 3404 (__v16si) __A); 3405 } 3406 3407 /* Compare */ 3408 3409 #define _mm512_cmp_round_ps_mask(A, B, P, R) \ 3410 (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3411 (__v16sf)(__m512)(B), (int)(P), \ 3412 (__mmask16)-1, (int)(R)) 3413 3414 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ 3415 (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3416 (__v16sf)(__m512)(B), (int)(P), \ 3417 (__mmask16)(U), (int)(R)) 3418 3419 #define _mm512_cmp_ps_mask(A, B, P) \ 3420 _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3421 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ 3422 _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3423 3424 #define _mm512_cmpeq_ps_mask(A, B) \ 3425 _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) 3426 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \ 3427 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) 3428 3429 #define _mm512_cmplt_ps_mask(A, B) \ 3430 _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) 3431 #define _mm512_mask_cmplt_ps_mask(k, A, B) \ 3432 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) 3433 3434 #define _mm512_cmple_ps_mask(A, B) \ 3435 _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) 3436 #define _mm512_mask_cmple_ps_mask(k, A, B) \ 3437 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) 3438 3439 #define _mm512_cmpunord_ps_mask(A, B) \ 3440 _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) 3441 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \ 3442 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) 3443 3444 #define _mm512_cmpneq_ps_mask(A, B) \ 3445 _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) 3446 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \ 3447 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) 3448 3449 #define _mm512_cmpnlt_ps_mask(A, B) \ 3450 _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) 3451 #define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ 3452 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) 3453 3454 #define _mm512_cmpnle_ps_mask(A, B) \ 3455 _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) 3456 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \ 3457 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) 3458 3459 #define _mm512_cmpord_ps_mask(A, B) \ 3460 _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) 3461 #define _mm512_mask_cmpord_ps_mask(k, A, B) \ 3462 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) 3463 3464 #define _mm512_cmp_round_pd_mask(A, B, P, R) \ 3465 (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3466 (__v8df)(__m512d)(B), (int)(P), \ 3467 (__mmask8)-1, (int)(R)) 3468 3469 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ 3470 (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3471 (__v8df)(__m512d)(B), (int)(P), \ 3472 (__mmask8)(U), (int)(R)) 3473 3474 #define _mm512_cmp_pd_mask(A, B, P) \ 3475 _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3476 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ 3477 _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3478 3479 #define _mm512_cmpeq_pd_mask(A, B) \ 3480 _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) 3481 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \ 3482 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) 3483 3484 #define _mm512_cmplt_pd_mask(A, B) \ 3485 _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) 3486 #define _mm512_mask_cmplt_pd_mask(k, A, B) \ 3487 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) 3488 3489 #define _mm512_cmple_pd_mask(A, B) \ 3490 _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) 3491 #define _mm512_mask_cmple_pd_mask(k, A, B) \ 3492 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) 3493 3494 #define _mm512_cmpunord_pd_mask(A, B) \ 3495 _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) 3496 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \ 3497 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) 3498 3499 #define _mm512_cmpneq_pd_mask(A, B) \ 3500 _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) 3501 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \ 3502 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) 3503 3504 #define _mm512_cmpnlt_pd_mask(A, B) \ 3505 _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) 3506 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ 3507 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) 3508 3509 #define _mm512_cmpnle_pd_mask(A, B) \ 3510 _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) 3511 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \ 3512 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) 3513 3514 #define _mm512_cmpord_pd_mask(A, B) \ 3515 _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) 3516 #define _mm512_mask_cmpord_pd_mask(k, A, B) \ 3517 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) 3518 3519 /* Conversion */ 3520 3521 #define _mm512_cvtt_roundps_epu32(A, R) \ 3522 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3523 (__v16si)_mm512_undefined_epi32(), \ 3524 (__mmask16)-1, (int)(R)) 3525 3526 #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ 3527 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3528 (__v16si)(__m512i)(W), \ 3529 (__mmask16)(U), (int)(R)) 3530 3531 #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ 3532 (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3533 (__v16si)_mm512_setzero_si512(), \ 3534 (__mmask16)(U), (int)(R)) 3535 3536 3537 static __inline __m512i __DEFAULT_FN_ATTRS512 3538 _mm512_cvttps_epu32(__m512 __A) 3539 { 3540 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3541 (__v16si) 3542 _mm512_setzero_si512 (), 3543 (__mmask16) -1, 3544 _MM_FROUND_CUR_DIRECTION); 3545 } 3546 3547 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3548 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 3549 { 3550 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3551 (__v16si) __W, 3552 (__mmask16) __U, 3553 _MM_FROUND_CUR_DIRECTION); 3554 } 3555 3556 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3557 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) 3558 { 3559 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3560 (__v16si) _mm512_setzero_si512 (), 3561 (__mmask16) __U, 3562 _MM_FROUND_CUR_DIRECTION); 3563 } 3564 3565 #define _mm512_cvt_roundepi32_ps(A, R) \ 3566 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3567 (__v16sf)_mm512_setzero_ps(), \ 3568 (__mmask16)-1, (int)(R)) 3569 3570 #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ 3571 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3572 (__v16sf)(__m512)(W), \ 3573 (__mmask16)(U), (int)(R)) 3574 3575 #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ 3576 (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3577 (__v16sf)_mm512_setzero_ps(), \ 3578 (__mmask16)(U), (int)(R)) 3579 3580 #define _mm512_cvt_roundepu32_ps(A, R) \ 3581 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3582 (__v16sf)_mm512_setzero_ps(), \ 3583 (__mmask16)-1, (int)(R)) 3584 3585 #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ 3586 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3587 (__v16sf)(__m512)(W), \ 3588 (__mmask16)(U), (int)(R)) 3589 3590 #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ 3591 (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3592 (__v16sf)_mm512_setzero_ps(), \ 3593 (__mmask16)(U), (int)(R)) 3594 3595 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3596 _mm512_cvtepu32_ps (__m512i __A) 3597 { 3598 return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); 3599 } 3600 3601 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3602 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3603 { 3604 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3605 (__v16sf)_mm512_cvtepu32_ps(__A), 3606 (__v16sf)__W); 3607 } 3608 3609 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3610 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) 3611 { 3612 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3613 (__v16sf)_mm512_cvtepu32_ps(__A), 3614 (__v16sf)_mm512_setzero_ps()); 3615 } 3616 3617 static __inline __m512d __DEFAULT_FN_ATTRS512 3618 _mm512_cvtepi32_pd(__m256i __A) 3619 { 3620 return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); 3621 } 3622 3623 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3624 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3625 { 3626 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3627 (__v8df)_mm512_cvtepi32_pd(__A), 3628 (__v8df)__W); 3629 } 3630 3631 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3632 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) 3633 { 3634 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3635 (__v8df)_mm512_cvtepi32_pd(__A), 3636 (__v8df)_mm512_setzero_pd()); 3637 } 3638 3639 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3640 _mm512_cvtepi32lo_pd(__m512i __A) 3641 { 3642 return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); 3643 } 3644 3645 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3646 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3647 { 3648 return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3649 } 3650 3651 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3652 _mm512_cvtepi32_ps (__m512i __A) 3653 { 3654 return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); 3655 } 3656 3657 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3658 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3659 { 3660 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3661 (__v16sf)_mm512_cvtepi32_ps(__A), 3662 (__v16sf)__W); 3663 } 3664 3665 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3666 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) 3667 { 3668 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3669 (__v16sf)_mm512_cvtepi32_ps(__A), 3670 (__v16sf)_mm512_setzero_ps()); 3671 } 3672 3673 static __inline __m512d __DEFAULT_FN_ATTRS512 3674 _mm512_cvtepu32_pd(__m256i __A) 3675 { 3676 return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); 3677 } 3678 3679 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3680 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3681 { 3682 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3683 (__v8df)_mm512_cvtepu32_pd(__A), 3684 (__v8df)__W); 3685 } 3686 3687 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3688 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) 3689 { 3690 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3691 (__v8df)_mm512_cvtepu32_pd(__A), 3692 (__v8df)_mm512_setzero_pd()); 3693 } 3694 3695 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3696 _mm512_cvtepu32lo_pd(__m512i __A) 3697 { 3698 return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); 3699 } 3700 3701 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3702 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) 3703 { 3704 return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3705 } 3706 3707 #define _mm512_cvt_roundpd_ps(A, R) \ 3708 (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3709 (__v8sf)_mm256_setzero_ps(), \ 3710 (__mmask8)-1, (int)(R)) 3711 3712 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ 3713 (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3714 (__v8sf)(__m256)(W), (__mmask8)(U), \ 3715 (int)(R)) 3716 3717 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ 3718 (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3719 (__v8sf)_mm256_setzero_ps(), \ 3720 (__mmask8)(U), (int)(R)) 3721 3722 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3723 _mm512_cvtpd_ps (__m512d __A) 3724 { 3725 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3726 (__v8sf) _mm256_undefined_ps (), 3727 (__mmask8) -1, 3728 _MM_FROUND_CUR_DIRECTION); 3729 } 3730 3731 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3732 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) 3733 { 3734 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3735 (__v8sf) __W, 3736 (__mmask8) __U, 3737 _MM_FROUND_CUR_DIRECTION); 3738 } 3739 3740 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3741 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) 3742 { 3743 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3744 (__v8sf) _mm256_setzero_ps (), 3745 (__mmask8) __U, 3746 _MM_FROUND_CUR_DIRECTION); 3747 } 3748 3749 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3750 _mm512_cvtpd_pslo (__m512d __A) 3751 { 3752 return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), 3753 (__v8sf) _mm256_setzero_ps (), 3754 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3755 } 3756 3757 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3758 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) 3759 { 3760 return (__m512) __builtin_shufflevector ( 3761 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), 3762 __U, __A), 3763 (__v8sf) _mm256_setzero_ps (), 3764 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3765 } 3766 3767 #define _mm512_cvt_roundps_ph(A, I) \ 3768 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3769 (__v16hi)_mm256_undefined_si256(), \ 3770 (__mmask16)-1) 3771 3772 #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ 3773 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3774 (__v16hi)(__m256i)(U), \ 3775 (__mmask16)(W)) 3776 3777 #define _mm512_maskz_cvt_roundps_ph(W, A, I) \ 3778 (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3779 (__v16hi)_mm256_setzero_si256(), \ 3780 (__mmask16)(W)) 3781 3782 #define _mm512_cvtps_ph _mm512_cvt_roundps_ph 3783 #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph 3784 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph 3785 3786 #define _mm512_cvt_roundph_ps(A, R) \ 3787 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3788 (__v16sf)_mm512_undefined_ps(), \ 3789 (__mmask16)-1, (int)(R)) 3790 3791 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ 3792 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3793 (__v16sf)(__m512)(W), \ 3794 (__mmask16)(U), (int)(R)) 3795 3796 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ 3797 (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3798 (__v16sf)_mm512_setzero_ps(), \ 3799 (__mmask16)(U), (int)(R)) 3800 3801 3802 static __inline __m512 __DEFAULT_FN_ATTRS512 3803 _mm512_cvtph_ps(__m256i __A) 3804 { 3805 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3806 (__v16sf) 3807 _mm512_setzero_ps (), 3808 (__mmask16) -1, 3809 _MM_FROUND_CUR_DIRECTION); 3810 } 3811 3812 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3813 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) 3814 { 3815 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3816 (__v16sf) __W, 3817 (__mmask16) __U, 3818 _MM_FROUND_CUR_DIRECTION); 3819 } 3820 3821 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3822 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) 3823 { 3824 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3825 (__v16sf) _mm512_setzero_ps (), 3826 (__mmask16) __U, 3827 _MM_FROUND_CUR_DIRECTION); 3828 } 3829 3830 #define _mm512_cvtt_roundpd_epi32(A, R) \ 3831 (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3832 (__v8si)_mm256_setzero_si256(), \ 3833 (__mmask8)-1, (int)(R)) 3834 3835 #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ 3836 (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3837 (__v8si)(__m256i)(W), \ 3838 (__mmask8)(U), (int)(R)) 3839 3840 #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ 3841 (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3842 (__v8si)_mm256_setzero_si256(), \ 3843 (__mmask8)(U), (int)(R)) 3844 3845 static __inline __m256i __DEFAULT_FN_ATTRS512 3846 _mm512_cvttpd_epi32(__m512d __a) 3847 { 3848 return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, 3849 (__v8si)_mm256_setzero_si256(), 3850 (__mmask8) -1, 3851 _MM_FROUND_CUR_DIRECTION); 3852 } 3853 3854 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3855 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3856 { 3857 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3858 (__v8si) __W, 3859 (__mmask8) __U, 3860 _MM_FROUND_CUR_DIRECTION); 3861 } 3862 3863 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3864 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) 3865 { 3866 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3867 (__v8si) _mm256_setzero_si256 (), 3868 (__mmask8) __U, 3869 _MM_FROUND_CUR_DIRECTION); 3870 } 3871 3872 #define _mm512_cvtt_roundps_epi32(A, R) \ 3873 (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3874 (__v16si)_mm512_setzero_si512(), \ 3875 (__mmask16)-1, (int)(R)) 3876 3877 #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ 3878 (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3879 (__v16si)(__m512i)(W), \ 3880 (__mmask16)(U), (int)(R)) 3881 3882 #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ 3883 (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3884 (__v16si)_mm512_setzero_si512(), \ 3885 (__mmask16)(U), (int)(R)) 3886 3887 static __inline __m512i __DEFAULT_FN_ATTRS512 3888 _mm512_cvttps_epi32(__m512 __a) 3889 { 3890 return (__m512i) 3891 __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, 3892 (__v16si) _mm512_setzero_si512 (), 3893 (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); 3894 } 3895 3896 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3897 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3898 { 3899 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3900 (__v16si) __W, 3901 (__mmask16) __U, 3902 _MM_FROUND_CUR_DIRECTION); 3903 } 3904 3905 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3906 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) 3907 { 3908 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3909 (__v16si) _mm512_setzero_si512 (), 3910 (__mmask16) __U, 3911 _MM_FROUND_CUR_DIRECTION); 3912 } 3913 3914 #define _mm512_cvt_roundps_epi32(A, R) \ 3915 (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3916 (__v16si)_mm512_setzero_si512(), \ 3917 (__mmask16)-1, (int)(R)) 3918 3919 #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ 3920 (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3921 (__v16si)(__m512i)(W), \ 3922 (__mmask16)(U), (int)(R)) 3923 3924 #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ 3925 (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3926 (__v16si)_mm512_setzero_si512(), \ 3927 (__mmask16)(U), (int)(R)) 3928 3929 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3930 _mm512_cvtps_epi32 (__m512 __A) 3931 { 3932 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3933 (__v16si) _mm512_undefined_epi32 (), 3934 (__mmask16) -1, 3935 _MM_FROUND_CUR_DIRECTION); 3936 } 3937 3938 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3939 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3940 { 3941 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3942 (__v16si) __W, 3943 (__mmask16) __U, 3944 _MM_FROUND_CUR_DIRECTION); 3945 } 3946 3947 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3948 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) 3949 { 3950 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3951 (__v16si) 3952 _mm512_setzero_si512 (), 3953 (__mmask16) __U, 3954 _MM_FROUND_CUR_DIRECTION); 3955 } 3956 3957 #define _mm512_cvt_roundpd_epi32(A, R) \ 3958 (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3959 (__v8si)_mm256_setzero_si256(), \ 3960 (__mmask8)-1, (int)(R)) 3961 3962 #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ 3963 (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3964 (__v8si)(__m256i)(W), \ 3965 (__mmask8)(U), (int)(R)) 3966 3967 #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ 3968 (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3969 (__v8si)_mm256_setzero_si256(), \ 3970 (__mmask8)(U), (int)(R)) 3971 3972 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3973 _mm512_cvtpd_epi32 (__m512d __A) 3974 { 3975 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3976 (__v8si) 3977 _mm256_undefined_si256 (), 3978 (__mmask8) -1, 3979 _MM_FROUND_CUR_DIRECTION); 3980 } 3981 3982 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3983 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3984 { 3985 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3986 (__v8si) __W, 3987 (__mmask8) __U, 3988 _MM_FROUND_CUR_DIRECTION); 3989 } 3990 3991 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3992 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) 3993 { 3994 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3995 (__v8si) 3996 _mm256_setzero_si256 (), 3997 (__mmask8) __U, 3998 _MM_FROUND_CUR_DIRECTION); 3999 } 4000 4001 #define _mm512_cvt_roundps_epu32(A, R) \ 4002 (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4003 (__v16si)_mm512_setzero_si512(), \ 4004 (__mmask16)-1, (int)(R)) 4005 4006 #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ 4007 (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4008 (__v16si)(__m512i)(W), \ 4009 (__mmask16)(U), (int)(R)) 4010 4011 #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ 4012 (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4013 (__v16si)_mm512_setzero_si512(), \ 4014 (__mmask16)(U), (int)(R)) 4015 4016 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4017 _mm512_cvtps_epu32 ( __m512 __A) 4018 { 4019 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\ 4020 (__v16si)\ 4021 _mm512_undefined_epi32 (), 4022 (__mmask16) -1,\ 4023 _MM_FROUND_CUR_DIRECTION); 4024 } 4025 4026 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4027 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 4028 { 4029 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4030 (__v16si) __W, 4031 (__mmask16) __U, 4032 _MM_FROUND_CUR_DIRECTION); 4033 } 4034 4035 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4036 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) 4037 { 4038 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4039 (__v16si) 4040 _mm512_setzero_si512 (), 4041 (__mmask16) __U , 4042 _MM_FROUND_CUR_DIRECTION); 4043 } 4044 4045 #define _mm512_cvt_roundpd_epu32(A, R) \ 4046 (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4047 (__v8si)_mm256_setzero_si256(), \ 4048 (__mmask8)-1, (int)(R)) 4049 4050 #define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ 4051 (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4052 (__v8si)(__m256i)(W), \ 4053 (__mmask8)(U), (int)(R)) 4054 4055 #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ 4056 (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4057 (__v8si)_mm256_setzero_si256(), \ 4058 (__mmask8)(U), (int)(R)) 4059 4060 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4061 _mm512_cvtpd_epu32 (__m512d __A) 4062 { 4063 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4064 (__v8si) 4065 _mm256_undefined_si256 (), 4066 (__mmask8) -1, 4067 _MM_FROUND_CUR_DIRECTION); 4068 } 4069 4070 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4071 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) 4072 { 4073 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4074 (__v8si) __W, 4075 (__mmask8) __U, 4076 _MM_FROUND_CUR_DIRECTION); 4077 } 4078 4079 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4080 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) 4081 { 4082 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4083 (__v8si) 4084 _mm256_setzero_si256 (), 4085 (__mmask8) __U, 4086 _MM_FROUND_CUR_DIRECTION); 4087 } 4088 4089 static __inline__ double __DEFAULT_FN_ATTRS512 4090 _mm512_cvtsd_f64(__m512d __a) 4091 { 4092 return __a[0]; 4093 } 4094 4095 static __inline__ float __DEFAULT_FN_ATTRS512 4096 _mm512_cvtss_f32(__m512 __a) 4097 { 4098 return __a[0]; 4099 } 4100 4101 /* Unpack and Interleave */ 4102 4103 static __inline __m512d __DEFAULT_FN_ATTRS512 4104 _mm512_unpackhi_pd(__m512d __a, __m512d __b) 4105 { 4106 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4107 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4108 } 4109 4110 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4111 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4112 { 4113 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4114 (__v8df)_mm512_unpackhi_pd(__A, __B), 4115 (__v8df)__W); 4116 } 4117 4118 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4119 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) 4120 { 4121 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4122 (__v8df)_mm512_unpackhi_pd(__A, __B), 4123 (__v8df)_mm512_setzero_pd()); 4124 } 4125 4126 static __inline __m512d __DEFAULT_FN_ATTRS512 4127 _mm512_unpacklo_pd(__m512d __a, __m512d __b) 4128 { 4129 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4130 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4131 } 4132 4133 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4134 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4135 { 4136 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4137 (__v8df)_mm512_unpacklo_pd(__A, __B), 4138 (__v8df)__W); 4139 } 4140 4141 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4142 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) 4143 { 4144 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4145 (__v8df)_mm512_unpacklo_pd(__A, __B), 4146 (__v8df)_mm512_setzero_pd()); 4147 } 4148 4149 static __inline __m512 __DEFAULT_FN_ATTRS512 4150 _mm512_unpackhi_ps(__m512 __a, __m512 __b) 4151 { 4152 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4153 2, 18, 3, 19, 4154 2+4, 18+4, 3+4, 19+4, 4155 2+8, 18+8, 3+8, 19+8, 4156 2+12, 18+12, 3+12, 19+12); 4157 } 4158 4159 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4160 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4161 { 4162 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4163 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4164 (__v16sf)__W); 4165 } 4166 4167 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4168 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) 4169 { 4170 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4171 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4172 (__v16sf)_mm512_setzero_ps()); 4173 } 4174 4175 static __inline __m512 __DEFAULT_FN_ATTRS512 4176 _mm512_unpacklo_ps(__m512 __a, __m512 __b) 4177 { 4178 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4179 0, 16, 1, 17, 4180 0+4, 16+4, 1+4, 17+4, 4181 0+8, 16+8, 1+8, 17+8, 4182 0+12, 16+12, 1+12, 17+12); 4183 } 4184 4185 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4186 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4187 { 4188 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4189 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4190 (__v16sf)__W); 4191 } 4192 4193 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4194 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) 4195 { 4196 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4197 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4198 (__v16sf)_mm512_setzero_ps()); 4199 } 4200 4201 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4202 _mm512_unpackhi_epi32(__m512i __A, __m512i __B) 4203 { 4204 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4205 2, 18, 3, 19, 4206 2+4, 18+4, 3+4, 19+4, 4207 2+8, 18+8, 3+8, 19+8, 4208 2+12, 18+12, 3+12, 19+12); 4209 } 4210 4211 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4212 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4213 { 4214 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4215 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4216 (__v16si)__W); 4217 } 4218 4219 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4220 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4221 { 4222 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4223 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4224 (__v16si)_mm512_setzero_si512()); 4225 } 4226 4227 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4228 _mm512_unpacklo_epi32(__m512i __A, __m512i __B) 4229 { 4230 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4231 0, 16, 1, 17, 4232 0+4, 16+4, 1+4, 17+4, 4233 0+8, 16+8, 1+8, 17+8, 4234 0+12, 16+12, 1+12, 17+12); 4235 } 4236 4237 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4238 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4239 { 4240 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4241 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4242 (__v16si)__W); 4243 } 4244 4245 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4246 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4247 { 4248 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4249 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4250 (__v16si)_mm512_setzero_si512()); 4251 } 4252 4253 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4254 _mm512_unpackhi_epi64(__m512i __A, __m512i __B) 4255 { 4256 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4257 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4258 } 4259 4260 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4261 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4262 { 4263 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4264 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4265 (__v8di)__W); 4266 } 4267 4268 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4269 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) 4270 { 4271 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4272 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4273 (__v8di)_mm512_setzero_si512()); 4274 } 4275 4276 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4277 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B) 4278 { 4279 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4280 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4281 } 4282 4283 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4284 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4285 { 4286 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4287 (__v8di)_mm512_unpacklo_epi64(__A, __B), 4288 (__v8di)__W); 4289 } 4290 4291 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4292 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4293 { 4294 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4295 (__v8di)_mm512_unpacklo_epi64(__A, __B), 4296 (__v8di)_mm512_setzero_si512()); 4297 } 4298 4299 4300 /* SIMD load ops */ 4301 4302 static __inline __m512i __DEFAULT_FN_ATTRS512 4303 _mm512_loadu_si512 (void const *__P) 4304 { 4305 struct __loadu_si512 { 4306 __m512i_u __v; 4307 } __attribute__((__packed__, __may_alias__)); 4308 return ((const struct __loadu_si512*)__P)->__v; 4309 } 4310 4311 static __inline __m512i __DEFAULT_FN_ATTRS512 4312 _mm512_loadu_epi32 (void const *__P) 4313 { 4314 struct __loadu_epi32 { 4315 __m512i_u __v; 4316 } __attribute__((__packed__, __may_alias__)); 4317 return ((const struct __loadu_epi32*)__P)->__v; 4318 } 4319 4320 static __inline __m512i __DEFAULT_FN_ATTRS512 4321 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) 4322 { 4323 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, 4324 (__v16si) __W, 4325 (__mmask16) __U); 4326 } 4327 4328 4329 static __inline __m512i __DEFAULT_FN_ATTRS512 4330 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) 4331 { 4332 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, 4333 (__v16si) 4334 _mm512_setzero_si512 (), 4335 (__mmask16) __U); 4336 } 4337 4338 static __inline __m512i __DEFAULT_FN_ATTRS512 4339 _mm512_loadu_epi64 (void const *__P) 4340 { 4341 struct __loadu_epi64 { 4342 __m512i_u __v; 4343 } __attribute__((__packed__, __may_alias__)); 4344 return ((const struct __loadu_epi64*)__P)->__v; 4345 } 4346 4347 static __inline __m512i __DEFAULT_FN_ATTRS512 4348 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) 4349 { 4350 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, 4351 (__v8di) __W, 4352 (__mmask8) __U); 4353 } 4354 4355 static __inline __m512i __DEFAULT_FN_ATTRS512 4356 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) 4357 { 4358 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, 4359 (__v8di) 4360 _mm512_setzero_si512 (), 4361 (__mmask8) __U); 4362 } 4363 4364 static __inline __m512 __DEFAULT_FN_ATTRS512 4365 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) 4366 { 4367 return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, 4368 (__v16sf) __W, 4369 (__mmask16) __U); 4370 } 4371 4372 static __inline __m512 __DEFAULT_FN_ATTRS512 4373 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) 4374 { 4375 return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, 4376 (__v16sf) 4377 _mm512_setzero_ps (), 4378 (__mmask16) __U); 4379 } 4380 4381 static __inline __m512d __DEFAULT_FN_ATTRS512 4382 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) 4383 { 4384 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, 4385 (__v8df) __W, 4386 (__mmask8) __U); 4387 } 4388 4389 static __inline __m512d __DEFAULT_FN_ATTRS512 4390 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) 4391 { 4392 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P, 4393 (__v8df) 4394 _mm512_setzero_pd (), 4395 (__mmask8) __U); 4396 } 4397 4398 static __inline __m512d __DEFAULT_FN_ATTRS512 4399 _mm512_loadu_pd(void const *__p) 4400 { 4401 struct __loadu_pd { 4402 __m512d_u __v; 4403 } __attribute__((__packed__, __may_alias__)); 4404 return ((const struct __loadu_pd*)__p)->__v; 4405 } 4406 4407 static __inline __m512 __DEFAULT_FN_ATTRS512 4408 _mm512_loadu_ps(void const *__p) 4409 { 4410 struct __loadu_ps { 4411 __m512_u __v; 4412 } __attribute__((__packed__, __may_alias__)); 4413 return ((const struct __loadu_ps*)__p)->__v; 4414 } 4415 4416 static __inline __m512 __DEFAULT_FN_ATTRS512 4417 _mm512_load_ps(void const *__p) 4418 { 4419 return *(const __m512*)__p; 4420 } 4421 4422 static __inline __m512 __DEFAULT_FN_ATTRS512 4423 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) 4424 { 4425 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, 4426 (__v16sf) __W, 4427 (__mmask16) __U); 4428 } 4429 4430 static __inline __m512 __DEFAULT_FN_ATTRS512 4431 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) 4432 { 4433 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, 4434 (__v16sf) 4435 _mm512_setzero_ps (), 4436 (__mmask16) __U); 4437 } 4438 4439 static __inline __m512d __DEFAULT_FN_ATTRS512 4440 _mm512_load_pd(void const *__p) 4441 { 4442 return *(const __m512d*)__p; 4443 } 4444 4445 static __inline __m512d __DEFAULT_FN_ATTRS512 4446 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) 4447 { 4448 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, 4449 (__v8df) __W, 4450 (__mmask8) __U); 4451 } 4452 4453 static __inline __m512d __DEFAULT_FN_ATTRS512 4454 _mm512_maskz_load_pd(__mmask8 __U, void const *__P) 4455 { 4456 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, 4457 (__v8df) 4458 _mm512_setzero_pd (), 4459 (__mmask8) __U); 4460 } 4461 4462 static __inline __m512i __DEFAULT_FN_ATTRS512 4463 _mm512_load_si512 (void const *__P) 4464 { 4465 return *(const __m512i *) __P; 4466 } 4467 4468 static __inline __m512i __DEFAULT_FN_ATTRS512 4469 _mm512_load_epi32 (void const *__P) 4470 { 4471 return *(const __m512i *) __P; 4472 } 4473 4474 static __inline __m512i __DEFAULT_FN_ATTRS512 4475 _mm512_load_epi64 (void const *__P) 4476 { 4477 return *(const __m512i *) __P; 4478 } 4479 4480 /* SIMD store ops */ 4481 4482 static __inline void __DEFAULT_FN_ATTRS512 4483 _mm512_storeu_epi64 (void *__P, __m512i __A) 4484 { 4485 struct __storeu_epi64 { 4486 __m512i_u __v; 4487 } __attribute__((__packed__, __may_alias__)); 4488 ((struct __storeu_epi64*)__P)->__v = __A; 4489 } 4490 4491 static __inline void __DEFAULT_FN_ATTRS512 4492 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) 4493 { 4494 __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, 4495 (__mmask8) __U); 4496 } 4497 4498 static __inline void __DEFAULT_FN_ATTRS512 4499 _mm512_storeu_si512 (void *__P, __m512i __A) 4500 { 4501 struct __storeu_si512 { 4502 __m512i_u __v; 4503 } __attribute__((__packed__, __may_alias__)); 4504 ((struct __storeu_si512*)__P)->__v = __A; 4505 } 4506 4507 static __inline void __DEFAULT_FN_ATTRS512 4508 _mm512_storeu_epi32 (void *__P, __m512i __A) 4509 { 4510 struct __storeu_epi32 { 4511 __m512i_u __v; 4512 } __attribute__((__packed__, __may_alias__)); 4513 ((struct __storeu_epi32*)__P)->__v = __A; 4514 } 4515 4516 static __inline void __DEFAULT_FN_ATTRS512 4517 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) 4518 { 4519 __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, 4520 (__mmask16) __U); 4521 } 4522 4523 static __inline void __DEFAULT_FN_ATTRS512 4524 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) 4525 { 4526 __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); 4527 } 4528 4529 static __inline void __DEFAULT_FN_ATTRS512 4530 _mm512_storeu_pd(void *__P, __m512d __A) 4531 { 4532 struct __storeu_pd { 4533 __m512d_u __v; 4534 } __attribute__((__packed__, __may_alias__)); 4535 ((struct __storeu_pd*)__P)->__v = __A; 4536 } 4537 4538 static __inline void __DEFAULT_FN_ATTRS512 4539 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) 4540 { 4541 __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, 4542 (__mmask16) __U); 4543 } 4544 4545 static __inline void __DEFAULT_FN_ATTRS512 4546 _mm512_storeu_ps(void *__P, __m512 __A) 4547 { 4548 struct __storeu_ps { 4549 __m512_u __v; 4550 } __attribute__((__packed__, __may_alias__)); 4551 ((struct __storeu_ps*)__P)->__v = __A; 4552 } 4553 4554 static __inline void __DEFAULT_FN_ATTRS512 4555 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) 4556 { 4557 __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); 4558 } 4559 4560 static __inline void __DEFAULT_FN_ATTRS512 4561 _mm512_store_pd(void *__P, __m512d __A) 4562 { 4563 *(__m512d*)__P = __A; 4564 } 4565 4566 static __inline void __DEFAULT_FN_ATTRS512 4567 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) 4568 { 4569 __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, 4570 (__mmask16) __U); 4571 } 4572 4573 static __inline void __DEFAULT_FN_ATTRS512 4574 _mm512_store_ps(void *__P, __m512 __A) 4575 { 4576 *(__m512*)__P = __A; 4577 } 4578 4579 static __inline void __DEFAULT_FN_ATTRS512 4580 _mm512_store_si512 (void *__P, __m512i __A) 4581 { 4582 *(__m512i *) __P = __A; 4583 } 4584 4585 static __inline void __DEFAULT_FN_ATTRS512 4586 _mm512_store_epi32 (void *__P, __m512i __A) 4587 { 4588 *(__m512i *) __P = __A; 4589 } 4590 4591 static __inline void __DEFAULT_FN_ATTRS512 4592 _mm512_store_epi64 (void *__P, __m512i __A) 4593 { 4594 *(__m512i *) __P = __A; 4595 } 4596 4597 /* Mask ops */ 4598 4599 static __inline __mmask16 __DEFAULT_FN_ATTRS 4600 _mm512_knot(__mmask16 __M) 4601 { 4602 return __builtin_ia32_knothi(__M); 4603 } 4604 4605 /* Integer compare */ 4606 4607 #define _mm512_cmpeq_epi32_mask(A, B) \ 4608 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) 4609 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ 4610 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) 4611 #define _mm512_cmpge_epi32_mask(A, B) \ 4612 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) 4613 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \ 4614 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) 4615 #define _mm512_cmpgt_epi32_mask(A, B) \ 4616 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) 4617 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \ 4618 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) 4619 #define _mm512_cmple_epi32_mask(A, B) \ 4620 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) 4621 #define _mm512_mask_cmple_epi32_mask(k, A, B) \ 4622 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) 4623 #define _mm512_cmplt_epi32_mask(A, B) \ 4624 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) 4625 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \ 4626 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) 4627 #define _mm512_cmpneq_epi32_mask(A, B) \ 4628 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) 4629 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \ 4630 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) 4631 4632 #define _mm512_cmpeq_epu32_mask(A, B) \ 4633 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) 4634 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \ 4635 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) 4636 #define _mm512_cmpge_epu32_mask(A, B) \ 4637 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) 4638 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \ 4639 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) 4640 #define _mm512_cmpgt_epu32_mask(A, B) \ 4641 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) 4642 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \ 4643 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) 4644 #define _mm512_cmple_epu32_mask(A, B) \ 4645 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) 4646 #define _mm512_mask_cmple_epu32_mask(k, A, B) \ 4647 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) 4648 #define _mm512_cmplt_epu32_mask(A, B) \ 4649 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) 4650 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \ 4651 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) 4652 #define _mm512_cmpneq_epu32_mask(A, B) \ 4653 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) 4654 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \ 4655 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) 4656 4657 #define _mm512_cmpeq_epi64_mask(A, B) \ 4658 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) 4659 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \ 4660 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) 4661 #define _mm512_cmpge_epi64_mask(A, B) \ 4662 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) 4663 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \ 4664 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) 4665 #define _mm512_cmpgt_epi64_mask(A, B) \ 4666 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) 4667 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \ 4668 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) 4669 #define _mm512_cmple_epi64_mask(A, B) \ 4670 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) 4671 #define _mm512_mask_cmple_epi64_mask(k, A, B) \ 4672 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) 4673 #define _mm512_cmplt_epi64_mask(A, B) \ 4674 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) 4675 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \ 4676 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) 4677 #define _mm512_cmpneq_epi64_mask(A, B) \ 4678 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) 4679 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \ 4680 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) 4681 4682 #define _mm512_cmpeq_epu64_mask(A, B) \ 4683 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) 4684 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \ 4685 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) 4686 #define _mm512_cmpge_epu64_mask(A, B) \ 4687 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) 4688 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \ 4689 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) 4690 #define _mm512_cmpgt_epu64_mask(A, B) \ 4691 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) 4692 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \ 4693 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) 4694 #define _mm512_cmple_epu64_mask(A, B) \ 4695 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) 4696 #define _mm512_mask_cmple_epu64_mask(k, A, B) \ 4697 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) 4698 #define _mm512_cmplt_epu64_mask(A, B) \ 4699 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) 4700 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \ 4701 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) 4702 #define _mm512_cmpneq_epu64_mask(A, B) \ 4703 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) 4704 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ 4705 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) 4706 4707 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4708 _mm512_cvtepi8_epi32(__m128i __A) 4709 { 4710 /* This function always performs a signed extension, but __v16qi is a char 4711 which may be signed or unsigned, so use __v16qs. */ 4712 return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); 4713 } 4714 4715 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4716 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4717 { 4718 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4719 (__v16si)_mm512_cvtepi8_epi32(__A), 4720 (__v16si)__W); 4721 } 4722 4723 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4724 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) 4725 { 4726 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4727 (__v16si)_mm512_cvtepi8_epi32(__A), 4728 (__v16si)_mm512_setzero_si512()); 4729 } 4730 4731 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4732 _mm512_cvtepi8_epi64(__m128i __A) 4733 { 4734 /* This function always performs a signed extension, but __v16qi is a char 4735 which may be signed or unsigned, so use __v16qs. */ 4736 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4737 } 4738 4739 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4740 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4741 { 4742 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4743 (__v8di)_mm512_cvtepi8_epi64(__A), 4744 (__v8di)__W); 4745 } 4746 4747 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4748 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) 4749 { 4750 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4751 (__v8di)_mm512_cvtepi8_epi64(__A), 4752 (__v8di)_mm512_setzero_si512 ()); 4753 } 4754 4755 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4756 _mm512_cvtepi32_epi64(__m256i __X) 4757 { 4758 return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); 4759 } 4760 4761 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4762 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4763 { 4764 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4765 (__v8di)_mm512_cvtepi32_epi64(__X), 4766 (__v8di)__W); 4767 } 4768 4769 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4770 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) 4771 { 4772 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4773 (__v8di)_mm512_cvtepi32_epi64(__X), 4774 (__v8di)_mm512_setzero_si512()); 4775 } 4776 4777 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4778 _mm512_cvtepi16_epi32(__m256i __A) 4779 { 4780 return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); 4781 } 4782 4783 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4784 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4785 { 4786 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4787 (__v16si)_mm512_cvtepi16_epi32(__A), 4788 (__v16si)__W); 4789 } 4790 4791 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4792 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) 4793 { 4794 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4795 (__v16si)_mm512_cvtepi16_epi32(__A), 4796 (__v16si)_mm512_setzero_si512 ()); 4797 } 4798 4799 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4800 _mm512_cvtepi16_epi64(__m128i __A) 4801 { 4802 return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); 4803 } 4804 4805 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4806 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4807 { 4808 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4809 (__v8di)_mm512_cvtepi16_epi64(__A), 4810 (__v8di)__W); 4811 } 4812 4813 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4814 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) 4815 { 4816 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4817 (__v8di)_mm512_cvtepi16_epi64(__A), 4818 (__v8di)_mm512_setzero_si512()); 4819 } 4820 4821 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4822 _mm512_cvtepu8_epi32(__m128i __A) 4823 { 4824 return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); 4825 } 4826 4827 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4828 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4829 { 4830 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4831 (__v16si)_mm512_cvtepu8_epi32(__A), 4832 (__v16si)__W); 4833 } 4834 4835 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4836 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) 4837 { 4838 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4839 (__v16si)_mm512_cvtepu8_epi32(__A), 4840 (__v16si)_mm512_setzero_si512()); 4841 } 4842 4843 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4844 _mm512_cvtepu8_epi64(__m128i __A) 4845 { 4846 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4847 } 4848 4849 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4850 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4851 { 4852 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4853 (__v8di)_mm512_cvtepu8_epi64(__A), 4854 (__v8di)__W); 4855 } 4856 4857 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4858 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) 4859 { 4860 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4861 (__v8di)_mm512_cvtepu8_epi64(__A), 4862 (__v8di)_mm512_setzero_si512()); 4863 } 4864 4865 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4866 _mm512_cvtepu32_epi64(__m256i __X) 4867 { 4868 return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); 4869 } 4870 4871 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4872 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4873 { 4874 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4875 (__v8di)_mm512_cvtepu32_epi64(__X), 4876 (__v8di)__W); 4877 } 4878 4879 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4880 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) 4881 { 4882 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4883 (__v8di)_mm512_cvtepu32_epi64(__X), 4884 (__v8di)_mm512_setzero_si512()); 4885 } 4886 4887 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4888 _mm512_cvtepu16_epi32(__m256i __A) 4889 { 4890 return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); 4891 } 4892 4893 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4894 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4895 { 4896 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4897 (__v16si)_mm512_cvtepu16_epi32(__A), 4898 (__v16si)__W); 4899 } 4900 4901 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4902 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) 4903 { 4904 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4905 (__v16si)_mm512_cvtepu16_epi32(__A), 4906 (__v16si)_mm512_setzero_si512()); 4907 } 4908 4909 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4910 _mm512_cvtepu16_epi64(__m128i __A) 4911 { 4912 return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); 4913 } 4914 4915 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4916 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4917 { 4918 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4919 (__v8di)_mm512_cvtepu16_epi64(__A), 4920 (__v8di)__W); 4921 } 4922 4923 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4924 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) 4925 { 4926 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4927 (__v8di)_mm512_cvtepu16_epi64(__A), 4928 (__v8di)_mm512_setzero_si512()); 4929 } 4930 4931 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4932 _mm512_rorv_epi32 (__m512i __A, __m512i __B) 4933 { 4934 return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); 4935 } 4936 4937 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4938 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4939 { 4940 return (__m512i)__builtin_ia32_selectd_512(__U, 4941 (__v16si)_mm512_rorv_epi32(__A, __B), 4942 (__v16si)__W); 4943 } 4944 4945 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4946 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 4947 { 4948 return (__m512i)__builtin_ia32_selectd_512(__U, 4949 (__v16si)_mm512_rorv_epi32(__A, __B), 4950 (__v16si)_mm512_setzero_si512()); 4951 } 4952 4953 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4954 _mm512_rorv_epi64 (__m512i __A, __m512i __B) 4955 { 4956 return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); 4957 } 4958 4959 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4960 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4961 { 4962 return (__m512i)__builtin_ia32_selectq_512(__U, 4963 (__v8di)_mm512_rorv_epi64(__A, __B), 4964 (__v8di)__W); 4965 } 4966 4967 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4968 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4969 { 4970 return (__m512i)__builtin_ia32_selectq_512(__U, 4971 (__v8di)_mm512_rorv_epi64(__A, __B), 4972 (__v8di)_mm512_setzero_si512()); 4973 } 4974 4975 4976 4977 #define _mm512_cmp_epi32_mask(a, b, p) \ 4978 (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4979 (__v16si)(__m512i)(b), (int)(p), \ 4980 (__mmask16)-1) 4981 4982 #define _mm512_cmp_epu32_mask(a, b, p) \ 4983 (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 4984 (__v16si)(__m512i)(b), (int)(p), \ 4985 (__mmask16)-1) 4986 4987 #define _mm512_cmp_epi64_mask(a, b, p) \ 4988 (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 4989 (__v8di)(__m512i)(b), (int)(p), \ 4990 (__mmask8)-1) 4991 4992 #define _mm512_cmp_epu64_mask(a, b, p) \ 4993 (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 4994 (__v8di)(__m512i)(b), (int)(p), \ 4995 (__mmask8)-1) 4996 4997 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ 4998 (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4999 (__v16si)(__m512i)(b), (int)(p), \ 5000 (__mmask16)(m)) 5001 5002 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ 5003 (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 5004 (__v16si)(__m512i)(b), (int)(p), \ 5005 (__mmask16)(m)) 5006 5007 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ 5008 (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 5009 (__v8di)(__m512i)(b), (int)(p), \ 5010 (__mmask8)(m)) 5011 5012 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ 5013 (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 5014 (__v8di)(__m512i)(b), (int)(p), \ 5015 (__mmask8)(m)) 5016 5017 #define _mm512_rol_epi32(a, b) \ 5018 (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)) 5019 5020 #define _mm512_mask_rol_epi32(W, U, a, b) \ 5021 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5022 (__v16si)_mm512_rol_epi32((a), (b)), \ 5023 (__v16si)(__m512i)(W)) 5024 5025 #define _mm512_maskz_rol_epi32(U, a, b) \ 5026 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5027 (__v16si)_mm512_rol_epi32((a), (b)), \ 5028 (__v16si)_mm512_setzero_si512()) 5029 5030 #define _mm512_rol_epi64(a, b) \ 5031 (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)) 5032 5033 #define _mm512_mask_rol_epi64(W, U, a, b) \ 5034 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5035 (__v8di)_mm512_rol_epi64((a), (b)), \ 5036 (__v8di)(__m512i)(W)) 5037 5038 #define _mm512_maskz_rol_epi64(U, a, b) \ 5039 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5040 (__v8di)_mm512_rol_epi64((a), (b)), \ 5041 (__v8di)_mm512_setzero_si512()) 5042 5043 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5044 _mm512_rolv_epi32 (__m512i __A, __m512i __B) 5045 { 5046 return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); 5047 } 5048 5049 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5050 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 5051 { 5052 return (__m512i)__builtin_ia32_selectd_512(__U, 5053 (__v16si)_mm512_rolv_epi32(__A, __B), 5054 (__v16si)__W); 5055 } 5056 5057 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5058 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 5059 { 5060 return (__m512i)__builtin_ia32_selectd_512(__U, 5061 (__v16si)_mm512_rolv_epi32(__A, __B), 5062 (__v16si)_mm512_setzero_si512()); 5063 } 5064 5065 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5066 _mm512_rolv_epi64 (__m512i __A, __m512i __B) 5067 { 5068 return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); 5069 } 5070 5071 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5072 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 5073 { 5074 return (__m512i)__builtin_ia32_selectq_512(__U, 5075 (__v8di)_mm512_rolv_epi64(__A, __B), 5076 (__v8di)__W); 5077 } 5078 5079 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5080 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 5081 { 5082 return (__m512i)__builtin_ia32_selectq_512(__U, 5083 (__v8di)_mm512_rolv_epi64(__A, __B), 5084 (__v8di)_mm512_setzero_si512()); 5085 } 5086 5087 #define _mm512_ror_epi32(A, B) \ 5088 (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)) 5089 5090 #define _mm512_mask_ror_epi32(W, U, A, B) \ 5091 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5092 (__v16si)_mm512_ror_epi32((A), (B)), \ 5093 (__v16si)(__m512i)(W)) 5094 5095 #define _mm512_maskz_ror_epi32(U, A, B) \ 5096 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5097 (__v16si)_mm512_ror_epi32((A), (B)), \ 5098 (__v16si)_mm512_setzero_si512()) 5099 5100 #define _mm512_ror_epi64(A, B) \ 5101 (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)) 5102 5103 #define _mm512_mask_ror_epi64(W, U, A, B) \ 5104 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5105 (__v8di)_mm512_ror_epi64((A), (B)), \ 5106 (__v8di)(__m512i)(W)) 5107 5108 #define _mm512_maskz_ror_epi64(U, A, B) \ 5109 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5110 (__v8di)_mm512_ror_epi64((A), (B)), \ 5111 (__v8di)_mm512_setzero_si512()) 5112 5113 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5114 _mm512_slli_epi32(__m512i __A, int __B) 5115 { 5116 return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); 5117 } 5118 5119 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5120 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) 5121 { 5122 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5123 (__v16si)_mm512_slli_epi32(__A, __B), 5124 (__v16si)__W); 5125 } 5126 5127 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5128 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) { 5129 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5130 (__v16si)_mm512_slli_epi32(__A, __B), 5131 (__v16si)_mm512_setzero_si512()); 5132 } 5133 5134 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5135 _mm512_slli_epi64(__m512i __A, int __B) 5136 { 5137 return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); 5138 } 5139 5140 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5141 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) 5142 { 5143 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5144 (__v8di)_mm512_slli_epi64(__A, __B), 5145 (__v8di)__W); 5146 } 5147 5148 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5149 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B) 5150 { 5151 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5152 (__v8di)_mm512_slli_epi64(__A, __B), 5153 (__v8di)_mm512_setzero_si512()); 5154 } 5155 5156 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5157 _mm512_srli_epi32(__m512i __A, int __B) 5158 { 5159 return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); 5160 } 5161 5162 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5163 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) 5164 { 5165 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5166 (__v16si)_mm512_srli_epi32(__A, __B), 5167 (__v16si)__W); 5168 } 5169 5170 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5171 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) { 5172 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5173 (__v16si)_mm512_srli_epi32(__A, __B), 5174 (__v16si)_mm512_setzero_si512()); 5175 } 5176 5177 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5178 _mm512_srli_epi64(__m512i __A, int __B) 5179 { 5180 return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); 5181 } 5182 5183 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5184 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) 5185 { 5186 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5187 (__v8di)_mm512_srli_epi64(__A, __B), 5188 (__v8di)__W); 5189 } 5190 5191 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5192 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B) 5193 { 5194 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5195 (__v8di)_mm512_srli_epi64(__A, __B), 5196 (__v8di)_mm512_setzero_si512()); 5197 } 5198 5199 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5200 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) 5201 { 5202 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, 5203 (__v16si) __W, 5204 (__mmask16) __U); 5205 } 5206 5207 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5208 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) 5209 { 5210 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, 5211 (__v16si) 5212 _mm512_setzero_si512 (), 5213 (__mmask16) __U); 5214 } 5215 5216 static __inline__ void __DEFAULT_FN_ATTRS512 5217 _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) 5218 { 5219 __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, 5220 (__mmask16) __U); 5221 } 5222 5223 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5224 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 5225 { 5226 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 5227 (__v16si) __A, 5228 (__v16si) __W); 5229 } 5230 5231 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5232 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) 5233 { 5234 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 5235 (__v16si) __A, 5236 (__v16si) _mm512_setzero_si512 ()); 5237 } 5238 5239 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5240 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 5241 { 5242 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 5243 (__v8di) __A, 5244 (__v8di) __W); 5245 } 5246 5247 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5248 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) 5249 { 5250 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 5251 (__v8di) __A, 5252 (__v8di) _mm512_setzero_si512 ()); 5253 } 5254 5255 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5256 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) 5257 { 5258 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, 5259 (__v8di) __W, 5260 (__mmask8) __U); 5261 } 5262 5263 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5264 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) 5265 { 5266 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, 5267 (__v8di) 5268 _mm512_setzero_si512 (), 5269 (__mmask8) __U); 5270 } 5271 5272 static __inline__ void __DEFAULT_FN_ATTRS512 5273 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) 5274 { 5275 __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, 5276 (__mmask8) __U); 5277 } 5278 5279 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5280 _mm512_movedup_pd (__m512d __A) 5281 { 5282 return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, 5283 0, 0, 2, 2, 4, 4, 6, 6); 5284 } 5285 5286 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5287 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) 5288 { 5289 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 5290 (__v8df)_mm512_movedup_pd(__A), 5291 (__v8df)__W); 5292 } 5293 5294 static __inline__ __m512d __DEFAULT_FN_ATTRS512 5295 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) 5296 { 5297 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 5298 (__v8df)_mm512_movedup_pd(__A), 5299 (__v8df)_mm512_setzero_pd()); 5300 } 5301 5302 #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ 5303 (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5304 (__v8df)(__m512d)(B), \ 5305 (__v8di)(__m512i)(C), (int)(imm), \ 5306 (__mmask8)-1, (int)(R)) 5307 5308 #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ 5309 (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5310 (__v8df)(__m512d)(B), \ 5311 (__v8di)(__m512i)(C), (int)(imm), \ 5312 (__mmask8)(U), (int)(R)) 5313 5314 #define _mm512_fixupimm_pd(A, B, C, imm) \ 5315 (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5316 (__v8df)(__m512d)(B), \ 5317 (__v8di)(__m512i)(C), (int)(imm), \ 5318 (__mmask8)-1, \ 5319 _MM_FROUND_CUR_DIRECTION) 5320 5321 #define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ 5322 (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 5323 (__v8df)(__m512d)(B), \ 5324 (__v8di)(__m512i)(C), (int)(imm), \ 5325 (__mmask8)(U), \ 5326 _MM_FROUND_CUR_DIRECTION) 5327 5328 #define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ 5329 (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ 5330 (__v8df)(__m512d)(B), \ 5331 (__v8di)(__m512i)(C), \ 5332 (int)(imm), (__mmask8)(U), \ 5333 (int)(R)) 5334 5335 #define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ 5336 (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ 5337 (__v8df)(__m512d)(B), \ 5338 (__v8di)(__m512i)(C), \ 5339 (int)(imm), (__mmask8)(U), \ 5340 _MM_FROUND_CUR_DIRECTION) 5341 5342 #define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ 5343 (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ 5344 (__v16sf)(__m512)(B), \ 5345 (__v16si)(__m512i)(C), (int)(imm), \ 5346 (__mmask16)-1, (int)(R)) 5347 5348 #define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ 5349 (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ 5350 (__v16sf)(__m512)(B), \ 5351 (__v16si)(__m512i)(C), (int)(imm), \ 5352 (__mmask16)(U), (int)(R)) 5353 5354 #define _mm512_fixupimm_ps(A, B, C, imm) \ 5355 (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ 5356 (__v16sf)(__m512)(B), \ 5357 (__v16si)(__m512i)(C), (int)(imm), \ 5358 (__mmask16)-1, \ 5359 _MM_FROUND_CUR_DIRECTION) 5360 5361 #define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ 5362 (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ 5363 (__v16sf)(__m512)(B), \ 5364 (__v16si)(__m512i)(C), (int)(imm), \ 5365 (__mmask16)(U), \ 5366 _MM_FROUND_CUR_DIRECTION) 5367 5368 #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ 5369 (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ 5370 (__v16sf)(__m512)(B), \ 5371 (__v16si)(__m512i)(C), \ 5372 (int)(imm), (__mmask16)(U), \ 5373 (int)(R)) 5374 5375 #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ 5376 (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ 5377 (__v16sf)(__m512)(B), \ 5378 (__v16si)(__m512i)(C), \ 5379 (int)(imm), (__mmask16)(U), \ 5380 _MM_FROUND_CUR_DIRECTION) 5381 5382 #define _mm_fixupimm_round_sd(A, B, C, imm, R) \ 5383 (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5384 (__v2df)(__m128d)(B), \ 5385 (__v2di)(__m128i)(C), (int)(imm), \ 5386 (__mmask8)-1, (int)(R)) 5387 5388 #define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ 5389 (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5390 (__v2df)(__m128d)(B), \ 5391 (__v2di)(__m128i)(C), (int)(imm), \ 5392 (__mmask8)(U), (int)(R)) 5393 5394 #define _mm_fixupimm_sd(A, B, C, imm) \ 5395 (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5396 (__v2df)(__m128d)(B), \ 5397 (__v2di)(__m128i)(C), (int)(imm), \ 5398 (__mmask8)-1, \ 5399 _MM_FROUND_CUR_DIRECTION) 5400 5401 #define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ 5402 (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ 5403 (__v2df)(__m128d)(B), \ 5404 (__v2di)(__m128i)(C), (int)(imm), \ 5405 (__mmask8)(U), \ 5406 _MM_FROUND_CUR_DIRECTION) 5407 5408 #define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ 5409 (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ 5410 (__v2df)(__m128d)(B), \ 5411 (__v2di)(__m128i)(C), (int)(imm), \ 5412 (__mmask8)(U), (int)(R)) 5413 5414 #define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ 5415 (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ 5416 (__v2df)(__m128d)(B), \ 5417 (__v2di)(__m128i)(C), (int)(imm), \ 5418 (__mmask8)(U), \ 5419 _MM_FROUND_CUR_DIRECTION) 5420 5421 #define _mm_fixupimm_round_ss(A, B, C, imm, R) \ 5422 (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5423 (__v4sf)(__m128)(B), \ 5424 (__v4si)(__m128i)(C), (int)(imm), \ 5425 (__mmask8)-1, (int)(R)) 5426 5427 #define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ 5428 (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5429 (__v4sf)(__m128)(B), \ 5430 (__v4si)(__m128i)(C), (int)(imm), \ 5431 (__mmask8)(U), (int)(R)) 5432 5433 #define _mm_fixupimm_ss(A, B, C, imm) \ 5434 (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5435 (__v4sf)(__m128)(B), \ 5436 (__v4si)(__m128i)(C), (int)(imm), \ 5437 (__mmask8)-1, \ 5438 _MM_FROUND_CUR_DIRECTION) 5439 5440 #define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ 5441 (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ 5442 (__v4sf)(__m128)(B), \ 5443 (__v4si)(__m128i)(C), (int)(imm), \ 5444 (__mmask8)(U), \ 5445 _MM_FROUND_CUR_DIRECTION) 5446 5447 #define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ 5448 (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ 5449 (__v4sf)(__m128)(B), \ 5450 (__v4si)(__m128i)(C), (int)(imm), \ 5451 (__mmask8)(U), (int)(R)) 5452 5453 #define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ 5454 (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ 5455 (__v4sf)(__m128)(B), \ 5456 (__v4si)(__m128i)(C), (int)(imm), \ 5457 (__mmask8)(U), \ 5458 _MM_FROUND_CUR_DIRECTION) 5459 5460 #define _mm_getexp_round_sd(A, B, R) \ 5461 (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5462 (__v2df)(__m128d)(B), \ 5463 (__v2df)_mm_setzero_pd(), \ 5464 (__mmask8)-1, (int)(R)) 5465 5466 5467 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5468 _mm_getexp_sd (__m128d __A, __m128d __B) 5469 { 5470 return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, 5471 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); 5472 } 5473 5474 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5475 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 5476 { 5477 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, 5478 (__v2df) __B, 5479 (__v2df) __W, 5480 (__mmask8) __U, 5481 _MM_FROUND_CUR_DIRECTION); 5482 } 5483 5484 #define _mm_mask_getexp_round_sd(W, U, A, B, R) \ 5485 (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5486 (__v2df)(__m128d)(B), \ 5487 (__v2df)(__m128d)(W), \ 5488 (__mmask8)(U), (int)(R)) 5489 5490 static __inline__ __m128d __DEFAULT_FN_ATTRS128 5491 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) 5492 { 5493 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, 5494 (__v2df) __B, 5495 (__v2df) _mm_setzero_pd (), 5496 (__mmask8) __U, 5497 _MM_FROUND_CUR_DIRECTION); 5498 } 5499 5500 #define _mm_maskz_getexp_round_sd(U, A, B, R) \ 5501 (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ 5502 (__v2df)(__m128d)(B), \ 5503 (__v2df)_mm_setzero_pd(), \ 5504 (__mmask8)(U), (int)(R)) 5505 5506 #define _mm_getexp_round_ss(A, B, R) \ 5507 (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5508 (__v4sf)(__m128)(B), \ 5509 (__v4sf)_mm_setzero_ps(), \ 5510 (__mmask8)-1, (int)(R)) 5511 5512 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5513 _mm_getexp_ss (__m128 __A, __m128 __B) 5514 { 5515 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5516 (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); 5517 } 5518 5519 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5520 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 5521 { 5522 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5523 (__v4sf) __B, 5524 (__v4sf) __W, 5525 (__mmask8) __U, 5526 _MM_FROUND_CUR_DIRECTION); 5527 } 5528 5529 #define _mm_mask_getexp_round_ss(W, U, A, B, R) \ 5530 (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5531 (__v4sf)(__m128)(B), \ 5532 (__v4sf)(__m128)(W), \ 5533 (__mmask8)(U), (int)(R)) 5534 5535 static __inline__ __m128 __DEFAULT_FN_ATTRS128 5536 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) 5537 { 5538 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, 5539 (__v4sf) __B, 5540 (__v4sf) _mm_setzero_ps (), 5541 (__mmask8) __U, 5542 _MM_FROUND_CUR_DIRECTION); 5543 } 5544 5545 #define _mm_maskz_getexp_round_ss(U, A, B, R) \ 5546 (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ 5547 (__v4sf)(__m128)(B), \ 5548 (__v4sf)_mm_setzero_ps(), \ 5549 (__mmask8)(U), (int)(R)) 5550 5551 #define _mm_getmant_round_sd(A, B, C, D, R) \ 5552 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5553 (__v2df)(__m128d)(B), \ 5554 (int)(((D)<<2) | (C)), \ 5555 (__v2df)_mm_setzero_pd(), \ 5556 (__mmask8)-1, (int)(R)) 5557 5558 #define _mm_getmant_sd(A, B, C, D) \ 5559 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5560 (__v2df)(__m128d)(B), \ 5561 (int)(((D)<<2) | (C)), \ 5562 (__v2df)_mm_setzero_pd(), \ 5563 (__mmask8)-1, \ 5564 _MM_FROUND_CUR_DIRECTION) 5565 5566 #define _mm_mask_getmant_sd(W, U, A, B, C, D) \ 5567 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5568 (__v2df)(__m128d)(B), \ 5569 (int)(((D)<<2) | (C)), \ 5570 (__v2df)(__m128d)(W), \ 5571 (__mmask8)(U), \ 5572 _MM_FROUND_CUR_DIRECTION) 5573 5574 #define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ 5575 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5576 (__v2df)(__m128d)(B), \ 5577 (int)(((D)<<2) | (C)), \ 5578 (__v2df)(__m128d)(W), \ 5579 (__mmask8)(U), (int)(R)) 5580 5581 #define _mm_maskz_getmant_sd(U, A, B, C, D) \ 5582 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5583 (__v2df)(__m128d)(B), \ 5584 (int)(((D)<<2) | (C)), \ 5585 (__v2df)_mm_setzero_pd(), \ 5586 (__mmask8)(U), \ 5587 _MM_FROUND_CUR_DIRECTION) 5588 5589 #define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ 5590 (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ 5591 (__v2df)(__m128d)(B), \ 5592 (int)(((D)<<2) | (C)), \ 5593 (__v2df)_mm_setzero_pd(), \ 5594 (__mmask8)(U), (int)(R)) 5595 5596 #define _mm_getmant_round_ss(A, B, C, D, R) \ 5597 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5598 (__v4sf)(__m128)(B), \ 5599 (int)(((D)<<2) | (C)), \ 5600 (__v4sf)_mm_setzero_ps(), \ 5601 (__mmask8)-1, (int)(R)) 5602 5603 #define _mm_getmant_ss(A, B, C, D) \ 5604 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5605 (__v4sf)(__m128)(B), \ 5606 (int)(((D)<<2) | (C)), \ 5607 (__v4sf)_mm_setzero_ps(), \ 5608 (__mmask8)-1, \ 5609 _MM_FROUND_CUR_DIRECTION) 5610 5611 #define _mm_mask_getmant_ss(W, U, A, B, C, D) \ 5612 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5613 (__v4sf)(__m128)(B), \ 5614 (int)(((D)<<2) | (C)), \ 5615 (__v4sf)(__m128)(W), \ 5616 (__mmask8)(U), \ 5617 _MM_FROUND_CUR_DIRECTION) 5618 5619 #define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ 5620 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5621 (__v4sf)(__m128)(B), \ 5622 (int)(((D)<<2) | (C)), \ 5623 (__v4sf)(__m128)(W), \ 5624 (__mmask8)(U), (int)(R)) 5625 5626 #define _mm_maskz_getmant_ss(U, A, B, C, D) \ 5627 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5628 (__v4sf)(__m128)(B), \ 5629 (int)(((D)<<2) | (C)), \ 5630 (__v4sf)_mm_setzero_ps(), \ 5631 (__mmask8)(U), \ 5632 _MM_FROUND_CUR_DIRECTION) 5633 5634 #define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ 5635 (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ 5636 (__v4sf)(__m128)(B), \ 5637 (int)(((D)<<2) | (C)), \ 5638 (__v4sf)_mm_setzero_ps(), \ 5639 (__mmask8)(U), (int)(R)) 5640 5641 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 5642 _mm512_kmov (__mmask16 __A) 5643 { 5644 return __A; 5645 } 5646 5647 #define _mm_comi_round_sd(A, B, P, R) \ 5648 (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ 5649 (int)(P), (int)(R)) 5650 5651 #define _mm_comi_round_ss(A, B, P, R) \ 5652 (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ 5653 (int)(P), (int)(R)) 5654 5655 #ifdef __x86_64__ 5656 #define _mm_cvt_roundsd_si64(A, R) \ 5657 (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) 5658 #endif 5659 5660 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5661 _mm512_sll_epi32(__m512i __A, __m128i __B) 5662 { 5663 return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); 5664 } 5665 5666 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5667 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5668 { 5669 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5670 (__v16si)_mm512_sll_epi32(__A, __B), 5671 (__v16si)__W); 5672 } 5673 5674 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5675 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5676 { 5677 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5678 (__v16si)_mm512_sll_epi32(__A, __B), 5679 (__v16si)_mm512_setzero_si512()); 5680 } 5681 5682 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5683 _mm512_sll_epi64(__m512i __A, __m128i __B) 5684 { 5685 return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); 5686 } 5687 5688 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5689 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5690 { 5691 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5692 (__v8di)_mm512_sll_epi64(__A, __B), 5693 (__v8di)__W); 5694 } 5695 5696 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5697 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5698 { 5699 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5700 (__v8di)_mm512_sll_epi64(__A, __B), 5701 (__v8di)_mm512_setzero_si512()); 5702 } 5703 5704 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5705 _mm512_sllv_epi32(__m512i __X, __m512i __Y) 5706 { 5707 return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); 5708 } 5709 5710 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5711 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5712 { 5713 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5714 (__v16si)_mm512_sllv_epi32(__X, __Y), 5715 (__v16si)__W); 5716 } 5717 5718 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5719 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5720 { 5721 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5722 (__v16si)_mm512_sllv_epi32(__X, __Y), 5723 (__v16si)_mm512_setzero_si512()); 5724 } 5725 5726 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5727 _mm512_sllv_epi64(__m512i __X, __m512i __Y) 5728 { 5729 return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); 5730 } 5731 5732 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5733 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 5734 { 5735 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5736 (__v8di)_mm512_sllv_epi64(__X, __Y), 5737 (__v8di)__W); 5738 } 5739 5740 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5741 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5742 { 5743 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5744 (__v8di)_mm512_sllv_epi64(__X, __Y), 5745 (__v8di)_mm512_setzero_si512()); 5746 } 5747 5748 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5749 _mm512_sra_epi32(__m512i __A, __m128i __B) 5750 { 5751 return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); 5752 } 5753 5754 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5755 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5756 { 5757 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5758 (__v16si)_mm512_sra_epi32(__A, __B), 5759 (__v16si)__W); 5760 } 5761 5762 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5763 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5764 { 5765 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5766 (__v16si)_mm512_sra_epi32(__A, __B), 5767 (__v16si)_mm512_setzero_si512()); 5768 } 5769 5770 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5771 _mm512_sra_epi64(__m512i __A, __m128i __B) 5772 { 5773 return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); 5774 } 5775 5776 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5777 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5778 { 5779 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5780 (__v8di)_mm512_sra_epi64(__A, __B), 5781 (__v8di)__W); 5782 } 5783 5784 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5785 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5786 { 5787 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5788 (__v8di)_mm512_sra_epi64(__A, __B), 5789 (__v8di)_mm512_setzero_si512()); 5790 } 5791 5792 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5793 _mm512_srav_epi32(__m512i __X, __m512i __Y) 5794 { 5795 return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); 5796 } 5797 5798 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5799 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5800 { 5801 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5802 (__v16si)_mm512_srav_epi32(__X, __Y), 5803 (__v16si)__W); 5804 } 5805 5806 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5807 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5808 { 5809 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5810 (__v16si)_mm512_srav_epi32(__X, __Y), 5811 (__v16si)_mm512_setzero_si512()); 5812 } 5813 5814 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5815 _mm512_srav_epi64(__m512i __X, __m512i __Y) 5816 { 5817 return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); 5818 } 5819 5820 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5821 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 5822 { 5823 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5824 (__v8di)_mm512_srav_epi64(__X, __Y), 5825 (__v8di)__W); 5826 } 5827 5828 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5829 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5830 { 5831 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5832 (__v8di)_mm512_srav_epi64(__X, __Y), 5833 (__v8di)_mm512_setzero_si512()); 5834 } 5835 5836 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5837 _mm512_srl_epi32(__m512i __A, __m128i __B) 5838 { 5839 return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); 5840 } 5841 5842 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5843 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) 5844 { 5845 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5846 (__v16si)_mm512_srl_epi32(__A, __B), 5847 (__v16si)__W); 5848 } 5849 5850 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5851 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) 5852 { 5853 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5854 (__v16si)_mm512_srl_epi32(__A, __B), 5855 (__v16si)_mm512_setzero_si512()); 5856 } 5857 5858 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5859 _mm512_srl_epi64(__m512i __A, __m128i __B) 5860 { 5861 return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); 5862 } 5863 5864 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5865 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) 5866 { 5867 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5868 (__v8di)_mm512_srl_epi64(__A, __B), 5869 (__v8di)__W); 5870 } 5871 5872 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5873 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) 5874 { 5875 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5876 (__v8di)_mm512_srl_epi64(__A, __B), 5877 (__v8di)_mm512_setzero_si512()); 5878 } 5879 5880 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5881 _mm512_srlv_epi32(__m512i __X, __m512i __Y) 5882 { 5883 return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); 5884 } 5885 5886 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5887 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) 5888 { 5889 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5890 (__v16si)_mm512_srlv_epi32(__X, __Y), 5891 (__v16si)__W); 5892 } 5893 5894 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5895 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) 5896 { 5897 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5898 (__v16si)_mm512_srlv_epi32(__X, __Y), 5899 (__v16si)_mm512_setzero_si512()); 5900 } 5901 5902 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5903 _mm512_srlv_epi64 (__m512i __X, __m512i __Y) 5904 { 5905 return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); 5906 } 5907 5908 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5909 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) 5910 { 5911 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5912 (__v8di)_mm512_srlv_epi64(__X, __Y), 5913 (__v8di)__W); 5914 } 5915 5916 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5917 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) 5918 { 5919 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5920 (__v8di)_mm512_srlv_epi64(__X, __Y), 5921 (__v8di)_mm512_setzero_si512()); 5922 } 5923 5924 #define _mm512_ternarylogic_epi32(A, B, C, imm) \ 5925 (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ 5926 (__v16si)(__m512i)(B), \ 5927 (__v16si)(__m512i)(C), (int)(imm), \ 5928 (__mmask16)-1) 5929 5930 #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ 5931 (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ 5932 (__v16si)(__m512i)(B), \ 5933 (__v16si)(__m512i)(C), (int)(imm), \ 5934 (__mmask16)(U)) 5935 5936 #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ 5937 (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ 5938 (__v16si)(__m512i)(B), \ 5939 (__v16si)(__m512i)(C), \ 5940 (int)(imm), (__mmask16)(U)) 5941 5942 #define _mm512_ternarylogic_epi64(A, B, C, imm) \ 5943 (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ 5944 (__v8di)(__m512i)(B), \ 5945 (__v8di)(__m512i)(C), (int)(imm), \ 5946 (__mmask8)-1) 5947 5948 #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ 5949 (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ 5950 (__v8di)(__m512i)(B), \ 5951 (__v8di)(__m512i)(C), (int)(imm), \ 5952 (__mmask8)(U)) 5953 5954 #define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ 5955 (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ 5956 (__v8di)(__m512i)(B), \ 5957 (__v8di)(__m512i)(C), (int)(imm), \ 5958 (__mmask8)(U)) 5959 5960 #ifdef __x86_64__ 5961 #define _mm_cvt_roundsd_i64(A, R) \ 5962 (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) 5963 #endif 5964 5965 #define _mm_cvt_roundsd_si32(A, R) \ 5966 (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) 5967 5968 #define _mm_cvt_roundsd_i32(A, R) \ 5969 (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) 5970 5971 #define _mm_cvt_roundsd_u32(A, R) \ 5972 (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)) 5973 5974 static __inline__ unsigned __DEFAULT_FN_ATTRS128 5975 _mm_cvtsd_u32 (__m128d __A) 5976 { 5977 return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, 5978 _MM_FROUND_CUR_DIRECTION); 5979 } 5980 5981 #ifdef __x86_64__ 5982 #define _mm_cvt_roundsd_u64(A, R) \ 5983 (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ 5984 (int)(R)) 5985 5986 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 5987 _mm_cvtsd_u64 (__m128d __A) 5988 { 5989 return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) 5990 __A, 5991 _MM_FROUND_CUR_DIRECTION); 5992 } 5993 #endif 5994 5995 #define _mm_cvt_roundss_si32(A, R) \ 5996 (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) 5997 5998 #define _mm_cvt_roundss_i32(A, R) \ 5999 (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) 6000 6001 #ifdef __x86_64__ 6002 #define _mm_cvt_roundss_si64(A, R) \ 6003 (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) 6004 6005 #define _mm_cvt_roundss_i64(A, R) \ 6006 (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) 6007 #endif 6008 6009 #define _mm_cvt_roundss_u32(A, R) \ 6010 (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)) 6011 6012 static __inline__ unsigned __DEFAULT_FN_ATTRS128 6013 _mm_cvtss_u32 (__m128 __A) 6014 { 6015 return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, 6016 _MM_FROUND_CUR_DIRECTION); 6017 } 6018 6019 #ifdef __x86_64__ 6020 #define _mm_cvt_roundss_u64(A, R) \ 6021 (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ 6022 (int)(R)) 6023 6024 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 6025 _mm_cvtss_u64 (__m128 __A) 6026 { 6027 return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) 6028 __A, 6029 _MM_FROUND_CUR_DIRECTION); 6030 } 6031 #endif 6032 6033 #define _mm_cvtt_roundsd_i32(A, R) \ 6034 (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) 6035 6036 #define _mm_cvtt_roundsd_si32(A, R) \ 6037 (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) 6038 6039 static __inline__ int __DEFAULT_FN_ATTRS128 6040 _mm_cvttsd_i32 (__m128d __A) 6041 { 6042 return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, 6043 _MM_FROUND_CUR_DIRECTION); 6044 } 6045 6046 #ifdef __x86_64__ 6047 #define _mm_cvtt_roundsd_si64(A, R) \ 6048 (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) 6049 6050 #define _mm_cvtt_roundsd_i64(A, R) \ 6051 (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) 6052 6053 static __inline__ long long __DEFAULT_FN_ATTRS128 6054 _mm_cvttsd_i64 (__m128d __A) 6055 { 6056 return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, 6057 _MM_FROUND_CUR_DIRECTION); 6058 } 6059 #endif 6060 6061 #define _mm_cvtt_roundsd_u32(A, R) \ 6062 (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)) 6063 6064 static __inline__ unsigned __DEFAULT_FN_ATTRS128 6065 _mm_cvttsd_u32 (__m128d __A) 6066 { 6067 return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, 6068 _MM_FROUND_CUR_DIRECTION); 6069 } 6070 6071 #ifdef __x86_64__ 6072 #define _mm_cvtt_roundsd_u64(A, R) \ 6073 (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ 6074 (int)(R)) 6075 6076 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 6077 _mm_cvttsd_u64 (__m128d __A) 6078 { 6079 return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) 6080 __A, 6081 _MM_FROUND_CUR_DIRECTION); 6082 } 6083 #endif 6084 6085 #define _mm_cvtt_roundss_i32(A, R) \ 6086 (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) 6087 6088 #define _mm_cvtt_roundss_si32(A, R) \ 6089 (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) 6090 6091 static __inline__ int __DEFAULT_FN_ATTRS128 6092 _mm_cvttss_i32 (__m128 __A) 6093 { 6094 return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, 6095 _MM_FROUND_CUR_DIRECTION); 6096 } 6097 6098 #ifdef __x86_64__ 6099 #define _mm_cvtt_roundss_i64(A, R) \ 6100 (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) 6101 6102 #define _mm_cvtt_roundss_si64(A, R) \ 6103 (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) 6104 6105 static __inline__ long long __DEFAULT_FN_ATTRS128 6106 _mm_cvttss_i64 (__m128 __A) 6107 { 6108 return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, 6109 _MM_FROUND_CUR_DIRECTION); 6110 } 6111 #endif 6112 6113 #define _mm_cvtt_roundss_u32(A, R) \ 6114 (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)) 6115 6116 static __inline__ unsigned __DEFAULT_FN_ATTRS128 6117 _mm_cvttss_u32 (__m128 __A) 6118 { 6119 return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, 6120 _MM_FROUND_CUR_DIRECTION); 6121 } 6122 6123 #ifdef __x86_64__ 6124 #define _mm_cvtt_roundss_u64(A, R) \ 6125 (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ 6126 (int)(R)) 6127 6128 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 6129 _mm_cvttss_u64 (__m128 __A) 6130 { 6131 return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) 6132 __A, 6133 _MM_FROUND_CUR_DIRECTION); 6134 } 6135 #endif 6136 6137 #define _mm512_permute_pd(X, C) \ 6138 (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)) 6139 6140 #define _mm512_mask_permute_pd(W, U, X, C) \ 6141 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6142 (__v8df)_mm512_permute_pd((X), (C)), \ 6143 (__v8df)(__m512d)(W)) 6144 6145 #define _mm512_maskz_permute_pd(U, X, C) \ 6146 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6147 (__v8df)_mm512_permute_pd((X), (C)), \ 6148 (__v8df)_mm512_setzero_pd()) 6149 6150 #define _mm512_permute_ps(X, C) \ 6151 (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)) 6152 6153 #define _mm512_mask_permute_ps(W, U, X, C) \ 6154 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6155 (__v16sf)_mm512_permute_ps((X), (C)), \ 6156 (__v16sf)(__m512)(W)) 6157 6158 #define _mm512_maskz_permute_ps(U, X, C) \ 6159 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6160 (__v16sf)_mm512_permute_ps((X), (C)), \ 6161 (__v16sf)_mm512_setzero_ps()) 6162 6163 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6164 _mm512_permutevar_pd(__m512d __A, __m512i __C) 6165 { 6166 return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); 6167 } 6168 6169 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6170 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) 6171 { 6172 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 6173 (__v8df)_mm512_permutevar_pd(__A, __C), 6174 (__v8df)__W); 6175 } 6176 6177 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6178 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) 6179 { 6180 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 6181 (__v8df)_mm512_permutevar_pd(__A, __C), 6182 (__v8df)_mm512_setzero_pd()); 6183 } 6184 6185 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6186 _mm512_permutevar_ps(__m512 __A, __m512i __C) 6187 { 6188 return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); 6189 } 6190 6191 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6192 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) 6193 { 6194 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 6195 (__v16sf)_mm512_permutevar_ps(__A, __C), 6196 (__v16sf)__W); 6197 } 6198 6199 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6200 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) 6201 { 6202 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 6203 (__v16sf)_mm512_permutevar_ps(__A, __C), 6204 (__v16sf)_mm512_setzero_ps()); 6205 } 6206 6207 static __inline __m512d __DEFAULT_FN_ATTRS512 6208 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) 6209 { 6210 return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, 6211 (__v8df)__B); 6212 } 6213 6214 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6215 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) 6216 { 6217 return (__m512d)__builtin_ia32_selectpd_512(__U, 6218 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6219 (__v8df)__A); 6220 } 6221 6222 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6223 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, 6224 __m512d __B) 6225 { 6226 return (__m512d)__builtin_ia32_selectpd_512(__U, 6227 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6228 (__v8df)(__m512d)__I); 6229 } 6230 6231 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6232 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, 6233 __m512d __B) 6234 { 6235 return (__m512d)__builtin_ia32_selectpd_512(__U, 6236 (__v8df)_mm512_permutex2var_pd(__A, __I, __B), 6237 (__v8df)_mm512_setzero_pd()); 6238 } 6239 6240 static __inline __m512 __DEFAULT_FN_ATTRS512 6241 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) 6242 { 6243 return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, 6244 (__v16sf) __B); 6245 } 6246 6247 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6248 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) 6249 { 6250 return (__m512)__builtin_ia32_selectps_512(__U, 6251 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6252 (__v16sf)__A); 6253 } 6254 6255 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6256 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) 6257 { 6258 return (__m512)__builtin_ia32_selectps_512(__U, 6259 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6260 (__v16sf)(__m512)__I); 6261 } 6262 6263 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6264 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) 6265 { 6266 return (__m512)__builtin_ia32_selectps_512(__U, 6267 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), 6268 (__v16sf)_mm512_setzero_ps()); 6269 } 6270 6271 6272 #define _mm512_cvtt_roundpd_epu32(A, R) \ 6273 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6274 (__v8si)_mm256_undefined_si256(), \ 6275 (__mmask8)-1, (int)(R)) 6276 6277 #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ 6278 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6279 (__v8si)(__m256i)(W), \ 6280 (__mmask8)(U), (int)(R)) 6281 6282 #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ 6283 (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ 6284 (__v8si)_mm256_setzero_si256(), \ 6285 (__mmask8)(U), (int)(R)) 6286 6287 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6288 _mm512_cvttpd_epu32 (__m512d __A) 6289 { 6290 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6291 (__v8si) 6292 _mm256_undefined_si256 (), 6293 (__mmask8) -1, 6294 _MM_FROUND_CUR_DIRECTION); 6295 } 6296 6297 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6298 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) 6299 { 6300 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6301 (__v8si) __W, 6302 (__mmask8) __U, 6303 _MM_FROUND_CUR_DIRECTION); 6304 } 6305 6306 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6307 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) 6308 { 6309 return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, 6310 (__v8si) 6311 _mm256_setzero_si256 (), 6312 (__mmask8) __U, 6313 _MM_FROUND_CUR_DIRECTION); 6314 } 6315 6316 #define _mm_roundscale_round_sd(A, B, imm, R) \ 6317 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6318 (__v2df)(__m128d)(B), \ 6319 (__v2df)_mm_setzero_pd(), \ 6320 (__mmask8)-1, (int)(imm), \ 6321 (int)(R)) 6322 6323 #define _mm_roundscale_sd(A, B, imm) \ 6324 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6325 (__v2df)(__m128d)(B), \ 6326 (__v2df)_mm_setzero_pd(), \ 6327 (__mmask8)-1, (int)(imm), \ 6328 _MM_FROUND_CUR_DIRECTION) 6329 6330 #define _mm_mask_roundscale_sd(W, U, A, B, imm) \ 6331 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6332 (__v2df)(__m128d)(B), \ 6333 (__v2df)(__m128d)(W), \ 6334 (__mmask8)(U), (int)(imm), \ 6335 _MM_FROUND_CUR_DIRECTION) 6336 6337 #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ 6338 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6339 (__v2df)(__m128d)(B), \ 6340 (__v2df)(__m128d)(W), \ 6341 (__mmask8)(U), (int)(I), \ 6342 (int)(R)) 6343 6344 #define _mm_maskz_roundscale_sd(U, A, B, I) \ 6345 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6346 (__v2df)(__m128d)(B), \ 6347 (__v2df)_mm_setzero_pd(), \ 6348 (__mmask8)(U), (int)(I), \ 6349 _MM_FROUND_CUR_DIRECTION) 6350 6351 #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ 6352 (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ 6353 (__v2df)(__m128d)(B), \ 6354 (__v2df)_mm_setzero_pd(), \ 6355 (__mmask8)(U), (int)(I), \ 6356 (int)(R)) 6357 6358 #define _mm_roundscale_round_ss(A, B, imm, R) \ 6359 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6360 (__v4sf)(__m128)(B), \ 6361 (__v4sf)_mm_setzero_ps(), \ 6362 (__mmask8)-1, (int)(imm), \ 6363 (int)(R)) 6364 6365 #define _mm_roundscale_ss(A, B, imm) \ 6366 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6367 (__v4sf)(__m128)(B), \ 6368 (__v4sf)_mm_setzero_ps(), \ 6369 (__mmask8)-1, (int)(imm), \ 6370 _MM_FROUND_CUR_DIRECTION) 6371 6372 #define _mm_mask_roundscale_ss(W, U, A, B, I) \ 6373 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6374 (__v4sf)(__m128)(B), \ 6375 (__v4sf)(__m128)(W), \ 6376 (__mmask8)(U), (int)(I), \ 6377 _MM_FROUND_CUR_DIRECTION) 6378 6379 #define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ 6380 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6381 (__v4sf)(__m128)(B), \ 6382 (__v4sf)(__m128)(W), \ 6383 (__mmask8)(U), (int)(I), \ 6384 (int)(R)) 6385 6386 #define _mm_maskz_roundscale_ss(U, A, B, I) \ 6387 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6388 (__v4sf)(__m128)(B), \ 6389 (__v4sf)_mm_setzero_ps(), \ 6390 (__mmask8)(U), (int)(I), \ 6391 _MM_FROUND_CUR_DIRECTION) 6392 6393 #define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ 6394 (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ 6395 (__v4sf)(__m128)(B), \ 6396 (__v4sf)_mm_setzero_ps(), \ 6397 (__mmask8)(U), (int)(I), \ 6398 (int)(R)) 6399 6400 #define _mm512_scalef_round_pd(A, B, R) \ 6401 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6402 (__v8df)(__m512d)(B), \ 6403 (__v8df)_mm512_undefined_pd(), \ 6404 (__mmask8)-1, (int)(R)) 6405 6406 #define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ 6407 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6408 (__v8df)(__m512d)(B), \ 6409 (__v8df)(__m512d)(W), \ 6410 (__mmask8)(U), (int)(R)) 6411 6412 #define _mm512_maskz_scalef_round_pd(U, A, B, R) \ 6413 (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ 6414 (__v8df)(__m512d)(B), \ 6415 (__v8df)_mm512_setzero_pd(), \ 6416 (__mmask8)(U), (int)(R)) 6417 6418 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6419 _mm512_scalef_pd (__m512d __A, __m512d __B) 6420 { 6421 return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, 6422 (__v8df) __B, 6423 (__v8df) 6424 _mm512_undefined_pd (), 6425 (__mmask8) -1, 6426 _MM_FROUND_CUR_DIRECTION); 6427 } 6428 6429 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6430 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 6431 { 6432 return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, 6433 (__v8df) __B, 6434 (__v8df) __W, 6435 (__mmask8) __U, 6436 _MM_FROUND_CUR_DIRECTION); 6437 } 6438 6439 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6440 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) 6441 { 6442 return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, 6443 (__v8df) __B, 6444 (__v8df) 6445 _mm512_setzero_pd (), 6446 (__mmask8) __U, 6447 _MM_FROUND_CUR_DIRECTION); 6448 } 6449 6450 #define _mm512_scalef_round_ps(A, B, R) \ 6451 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6452 (__v16sf)(__m512)(B), \ 6453 (__v16sf)_mm512_undefined_ps(), \ 6454 (__mmask16)-1, (int)(R)) 6455 6456 #define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ 6457 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6458 (__v16sf)(__m512)(B), \ 6459 (__v16sf)(__m512)(W), \ 6460 (__mmask16)(U), (int)(R)) 6461 6462 #define _mm512_maskz_scalef_round_ps(U, A, B, R) \ 6463 (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ 6464 (__v16sf)(__m512)(B), \ 6465 (__v16sf)_mm512_setzero_ps(), \ 6466 (__mmask16)(U), (int)(R)) 6467 6468 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6469 _mm512_scalef_ps (__m512 __A, __m512 __B) 6470 { 6471 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6472 (__v16sf) __B, 6473 (__v16sf) 6474 _mm512_undefined_ps (), 6475 (__mmask16) -1, 6476 _MM_FROUND_CUR_DIRECTION); 6477 } 6478 6479 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6480 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 6481 { 6482 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6483 (__v16sf) __B, 6484 (__v16sf) __W, 6485 (__mmask16) __U, 6486 _MM_FROUND_CUR_DIRECTION); 6487 } 6488 6489 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6490 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) 6491 { 6492 return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, 6493 (__v16sf) __B, 6494 (__v16sf) 6495 _mm512_setzero_ps (), 6496 (__mmask16) __U, 6497 _MM_FROUND_CUR_DIRECTION); 6498 } 6499 6500 #define _mm_scalef_round_sd(A, B, R) \ 6501 (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ 6502 (__v2df)(__m128d)(B), \ 6503 (__v2df)_mm_setzero_pd(), \ 6504 (__mmask8)-1, (int)(R)) 6505 6506 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6507 _mm_scalef_sd (__m128d __A, __m128d __B) 6508 { 6509 return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A, 6510 (__v2df)( __B), (__v2df) _mm_setzero_pd(), 6511 (__mmask8) -1, 6512 _MM_FROUND_CUR_DIRECTION); 6513 } 6514 6515 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6516 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 6517 { 6518 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, 6519 (__v2df) __B, 6520 (__v2df) __W, 6521 (__mmask8) __U, 6522 _MM_FROUND_CUR_DIRECTION); 6523 } 6524 6525 #define _mm_mask_scalef_round_sd(W, U, A, B, R) \ 6526 (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ 6527 (__v2df)(__m128d)(B), \ 6528 (__v2df)(__m128d)(W), \ 6529 (__mmask8)(U), (int)(R)) 6530 6531 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6532 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) 6533 { 6534 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, 6535 (__v2df) __B, 6536 (__v2df) _mm_setzero_pd (), 6537 (__mmask8) __U, 6538 _MM_FROUND_CUR_DIRECTION); 6539 } 6540 6541 #define _mm_maskz_scalef_round_sd(U, A, B, R) \ 6542 (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ 6543 (__v2df)(__m128d)(B), \ 6544 (__v2df)_mm_setzero_pd(), \ 6545 (__mmask8)(U), (int)(R)) 6546 6547 #define _mm_scalef_round_ss(A, B, R) \ 6548 (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ 6549 (__v4sf)(__m128)(B), \ 6550 (__v4sf)_mm_setzero_ps(), \ 6551 (__mmask8)-1, (int)(R)) 6552 6553 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6554 _mm_scalef_ss (__m128 __A, __m128 __B) 6555 { 6556 return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A, 6557 (__v4sf)( __B), (__v4sf) _mm_setzero_ps(), 6558 (__mmask8) -1, 6559 _MM_FROUND_CUR_DIRECTION); 6560 } 6561 6562 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6563 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 6564 { 6565 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, 6566 (__v4sf) __B, 6567 (__v4sf) __W, 6568 (__mmask8) __U, 6569 _MM_FROUND_CUR_DIRECTION); 6570 } 6571 6572 #define _mm_mask_scalef_round_ss(W, U, A, B, R) \ 6573 (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ 6574 (__v4sf)(__m128)(B), \ 6575 (__v4sf)(__m128)(W), \ 6576 (__mmask8)(U), (int)(R)) 6577 6578 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6579 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) 6580 { 6581 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, 6582 (__v4sf) __B, 6583 (__v4sf) _mm_setzero_ps (), 6584 (__mmask8) __U, 6585 _MM_FROUND_CUR_DIRECTION); 6586 } 6587 6588 #define _mm_maskz_scalef_round_ss(U, A, B, R) \ 6589 (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ 6590 (__v4sf)(__m128)(B), \ 6591 (__v4sf)_mm_setzero_ps(), \ 6592 (__mmask8)(U), \ 6593 (int)(R)) 6594 6595 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6596 _mm512_srai_epi32(__m512i __A, int __B) 6597 { 6598 return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); 6599 } 6600 6601 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6602 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) 6603 { 6604 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 6605 (__v16si)_mm512_srai_epi32(__A, __B), 6606 (__v16si)__W); 6607 } 6608 6609 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6610 _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) { 6611 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 6612 (__v16si)_mm512_srai_epi32(__A, __B), 6613 (__v16si)_mm512_setzero_si512()); 6614 } 6615 6616 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6617 _mm512_srai_epi64(__m512i __A, int __B) 6618 { 6619 return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); 6620 } 6621 6622 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6623 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) 6624 { 6625 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 6626 (__v8di)_mm512_srai_epi64(__A, __B), 6627 (__v8di)__W); 6628 } 6629 6630 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6631 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B) 6632 { 6633 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 6634 (__v8di)_mm512_srai_epi64(__A, __B), 6635 (__v8di)_mm512_setzero_si512()); 6636 } 6637 6638 #define _mm512_shuffle_f32x4(A, B, imm) \ 6639 (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ 6640 (__v16sf)(__m512)(B), (int)(imm)) 6641 6642 #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ 6643 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6644 (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ 6645 (__v16sf)(__m512)(W)) 6646 6647 #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ 6648 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6649 (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ 6650 (__v16sf)_mm512_setzero_ps()) 6651 6652 #define _mm512_shuffle_f64x2(A, B, imm) \ 6653 (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ 6654 (__v8df)(__m512d)(B), (int)(imm)) 6655 6656 #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ 6657 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6658 (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ 6659 (__v8df)(__m512d)(W)) 6660 6661 #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ 6662 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6663 (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ 6664 (__v8df)_mm512_setzero_pd()) 6665 6666 #define _mm512_shuffle_i32x4(A, B, imm) \ 6667 (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ 6668 (__v16si)(__m512i)(B), (int)(imm)) 6669 6670 #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ 6671 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 6672 (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ 6673 (__v16si)(__m512i)(W)) 6674 6675 #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ 6676 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 6677 (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ 6678 (__v16si)_mm512_setzero_si512()) 6679 6680 #define _mm512_shuffle_i64x2(A, B, imm) \ 6681 (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ 6682 (__v8di)(__m512i)(B), (int)(imm)) 6683 6684 #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ 6685 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 6686 (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ 6687 (__v8di)(__m512i)(W)) 6688 6689 #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ 6690 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 6691 (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ 6692 (__v8di)_mm512_setzero_si512()) 6693 6694 #define _mm512_shuffle_pd(A, B, M) \ 6695 (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ 6696 (__v8df)(__m512d)(B), (int)(M)) 6697 6698 #define _mm512_mask_shuffle_pd(W, U, A, B, M) \ 6699 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6700 (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ 6701 (__v8df)(__m512d)(W)) 6702 6703 #define _mm512_maskz_shuffle_pd(U, A, B, M) \ 6704 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 6705 (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ 6706 (__v8df)_mm512_setzero_pd()) 6707 6708 #define _mm512_shuffle_ps(A, B, M) \ 6709 (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ 6710 (__v16sf)(__m512)(B), (int)(M)) 6711 6712 #define _mm512_mask_shuffle_ps(W, U, A, B, M) \ 6713 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6714 (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ 6715 (__v16sf)(__m512)(W)) 6716 6717 #define _mm512_maskz_shuffle_ps(U, A, B, M) \ 6718 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 6719 (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ 6720 (__v16sf)_mm512_setzero_ps()) 6721 6722 #define _mm_sqrt_round_sd(A, B, R) \ 6723 (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6724 (__v2df)(__m128d)(B), \ 6725 (__v2df)_mm_setzero_pd(), \ 6726 (__mmask8)-1, (int)(R)) 6727 6728 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6729 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 6730 { 6731 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, 6732 (__v2df) __B, 6733 (__v2df) __W, 6734 (__mmask8) __U, 6735 _MM_FROUND_CUR_DIRECTION); 6736 } 6737 6738 #define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ 6739 (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6740 (__v2df)(__m128d)(B), \ 6741 (__v2df)(__m128d)(W), \ 6742 (__mmask8)(U), (int)(R)) 6743 6744 static __inline__ __m128d __DEFAULT_FN_ATTRS128 6745 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) 6746 { 6747 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, 6748 (__v2df) __B, 6749 (__v2df) _mm_setzero_pd (), 6750 (__mmask8) __U, 6751 _MM_FROUND_CUR_DIRECTION); 6752 } 6753 6754 #define _mm_maskz_sqrt_round_sd(U, A, B, R) \ 6755 (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ 6756 (__v2df)(__m128d)(B), \ 6757 (__v2df)_mm_setzero_pd(), \ 6758 (__mmask8)(U), (int)(R)) 6759 6760 #define _mm_sqrt_round_ss(A, B, R) \ 6761 (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6762 (__v4sf)(__m128)(B), \ 6763 (__v4sf)_mm_setzero_ps(), \ 6764 (__mmask8)-1, (int)(R)) 6765 6766 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6767 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 6768 { 6769 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, 6770 (__v4sf) __B, 6771 (__v4sf) __W, 6772 (__mmask8) __U, 6773 _MM_FROUND_CUR_DIRECTION); 6774 } 6775 6776 #define _mm_mask_sqrt_round_ss(W, U, A, B, R) \ 6777 (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6778 (__v4sf)(__m128)(B), \ 6779 (__v4sf)(__m128)(W), (__mmask8)(U), \ 6780 (int)(R)) 6781 6782 static __inline__ __m128 __DEFAULT_FN_ATTRS128 6783 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) 6784 { 6785 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, 6786 (__v4sf) __B, 6787 (__v4sf) _mm_setzero_ps (), 6788 (__mmask8) __U, 6789 _MM_FROUND_CUR_DIRECTION); 6790 } 6791 6792 #define _mm_maskz_sqrt_round_ss(U, A, B, R) \ 6793 (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ 6794 (__v4sf)(__m128)(B), \ 6795 (__v4sf)_mm_setzero_ps(), \ 6796 (__mmask8)(U), (int)(R)) 6797 6798 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6799 _mm512_broadcast_f32x4(__m128 __A) 6800 { 6801 return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 6802 0, 1, 2, 3, 0, 1, 2, 3, 6803 0, 1, 2, 3, 0, 1, 2, 3); 6804 } 6805 6806 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6807 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) 6808 { 6809 return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 6810 (__v16sf)_mm512_broadcast_f32x4(__A), 6811 (__v16sf)__O); 6812 } 6813 6814 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6815 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) 6816 { 6817 return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 6818 (__v16sf)_mm512_broadcast_f32x4(__A), 6819 (__v16sf)_mm512_setzero_ps()); 6820 } 6821 6822 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6823 _mm512_broadcast_f64x4(__m256d __A) 6824 { 6825 return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, 6826 0, 1, 2, 3, 0, 1, 2, 3); 6827 } 6828 6829 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6830 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) 6831 { 6832 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 6833 (__v8df)_mm512_broadcast_f64x4(__A), 6834 (__v8df)__O); 6835 } 6836 6837 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6838 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) 6839 { 6840 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 6841 (__v8df)_mm512_broadcast_f64x4(__A), 6842 (__v8df)_mm512_setzero_pd()); 6843 } 6844 6845 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6846 _mm512_broadcast_i32x4(__m128i __A) 6847 { 6848 return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 6849 0, 1, 2, 3, 0, 1, 2, 3, 6850 0, 1, 2, 3, 0, 1, 2, 3); 6851 } 6852 6853 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6854 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) 6855 { 6856 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 6857 (__v16si)_mm512_broadcast_i32x4(__A), 6858 (__v16si)__O); 6859 } 6860 6861 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6862 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) 6863 { 6864 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 6865 (__v16si)_mm512_broadcast_i32x4(__A), 6866 (__v16si)_mm512_setzero_si512()); 6867 } 6868 6869 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6870 _mm512_broadcast_i64x4(__m256i __A) 6871 { 6872 return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, 6873 0, 1, 2, 3, 0, 1, 2, 3); 6874 } 6875 6876 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6877 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) 6878 { 6879 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 6880 (__v8di)_mm512_broadcast_i64x4(__A), 6881 (__v8di)__O); 6882 } 6883 6884 static __inline__ __m512i __DEFAULT_FN_ATTRS512 6885 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) 6886 { 6887 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 6888 (__v8di)_mm512_broadcast_i64x4(__A), 6889 (__v8di)_mm512_setzero_si512()); 6890 } 6891 6892 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6893 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) 6894 { 6895 return (__m512d)__builtin_ia32_selectpd_512(__M, 6896 (__v8df) _mm512_broadcastsd_pd(__A), 6897 (__v8df) __O); 6898 } 6899 6900 static __inline__ __m512d __DEFAULT_FN_ATTRS512 6901 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) 6902 { 6903 return (__m512d)__builtin_ia32_selectpd_512(__M, 6904 (__v8df) _mm512_broadcastsd_pd(__A), 6905 (__v8df) _mm512_setzero_pd()); 6906 } 6907 6908 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6909 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) 6910 { 6911 return (__m512)__builtin_ia32_selectps_512(__M, 6912 (__v16sf) _mm512_broadcastss_ps(__A), 6913 (__v16sf) __O); 6914 } 6915 6916 static __inline__ __m512 __DEFAULT_FN_ATTRS512 6917 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) 6918 { 6919 return (__m512)__builtin_ia32_selectps_512(__M, 6920 (__v16sf) _mm512_broadcastss_ps(__A), 6921 (__v16sf) _mm512_setzero_ps()); 6922 } 6923 6924 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6925 _mm512_cvtsepi32_epi8 (__m512i __A) 6926 { 6927 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6928 (__v16qi) _mm_undefined_si128 (), 6929 (__mmask16) -1); 6930 } 6931 6932 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6933 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 6934 { 6935 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6936 (__v16qi) __O, __M); 6937 } 6938 6939 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6940 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) 6941 { 6942 return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, 6943 (__v16qi) _mm_setzero_si128 (), 6944 __M); 6945 } 6946 6947 static __inline__ void __DEFAULT_FN_ATTRS512 6948 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 6949 { 6950 __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 6951 } 6952 6953 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6954 _mm512_cvtsepi32_epi16 (__m512i __A) 6955 { 6956 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6957 (__v16hi) _mm256_undefined_si256 (), 6958 (__mmask16) -1); 6959 } 6960 6961 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6962 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 6963 { 6964 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6965 (__v16hi) __O, __M); 6966 } 6967 6968 static __inline__ __m256i __DEFAULT_FN_ATTRS512 6969 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) 6970 { 6971 return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, 6972 (__v16hi) _mm256_setzero_si256 (), 6973 __M); 6974 } 6975 6976 static __inline__ void __DEFAULT_FN_ATTRS512 6977 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) 6978 { 6979 __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); 6980 } 6981 6982 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6983 _mm512_cvtsepi64_epi8 (__m512i __A) 6984 { 6985 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 6986 (__v16qi) _mm_undefined_si128 (), 6987 (__mmask8) -1); 6988 } 6989 6990 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6991 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 6992 { 6993 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 6994 (__v16qi) __O, __M); 6995 } 6996 6997 static __inline__ __m128i __DEFAULT_FN_ATTRS512 6998 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) 6999 { 7000 return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, 7001 (__v16qi) _mm_setzero_si128 (), 7002 __M); 7003 } 7004 7005 static __inline__ void __DEFAULT_FN_ATTRS512 7006 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7007 { 7008 __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7009 } 7010 7011 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7012 _mm512_cvtsepi64_epi32 (__m512i __A) 7013 { 7014 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7015 (__v8si) _mm256_undefined_si256 (), 7016 (__mmask8) -1); 7017 } 7018 7019 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7020 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7021 { 7022 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7023 (__v8si) __O, __M); 7024 } 7025 7026 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7027 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) 7028 { 7029 return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, 7030 (__v8si) _mm256_setzero_si256 (), 7031 __M); 7032 } 7033 7034 static __inline__ void __DEFAULT_FN_ATTRS512 7035 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) 7036 { 7037 __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); 7038 } 7039 7040 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7041 _mm512_cvtsepi64_epi16 (__m512i __A) 7042 { 7043 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7044 (__v8hi) _mm_undefined_si128 (), 7045 (__mmask8) -1); 7046 } 7047 7048 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7049 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7050 { 7051 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7052 (__v8hi) __O, __M); 7053 } 7054 7055 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7056 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) 7057 { 7058 return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, 7059 (__v8hi) _mm_setzero_si128 (), 7060 __M); 7061 } 7062 7063 static __inline__ void __DEFAULT_FN_ATTRS512 7064 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) 7065 { 7066 __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); 7067 } 7068 7069 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7070 _mm512_cvtusepi32_epi8 (__m512i __A) 7071 { 7072 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7073 (__v16qi) _mm_undefined_si128 (), 7074 (__mmask16) -1); 7075 } 7076 7077 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7078 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 7079 { 7080 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7081 (__v16qi) __O, 7082 __M); 7083 } 7084 7085 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7086 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) 7087 { 7088 return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, 7089 (__v16qi) _mm_setzero_si128 (), 7090 __M); 7091 } 7092 7093 static __inline__ void __DEFAULT_FN_ATTRS512 7094 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 7095 { 7096 __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 7097 } 7098 7099 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7100 _mm512_cvtusepi32_epi16 (__m512i __A) 7101 { 7102 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7103 (__v16hi) _mm256_undefined_si256 (), 7104 (__mmask16) -1); 7105 } 7106 7107 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7108 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 7109 { 7110 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7111 (__v16hi) __O, 7112 __M); 7113 } 7114 7115 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7116 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) 7117 { 7118 return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, 7119 (__v16hi) _mm256_setzero_si256 (), 7120 __M); 7121 } 7122 7123 static __inline__ void __DEFAULT_FN_ATTRS512 7124 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) 7125 { 7126 __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); 7127 } 7128 7129 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7130 _mm512_cvtusepi64_epi8 (__m512i __A) 7131 { 7132 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7133 (__v16qi) _mm_undefined_si128 (), 7134 (__mmask8) -1); 7135 } 7136 7137 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7138 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7139 { 7140 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7141 (__v16qi) __O, 7142 __M); 7143 } 7144 7145 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7146 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) 7147 { 7148 return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, 7149 (__v16qi) _mm_setzero_si128 (), 7150 __M); 7151 } 7152 7153 static __inline__ void __DEFAULT_FN_ATTRS512 7154 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7155 { 7156 __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7157 } 7158 7159 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7160 _mm512_cvtusepi64_epi32 (__m512i __A) 7161 { 7162 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7163 (__v8si) _mm256_undefined_si256 (), 7164 (__mmask8) -1); 7165 } 7166 7167 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7168 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7169 { 7170 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7171 (__v8si) __O, __M); 7172 } 7173 7174 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7175 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) 7176 { 7177 return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, 7178 (__v8si) _mm256_setzero_si256 (), 7179 __M); 7180 } 7181 7182 static __inline__ void __DEFAULT_FN_ATTRS512 7183 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) 7184 { 7185 __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); 7186 } 7187 7188 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7189 _mm512_cvtusepi64_epi16 (__m512i __A) 7190 { 7191 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7192 (__v8hi) _mm_undefined_si128 (), 7193 (__mmask8) -1); 7194 } 7195 7196 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7197 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7198 { 7199 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7200 (__v8hi) __O, __M); 7201 } 7202 7203 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7204 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) 7205 { 7206 return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, 7207 (__v8hi) _mm_setzero_si128 (), 7208 __M); 7209 } 7210 7211 static __inline__ void __DEFAULT_FN_ATTRS512 7212 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) 7213 { 7214 __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); 7215 } 7216 7217 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7218 _mm512_cvtepi32_epi8 (__m512i __A) 7219 { 7220 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7221 (__v16qi) _mm_undefined_si128 (), 7222 (__mmask16) -1); 7223 } 7224 7225 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7226 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) 7227 { 7228 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7229 (__v16qi) __O, __M); 7230 } 7231 7232 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7233 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) 7234 { 7235 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, 7236 (__v16qi) _mm_setzero_si128 (), 7237 __M); 7238 } 7239 7240 static __inline__ void __DEFAULT_FN_ATTRS512 7241 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) 7242 { 7243 __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); 7244 } 7245 7246 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7247 _mm512_cvtepi32_epi16 (__m512i __A) 7248 { 7249 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7250 (__v16hi) _mm256_undefined_si256 (), 7251 (__mmask16) -1); 7252 } 7253 7254 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7255 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) 7256 { 7257 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7258 (__v16hi) __O, __M); 7259 } 7260 7261 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7262 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) 7263 { 7264 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, 7265 (__v16hi) _mm256_setzero_si256 (), 7266 __M); 7267 } 7268 7269 static __inline__ void __DEFAULT_FN_ATTRS512 7270 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) 7271 { 7272 __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); 7273 } 7274 7275 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7276 _mm512_cvtepi64_epi8 (__m512i __A) 7277 { 7278 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7279 (__v16qi) _mm_undefined_si128 (), 7280 (__mmask8) -1); 7281 } 7282 7283 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7284 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) 7285 { 7286 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7287 (__v16qi) __O, __M); 7288 } 7289 7290 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7291 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) 7292 { 7293 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, 7294 (__v16qi) _mm_setzero_si128 (), 7295 __M); 7296 } 7297 7298 static __inline__ void __DEFAULT_FN_ATTRS512 7299 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) 7300 { 7301 __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); 7302 } 7303 7304 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7305 _mm512_cvtepi64_epi32 (__m512i __A) 7306 { 7307 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7308 (__v8si) _mm256_undefined_si256 (), 7309 (__mmask8) -1); 7310 } 7311 7312 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7313 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) 7314 { 7315 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7316 (__v8si) __O, __M); 7317 } 7318 7319 static __inline__ __m256i __DEFAULT_FN_ATTRS512 7320 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) 7321 { 7322 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, 7323 (__v8si) _mm256_setzero_si256 (), 7324 __M); 7325 } 7326 7327 static __inline__ void __DEFAULT_FN_ATTRS512 7328 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) 7329 { 7330 __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); 7331 } 7332 7333 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7334 _mm512_cvtepi64_epi16 (__m512i __A) 7335 { 7336 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7337 (__v8hi) _mm_undefined_si128 (), 7338 (__mmask8) -1); 7339 } 7340 7341 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7342 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) 7343 { 7344 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7345 (__v8hi) __O, __M); 7346 } 7347 7348 static __inline__ __m128i __DEFAULT_FN_ATTRS512 7349 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) 7350 { 7351 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, 7352 (__v8hi) _mm_setzero_si128 (), 7353 __M); 7354 } 7355 7356 static __inline__ void __DEFAULT_FN_ATTRS512 7357 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) 7358 { 7359 __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); 7360 } 7361 7362 #define _mm512_extracti32x4_epi32(A, imm) \ 7363 (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7364 (__v4si)_mm_undefined_si128(), \ 7365 (__mmask8)-1) 7366 7367 #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ 7368 (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7369 (__v4si)(__m128i)(W), \ 7370 (__mmask8)(U)) 7371 7372 #define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ 7373 (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ 7374 (__v4si)_mm_setzero_si128(), \ 7375 (__mmask8)(U)) 7376 7377 #define _mm512_extracti64x4_epi64(A, imm) \ 7378 (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7379 (__v4di)_mm256_undefined_si256(), \ 7380 (__mmask8)-1) 7381 7382 #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ 7383 (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7384 (__v4di)(__m256i)(W), \ 7385 (__mmask8)(U)) 7386 7387 #define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ 7388 (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ 7389 (__v4di)_mm256_setzero_si256(), \ 7390 (__mmask8)(U)) 7391 7392 #define _mm512_insertf64x4(A, B, imm) \ 7393 (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ 7394 (__v4df)(__m256d)(B), (int)(imm)) 7395 7396 #define _mm512_mask_insertf64x4(W, U, A, B, imm) \ 7397 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 7398 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ 7399 (__v8df)(__m512d)(W)) 7400 7401 #define _mm512_maskz_insertf64x4(U, A, B, imm) \ 7402 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 7403 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ 7404 (__v8df)_mm512_setzero_pd()) 7405 7406 #define _mm512_inserti64x4(A, B, imm) \ 7407 (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ 7408 (__v4di)(__m256i)(B), (int)(imm)) 7409 7410 #define _mm512_mask_inserti64x4(W, U, A, B, imm) \ 7411 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 7412 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ 7413 (__v8di)(__m512i)(W)) 7414 7415 #define _mm512_maskz_inserti64x4(U, A, B, imm) \ 7416 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 7417 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ 7418 (__v8di)_mm512_setzero_si512()) 7419 7420 #define _mm512_insertf32x4(A, B, imm) \ 7421 (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ 7422 (__v4sf)(__m128)(B), (int)(imm)) 7423 7424 #define _mm512_mask_insertf32x4(W, U, A, B, imm) \ 7425 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 7426 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ 7427 (__v16sf)(__m512)(W)) 7428 7429 #define _mm512_maskz_insertf32x4(U, A, B, imm) \ 7430 (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 7431 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ 7432 (__v16sf)_mm512_setzero_ps()) 7433 7434 #define _mm512_inserti32x4(A, B, imm) \ 7435 (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ 7436 (__v4si)(__m128i)(B), (int)(imm)) 7437 7438 #define _mm512_mask_inserti32x4(W, U, A, B, imm) \ 7439 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 7440 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ 7441 (__v16si)(__m512i)(W)) 7442 7443 #define _mm512_maskz_inserti32x4(U, A, B, imm) \ 7444 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 7445 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ 7446 (__v16si)_mm512_setzero_si512()) 7447 7448 #define _mm512_getmant_round_pd(A, B, C, R) \ 7449 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7450 (int)(((C)<<2) | (B)), \ 7451 (__v8df)_mm512_undefined_pd(), \ 7452 (__mmask8)-1, (int)(R)) 7453 7454 #define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ 7455 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7456 (int)(((C)<<2) | (B)), \ 7457 (__v8df)(__m512d)(W), \ 7458 (__mmask8)(U), (int)(R)) 7459 7460 #define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ 7461 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7462 (int)(((C)<<2) | (B)), \ 7463 (__v8df)_mm512_setzero_pd(), \ 7464 (__mmask8)(U), (int)(R)) 7465 7466 #define _mm512_getmant_pd(A, B, C) \ 7467 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7468 (int)(((C)<<2) | (B)), \ 7469 (__v8df)_mm512_setzero_pd(), \ 7470 (__mmask8)-1, \ 7471 _MM_FROUND_CUR_DIRECTION) 7472 7473 #define _mm512_mask_getmant_pd(W, U, A, B, C) \ 7474 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7475 (int)(((C)<<2) | (B)), \ 7476 (__v8df)(__m512d)(W), \ 7477 (__mmask8)(U), \ 7478 _MM_FROUND_CUR_DIRECTION) 7479 7480 #define _mm512_maskz_getmant_pd(U, A, B, C) \ 7481 (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ 7482 (int)(((C)<<2) | (B)), \ 7483 (__v8df)_mm512_setzero_pd(), \ 7484 (__mmask8)(U), \ 7485 _MM_FROUND_CUR_DIRECTION) 7486 7487 #define _mm512_getmant_round_ps(A, B, C, R) \ 7488 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7489 (int)(((C)<<2) | (B)), \ 7490 (__v16sf)_mm512_undefined_ps(), \ 7491 (__mmask16)-1, (int)(R)) 7492 7493 #define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ 7494 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7495 (int)(((C)<<2) | (B)), \ 7496 (__v16sf)(__m512)(W), \ 7497 (__mmask16)(U), (int)(R)) 7498 7499 #define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ 7500 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7501 (int)(((C)<<2) | (B)), \ 7502 (__v16sf)_mm512_setzero_ps(), \ 7503 (__mmask16)(U), (int)(R)) 7504 7505 #define _mm512_getmant_ps(A, B, C) \ 7506 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7507 (int)(((C)<<2)|(B)), \ 7508 (__v16sf)_mm512_undefined_ps(), \ 7509 (__mmask16)-1, \ 7510 _MM_FROUND_CUR_DIRECTION) 7511 7512 #define _mm512_mask_getmant_ps(W, U, A, B, C) \ 7513 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7514 (int)(((C)<<2)|(B)), \ 7515 (__v16sf)(__m512)(W), \ 7516 (__mmask16)(U), \ 7517 _MM_FROUND_CUR_DIRECTION) 7518 7519 #define _mm512_maskz_getmant_ps(U, A, B, C) \ 7520 (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ 7521 (int)(((C)<<2)|(B)), \ 7522 (__v16sf)_mm512_setzero_ps(), \ 7523 (__mmask16)(U), \ 7524 _MM_FROUND_CUR_DIRECTION) 7525 7526 #define _mm512_getexp_round_pd(A, R) \ 7527 (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7528 (__v8df)_mm512_undefined_pd(), \ 7529 (__mmask8)-1, (int)(R)) 7530 7531 #define _mm512_mask_getexp_round_pd(W, U, A, R) \ 7532 (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7533 (__v8df)(__m512d)(W), \ 7534 (__mmask8)(U), (int)(R)) 7535 7536 #define _mm512_maskz_getexp_round_pd(U, A, R) \ 7537 (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ 7538 (__v8df)_mm512_setzero_pd(), \ 7539 (__mmask8)(U), (int)(R)) 7540 7541 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7542 _mm512_getexp_pd (__m512d __A) 7543 { 7544 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7545 (__v8df) _mm512_undefined_pd (), 7546 (__mmask8) -1, 7547 _MM_FROUND_CUR_DIRECTION); 7548 } 7549 7550 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7551 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) 7552 { 7553 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7554 (__v8df) __W, 7555 (__mmask8) __U, 7556 _MM_FROUND_CUR_DIRECTION); 7557 } 7558 7559 static __inline__ __m512d __DEFAULT_FN_ATTRS512 7560 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) 7561 { 7562 return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, 7563 (__v8df) _mm512_setzero_pd (), 7564 (__mmask8) __U, 7565 _MM_FROUND_CUR_DIRECTION); 7566 } 7567 7568 #define _mm512_getexp_round_ps(A, R) \ 7569 (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7570 (__v16sf)_mm512_undefined_ps(), \ 7571 (__mmask16)-1, (int)(R)) 7572 7573 #define _mm512_mask_getexp_round_ps(W, U, A, R) \ 7574 (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7575 (__v16sf)(__m512)(W), \ 7576 (__mmask16)(U), (int)(R)) 7577 7578 #define _mm512_maskz_getexp_round_ps(U, A, R) \ 7579 (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ 7580 (__v16sf)_mm512_setzero_ps(), \ 7581 (__mmask16)(U), (int)(R)) 7582 7583 static __inline__ __m512 __DEFAULT_FN_ATTRS512 7584 _mm512_getexp_ps (__m512 __A) 7585 { 7586 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7587 (__v16sf) _mm512_undefined_ps (), 7588 (__mmask16) -1, 7589 _MM_FROUND_CUR_DIRECTION); 7590 } 7591 7592 static __inline__ __m512 __DEFAULT_FN_ATTRS512 7593 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) 7594 { 7595 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7596 (__v16sf) __W, 7597 (__mmask16) __U, 7598 _MM_FROUND_CUR_DIRECTION); 7599 } 7600 7601 static __inline__ __m512 __DEFAULT_FN_ATTRS512 7602 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) 7603 { 7604 return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, 7605 (__v16sf) _mm512_setzero_ps (), 7606 (__mmask16) __U, 7607 _MM_FROUND_CUR_DIRECTION); 7608 } 7609 7610 #define _mm512_i64gather_ps(index, addr, scale) \ 7611 (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ 7612 (void const *)(addr), \ 7613 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7614 (int)(scale)) 7615 7616 #define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ 7617 (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ 7618 (void const *)(addr), \ 7619 (__v8di)(__m512i)(index), \ 7620 (__mmask8)(mask), (int)(scale)) 7621 7622 #define _mm512_i64gather_epi32(index, addr, scale) \ 7623 (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ 7624 (void const *)(addr), \ 7625 (__v8di)(__m512i)(index), \ 7626 (__mmask8)-1, (int)(scale)) 7627 7628 #define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ 7629 (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ 7630 (void const *)(addr), \ 7631 (__v8di)(__m512i)(index), \ 7632 (__mmask8)(mask), (int)(scale)) 7633 7634 #define _mm512_i64gather_pd(index, addr, scale) \ 7635 (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ 7636 (void const *)(addr), \ 7637 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7638 (int)(scale)) 7639 7640 #define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ 7641 (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ 7642 (void const *)(addr), \ 7643 (__v8di)(__m512i)(index), \ 7644 (__mmask8)(mask), (int)(scale)) 7645 7646 #define _mm512_i64gather_epi64(index, addr, scale) \ 7647 (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ 7648 (void const *)(addr), \ 7649 (__v8di)(__m512i)(index), (__mmask8)-1, \ 7650 (int)(scale)) 7651 7652 #define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ 7653 (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ 7654 (void const *)(addr), \ 7655 (__v8di)(__m512i)(index), \ 7656 (__mmask8)(mask), (int)(scale)) 7657 7658 #define _mm512_i32gather_ps(index, addr, scale) \ 7659 (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ 7660 (void const *)(addr), \ 7661 (__v16si)(__m512)(index), \ 7662 (__mmask16)-1, (int)(scale)) 7663 7664 #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ 7665 (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ 7666 (void const *)(addr), \ 7667 (__v16si)(__m512)(index), \ 7668 (__mmask16)(mask), (int)(scale)) 7669 7670 #define _mm512_i32gather_epi32(index, addr, scale) \ 7671 (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ 7672 (void const *)(addr), \ 7673 (__v16si)(__m512i)(index), \ 7674 (__mmask16)-1, (int)(scale)) 7675 7676 #define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ 7677 (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ 7678 (void const *)(addr), \ 7679 (__v16si)(__m512i)(index), \ 7680 (__mmask16)(mask), (int)(scale)) 7681 7682 #define _mm512_i32gather_pd(index, addr, scale) \ 7683 (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ 7684 (void const *)(addr), \ 7685 (__v8si)(__m256i)(index), (__mmask8)-1, \ 7686 (int)(scale)) 7687 7688 #define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ 7689 (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ 7690 (void const *)(addr), \ 7691 (__v8si)(__m256i)(index), \ 7692 (__mmask8)(mask), (int)(scale)) 7693 7694 #define _mm512_i32gather_epi64(index, addr, scale) \ 7695 (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ 7696 (void const *)(addr), \ 7697 (__v8si)(__m256i)(index), (__mmask8)-1, \ 7698 (int)(scale)) 7699 7700 #define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ 7701 (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ 7702 (void const *)(addr), \ 7703 (__v8si)(__m256i)(index), \ 7704 (__mmask8)(mask), (int)(scale)) 7705 7706 #define _mm512_i64scatter_ps(addr, index, v1, scale) \ 7707 __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ 7708 (__v8di)(__m512i)(index), \ 7709 (__v8sf)(__m256)(v1), (int)(scale)) 7710 7711 #define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ 7712 __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \ 7713 (__v8di)(__m512i)(index), \ 7714 (__v8sf)(__m256)(v1), (int)(scale)) 7715 7716 #define _mm512_i64scatter_epi32(addr, index, v1, scale) \ 7717 __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \ 7718 (__v8di)(__m512i)(index), \ 7719 (__v8si)(__m256i)(v1), (int)(scale)) 7720 7721 #define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ 7722 __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \ 7723 (__v8di)(__m512i)(index), \ 7724 (__v8si)(__m256i)(v1), (int)(scale)) 7725 7726 #define _mm512_i64scatter_pd(addr, index, v1, scale) \ 7727 __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \ 7728 (__v8di)(__m512i)(index), \ 7729 (__v8df)(__m512d)(v1), (int)(scale)) 7730 7731 #define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ 7732 __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \ 7733 (__v8di)(__m512i)(index), \ 7734 (__v8df)(__m512d)(v1), (int)(scale)) 7735 7736 #define _mm512_i64scatter_epi64(addr, index, v1, scale) \ 7737 __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \ 7738 (__v8di)(__m512i)(index), \ 7739 (__v8di)(__m512i)(v1), (int)(scale)) 7740 7741 #define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ 7742 __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \ 7743 (__v8di)(__m512i)(index), \ 7744 (__v8di)(__m512i)(v1), (int)(scale)) 7745 7746 #define _mm512_i32scatter_ps(addr, index, v1, scale) \ 7747 __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \ 7748 (__v16si)(__m512i)(index), \ 7749 (__v16sf)(__m512)(v1), (int)(scale)) 7750 7751 #define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ 7752 __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \ 7753 (__v16si)(__m512i)(index), \ 7754 (__v16sf)(__m512)(v1), (int)(scale)) 7755 7756 #define _mm512_i32scatter_epi32(addr, index, v1, scale) \ 7757 __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \ 7758 (__v16si)(__m512i)(index), \ 7759 (__v16si)(__m512i)(v1), (int)(scale)) 7760 7761 #define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ 7762 __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \ 7763 (__v16si)(__m512i)(index), \ 7764 (__v16si)(__m512i)(v1), (int)(scale)) 7765 7766 #define _mm512_i32scatter_pd(addr, index, v1, scale) \ 7767 __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \ 7768 (__v8si)(__m256i)(index), \ 7769 (__v8df)(__m512d)(v1), (int)(scale)) 7770 7771 #define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ 7772 __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \ 7773 (__v8si)(__m256i)(index), \ 7774 (__v8df)(__m512d)(v1), (int)(scale)) 7775 7776 #define _mm512_i32scatter_epi64(addr, index, v1, scale) \ 7777 __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \ 7778 (__v8si)(__m256i)(index), \ 7779 (__v8di)(__m512i)(v1), (int)(scale)) 7780 7781 #define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ 7782 __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \ 7783 (__v8si)(__m256i)(index), \ 7784 (__v8di)(__m512i)(v1), (int)(scale)) 7785 7786 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7787 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7788 { 7789 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7790 (__v4sf)__A, 7791 (__v4sf)__B, 7792 (__mmask8)__U, 7793 _MM_FROUND_CUR_DIRECTION); 7794 } 7795 7796 #define _mm_fmadd_round_ss(A, B, C, R) \ 7797 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7798 (__v4sf)(__m128)(B), \ 7799 (__v4sf)(__m128)(C), (__mmask8)-1, \ 7800 (int)(R)) 7801 7802 #define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ 7803 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7804 (__v4sf)(__m128)(A), \ 7805 (__v4sf)(__m128)(B), (__mmask8)(U), \ 7806 (int)(R)) 7807 7808 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7809 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7810 { 7811 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7812 (__v4sf)__B, 7813 (__v4sf)__C, 7814 (__mmask8)__U, 7815 _MM_FROUND_CUR_DIRECTION); 7816 } 7817 7818 #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ 7819 (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7820 (__v4sf)(__m128)(B), \ 7821 (__v4sf)(__m128)(C), (__mmask8)(U), \ 7822 (int)(R)) 7823 7824 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7825 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7826 { 7827 return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, 7828 (__v4sf)__X, 7829 (__v4sf)__Y, 7830 (__mmask8)__U, 7831 _MM_FROUND_CUR_DIRECTION); 7832 } 7833 7834 #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ 7835 (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ 7836 (__v4sf)(__m128)(X), \ 7837 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7838 (int)(R)) 7839 7840 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7841 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7842 { 7843 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7844 (__v4sf)__A, 7845 -(__v4sf)__B, 7846 (__mmask8)__U, 7847 _MM_FROUND_CUR_DIRECTION); 7848 } 7849 7850 #define _mm_fmsub_round_ss(A, B, C, R) \ 7851 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7852 (__v4sf)(__m128)(B), \ 7853 -(__v4sf)(__m128)(C), (__mmask8)-1, \ 7854 (int)(R)) 7855 7856 #define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ 7857 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7858 (__v4sf)(__m128)(A), \ 7859 -(__v4sf)(__m128)(B), (__mmask8)(U), \ 7860 (int)(R)) 7861 7862 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7863 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7864 { 7865 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7866 (__v4sf)__B, 7867 -(__v4sf)__C, 7868 (__mmask8)__U, 7869 _MM_FROUND_CUR_DIRECTION); 7870 } 7871 7872 #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ 7873 (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7874 (__v4sf)(__m128)(B), \ 7875 -(__v4sf)(__m128)(C), (__mmask8)(U), \ 7876 (int)(R)) 7877 7878 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7879 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7880 { 7881 return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, 7882 (__v4sf)__X, 7883 (__v4sf)__Y, 7884 (__mmask8)__U, 7885 _MM_FROUND_CUR_DIRECTION); 7886 } 7887 7888 #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ 7889 (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ 7890 (__v4sf)(__m128)(X), \ 7891 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7892 (int)(R)) 7893 7894 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7895 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7896 { 7897 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7898 -(__v4sf)__A, 7899 (__v4sf)__B, 7900 (__mmask8)__U, 7901 _MM_FROUND_CUR_DIRECTION); 7902 } 7903 7904 #define _mm_fnmadd_round_ss(A, B, C, R) \ 7905 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7906 -(__v4sf)(__m128)(B), \ 7907 (__v4sf)(__m128)(C), (__mmask8)-1, \ 7908 (int)(R)) 7909 7910 #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ 7911 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7912 -(__v4sf)(__m128)(A), \ 7913 (__v4sf)(__m128)(B), (__mmask8)(U), \ 7914 (int)(R)) 7915 7916 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7917 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7918 { 7919 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7920 -(__v4sf)__B, 7921 (__v4sf)__C, 7922 (__mmask8)__U, 7923 _MM_FROUND_CUR_DIRECTION); 7924 } 7925 7926 #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ 7927 (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7928 -(__v4sf)(__m128)(B), \ 7929 (__v4sf)(__m128)(C), (__mmask8)(U), \ 7930 (int)(R)) 7931 7932 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7933 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7934 { 7935 return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, 7936 -(__v4sf)__X, 7937 (__v4sf)__Y, 7938 (__mmask8)__U, 7939 _MM_FROUND_CUR_DIRECTION); 7940 } 7941 7942 #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ 7943 (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ 7944 -(__v4sf)(__m128)(X), \ 7945 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 7946 (int)(R)) 7947 7948 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7949 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 7950 { 7951 return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, 7952 -(__v4sf)__A, 7953 -(__v4sf)__B, 7954 (__mmask8)__U, 7955 _MM_FROUND_CUR_DIRECTION); 7956 } 7957 7958 #define _mm_fnmsub_round_ss(A, B, C, R) \ 7959 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ 7960 -(__v4sf)(__m128)(B), \ 7961 -(__v4sf)(__m128)(C), (__mmask8)-1, \ 7962 (int)(R)) 7963 7964 #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ 7965 (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ 7966 -(__v4sf)(__m128)(A), \ 7967 -(__v4sf)(__m128)(B), (__mmask8)(U), \ 7968 (int)(R)) 7969 7970 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7971 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) 7972 { 7973 return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, 7974 -(__v4sf)__B, 7975 -(__v4sf)__C, 7976 (__mmask8)__U, 7977 _MM_FROUND_CUR_DIRECTION); 7978 } 7979 7980 #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ 7981 (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ 7982 -(__v4sf)(__m128)(B), \ 7983 -(__v4sf)(__m128)(C), (__mmask8)(U), \ 7984 (int)(R)) 7985 7986 static __inline__ __m128 __DEFAULT_FN_ATTRS128 7987 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 7988 { 7989 return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, 7990 -(__v4sf)__X, 7991 (__v4sf)__Y, 7992 (__mmask8)__U, 7993 _MM_FROUND_CUR_DIRECTION); 7994 } 7995 7996 #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ 7997 (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ 7998 -(__v4sf)(__m128)(X), \ 7999 (__v4sf)(__m128)(Y), (__mmask8)(U), \ 8000 (int)(R)) 8001 8002 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8003 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8004 { 8005 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8006 (__v2df)__A, 8007 (__v2df)__B, 8008 (__mmask8)__U, 8009 _MM_FROUND_CUR_DIRECTION); 8010 } 8011 8012 #define _mm_fmadd_round_sd(A, B, C, R) \ 8013 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8014 (__v2df)(__m128d)(B), \ 8015 (__v2df)(__m128d)(C), (__mmask8)-1, \ 8016 (int)(R)) 8017 8018 #define _mm_mask_fmadd_round_sd(W, U, A, B, R) \ 8019 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8020 (__v2df)(__m128d)(A), \ 8021 (__v2df)(__m128d)(B), (__mmask8)(U), \ 8022 (int)(R)) 8023 8024 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8025 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8026 { 8027 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8028 (__v2df)__B, 8029 (__v2df)__C, 8030 (__mmask8)__U, 8031 _MM_FROUND_CUR_DIRECTION); 8032 } 8033 8034 #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ 8035 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8036 (__v2df)(__m128d)(B), \ 8037 (__v2df)(__m128d)(C), (__mmask8)(U), \ 8038 (int)(R)) 8039 8040 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8041 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8042 { 8043 return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, 8044 (__v2df)__X, 8045 (__v2df)__Y, 8046 (__mmask8)__U, 8047 _MM_FROUND_CUR_DIRECTION); 8048 } 8049 8050 #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ 8051 (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8052 (__v2df)(__m128d)(X), \ 8053 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8054 (int)(R)) 8055 8056 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8057 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8058 { 8059 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8060 (__v2df)__A, 8061 -(__v2df)__B, 8062 (__mmask8)__U, 8063 _MM_FROUND_CUR_DIRECTION); 8064 } 8065 8066 #define _mm_fmsub_round_sd(A, B, C, R) \ 8067 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8068 (__v2df)(__m128d)(B), \ 8069 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8070 (int)(R)) 8071 8072 #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ 8073 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8074 (__v2df)(__m128d)(A), \ 8075 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8076 (int)(R)) 8077 8078 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8079 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8080 { 8081 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8082 (__v2df)__B, 8083 -(__v2df)__C, 8084 (__mmask8)__U, 8085 _MM_FROUND_CUR_DIRECTION); 8086 } 8087 8088 #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ 8089 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8090 (__v2df)(__m128d)(B), \ 8091 -(__v2df)(__m128d)(C), \ 8092 (__mmask8)(U), (int)(R)) 8093 8094 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8095 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8096 { 8097 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8098 (__v2df)__X, 8099 (__v2df)__Y, 8100 (__mmask8)__U, 8101 _MM_FROUND_CUR_DIRECTION); 8102 } 8103 8104 #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ 8105 (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8106 (__v2df)(__m128d)(X), \ 8107 (__v2df)(__m128d)(Y), \ 8108 (__mmask8)(U), (int)(R)) 8109 8110 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8111 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8112 { 8113 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8114 -(__v2df)__A, 8115 (__v2df)__B, 8116 (__mmask8)__U, 8117 _MM_FROUND_CUR_DIRECTION); 8118 } 8119 8120 #define _mm_fnmadd_round_sd(A, B, C, R) \ 8121 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8122 -(__v2df)(__m128d)(B), \ 8123 (__v2df)(__m128d)(C), (__mmask8)-1, \ 8124 (int)(R)) 8125 8126 #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ 8127 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8128 -(__v2df)(__m128d)(A), \ 8129 (__v2df)(__m128d)(B), (__mmask8)(U), \ 8130 (int)(R)) 8131 8132 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8133 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8134 { 8135 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8136 -(__v2df)__B, 8137 (__v2df)__C, 8138 (__mmask8)__U, 8139 _MM_FROUND_CUR_DIRECTION); 8140 } 8141 8142 #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ 8143 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8144 -(__v2df)(__m128d)(B), \ 8145 (__v2df)(__m128d)(C), (__mmask8)(U), \ 8146 (int)(R)) 8147 8148 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8149 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8150 { 8151 return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, 8152 -(__v2df)__X, 8153 (__v2df)__Y, 8154 (__mmask8)__U, 8155 _MM_FROUND_CUR_DIRECTION); 8156 } 8157 8158 #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ 8159 (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ 8160 -(__v2df)(__m128d)(X), \ 8161 (__v2df)(__m128d)(Y), (__mmask8)(U), \ 8162 (int)(R)) 8163 8164 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8165 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8166 { 8167 return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, 8168 -(__v2df)__A, 8169 -(__v2df)__B, 8170 (__mmask8)__U, 8171 _MM_FROUND_CUR_DIRECTION); 8172 } 8173 8174 #define _mm_fnmsub_round_sd(A, B, C, R) \ 8175 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ 8176 -(__v2df)(__m128d)(B), \ 8177 -(__v2df)(__m128d)(C), (__mmask8)-1, \ 8178 (int)(R)) 8179 8180 #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ 8181 (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ 8182 -(__v2df)(__m128d)(A), \ 8183 -(__v2df)(__m128d)(B), (__mmask8)(U), \ 8184 (int)(R)) 8185 8186 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8187 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) 8188 { 8189 return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, 8190 -(__v2df)__B, 8191 -(__v2df)__C, 8192 (__mmask8)__U, 8193 _MM_FROUND_CUR_DIRECTION); 8194 } 8195 8196 #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ 8197 (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ 8198 -(__v2df)(__m128d)(B), \ 8199 -(__v2df)(__m128d)(C), \ 8200 (__mmask8)(U), \ 8201 (int)(R)) 8202 8203 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8204 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) 8205 { 8206 return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, 8207 -(__v2df)__X, 8208 (__v2df)__Y, 8209 (__mmask8)__U, 8210 _MM_FROUND_CUR_DIRECTION); 8211 } 8212 8213 #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ 8214 (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ 8215 -(__v2df)(__m128d)(X), \ 8216 (__v2df)(__m128d)(Y), \ 8217 (__mmask8)(U), (int)(R)) 8218 8219 #define _mm512_permutex_pd(X, C) \ 8220 (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)) 8221 8222 #define _mm512_mask_permutex_pd(W, U, X, C) \ 8223 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8224 (__v8df)_mm512_permutex_pd((X), (C)), \ 8225 (__v8df)(__m512d)(W)) 8226 8227 #define _mm512_maskz_permutex_pd(U, X, C) \ 8228 (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 8229 (__v8df)_mm512_permutex_pd((X), (C)), \ 8230 (__v8df)_mm512_setzero_pd()) 8231 8232 #define _mm512_permutex_epi64(X, C) \ 8233 (__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)) 8234 8235 #define _mm512_mask_permutex_epi64(W, U, X, C) \ 8236 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8237 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8238 (__v8di)(__m512i)(W)) 8239 8240 #define _mm512_maskz_permutex_epi64(U, X, C) \ 8241 (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 8242 (__v8di)_mm512_permutex_epi64((X), (C)), \ 8243 (__v8di)_mm512_setzero_si512()) 8244 8245 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8246 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) 8247 { 8248 return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); 8249 } 8250 8251 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8252 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) 8253 { 8254 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8255 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8256 (__v8df)__W); 8257 } 8258 8259 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8260 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) 8261 { 8262 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8263 (__v8df)_mm512_permutexvar_pd(__X, __Y), 8264 (__v8df)_mm512_setzero_pd()); 8265 } 8266 8267 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8268 _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) 8269 { 8270 return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); 8271 } 8272 8273 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8274 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) 8275 { 8276 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8277 (__v8di)_mm512_permutexvar_epi64(__X, __Y), 8278 (__v8di)_mm512_setzero_si512()); 8279 } 8280 8281 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8282 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, 8283 __m512i __Y) 8284 { 8285 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 8286 (__v8di)_mm512_permutexvar_epi64(__X, __Y), 8287 (__v8di)__W); 8288 } 8289 8290 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8291 _mm512_permutexvar_ps (__m512i __X, __m512 __Y) 8292 { 8293 return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); 8294 } 8295 8296 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8297 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) 8298 { 8299 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8300 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8301 (__v16sf)__W); 8302 } 8303 8304 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8305 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) 8306 { 8307 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8308 (__v16sf)_mm512_permutexvar_ps(__X, __Y), 8309 (__v16sf)_mm512_setzero_ps()); 8310 } 8311 8312 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8313 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) 8314 { 8315 return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); 8316 } 8317 8318 #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 8319 8320 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8321 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) 8322 { 8323 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8324 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8325 (__v16si)_mm512_setzero_si512()); 8326 } 8327 8328 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8329 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, 8330 __m512i __Y) 8331 { 8332 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 8333 (__v16si)_mm512_permutexvar_epi32(__X, __Y), 8334 (__v16si)__W); 8335 } 8336 8337 #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 8338 8339 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8340 _mm512_kand (__mmask16 __A, __mmask16 __B) 8341 { 8342 return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); 8343 } 8344 8345 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8346 _mm512_kandn (__mmask16 __A, __mmask16 __B) 8347 { 8348 return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); 8349 } 8350 8351 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8352 _mm512_kor (__mmask16 __A, __mmask16 __B) 8353 { 8354 return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); 8355 } 8356 8357 static __inline__ int __DEFAULT_FN_ATTRS 8358 _mm512_kortestc (__mmask16 __A, __mmask16 __B) 8359 { 8360 return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); 8361 } 8362 8363 static __inline__ int __DEFAULT_FN_ATTRS 8364 _mm512_kortestz (__mmask16 __A, __mmask16 __B) 8365 { 8366 return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); 8367 } 8368 8369 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8370 _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) 8371 { 8372 return (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8373 } 8374 8375 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8376 _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) 8377 { 8378 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8379 } 8380 8381 static __inline__ unsigned char __DEFAULT_FN_ATTRS 8382 _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { 8383 *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); 8384 return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); 8385 } 8386 8387 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8388 _mm512_kunpackb (__mmask16 __A, __mmask16 __B) 8389 { 8390 return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); 8391 } 8392 8393 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8394 _mm512_kxnor (__mmask16 __A, __mmask16 __B) 8395 { 8396 return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); 8397 } 8398 8399 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8400 _mm512_kxor (__mmask16 __A, __mmask16 __B) 8401 { 8402 return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); 8403 } 8404 8405 #define _kand_mask16 _mm512_kand 8406 #define _kandn_mask16 _mm512_kandn 8407 #define _knot_mask16 _mm512_knot 8408 #define _kor_mask16 _mm512_kor 8409 #define _kxnor_mask16 _mm512_kxnor 8410 #define _kxor_mask16 _mm512_kxor 8411 8412 #define _kshiftli_mask16(A, I) \ 8413 (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)) 8414 8415 #define _kshiftri_mask16(A, I) \ 8416 (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)) 8417 8418 static __inline__ unsigned int __DEFAULT_FN_ATTRS 8419 _cvtmask16_u32(__mmask16 __A) { 8420 return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); 8421 } 8422 8423 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8424 _cvtu32_mask16(unsigned int __A) { 8425 return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); 8426 } 8427 8428 static __inline__ __mmask16 __DEFAULT_FN_ATTRS 8429 _load_mask16(__mmask16 *__A) { 8430 return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); 8431 } 8432 8433 static __inline__ void __DEFAULT_FN_ATTRS 8434 _store_mask16(__mmask16 *__A, __mmask16 __B) { 8435 *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); 8436 } 8437 8438 static __inline__ void __DEFAULT_FN_ATTRS512 8439 _mm512_stream_si512 (void * __P, __m512i __A) 8440 { 8441 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8442 __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); 8443 } 8444 8445 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8446 _mm512_stream_load_si512 (void const *__P) 8447 { 8448 typedef __v8di __v8di_aligned __attribute__((aligned(64))); 8449 return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); 8450 } 8451 8452 static __inline__ void __DEFAULT_FN_ATTRS512 8453 _mm512_stream_pd (void *__P, __m512d __A) 8454 { 8455 typedef __v8df __v8df_aligned __attribute__((aligned(64))); 8456 __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); 8457 } 8458 8459 static __inline__ void __DEFAULT_FN_ATTRS512 8460 _mm512_stream_ps (void *__P, __m512 __A) 8461 { 8462 typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); 8463 __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); 8464 } 8465 8466 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8467 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) 8468 { 8469 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8470 (__v8df) __W, 8471 (__mmask8) __U); 8472 } 8473 8474 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8475 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) 8476 { 8477 return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, 8478 (__v8df) 8479 _mm512_setzero_pd (), 8480 (__mmask8) __U); 8481 } 8482 8483 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8484 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8485 { 8486 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8487 (__v8di) __W, 8488 (__mmask8) __U); 8489 } 8490 8491 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8492 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) 8493 { 8494 return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, 8495 (__v8di) 8496 _mm512_setzero_si512 (), 8497 (__mmask8) __U); 8498 } 8499 8500 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8501 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) 8502 { 8503 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8504 (__v16sf) __W, 8505 (__mmask16) __U); 8506 } 8507 8508 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8509 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) 8510 { 8511 return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, 8512 (__v16sf) 8513 _mm512_setzero_ps (), 8514 (__mmask16) __U); 8515 } 8516 8517 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8518 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 8519 { 8520 return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, 8521 (__v16si) __W, 8522 (__mmask16) __U); 8523 } 8524 8525 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8526 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) 8527 { 8528 return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, 8529 (__v16si) 8530 _mm512_setzero_si512 (), 8531 (__mmask16) __U); 8532 } 8533 8534 #define _mm_cmp_round_ss_mask(X, Y, P, R) \ 8535 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8536 (__v4sf)(__m128)(Y), (int)(P), \ 8537 (__mmask8)-1, (int)(R)) 8538 8539 #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ 8540 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8541 (__v4sf)(__m128)(Y), (int)(P), \ 8542 (__mmask8)(M), (int)(R)) 8543 8544 #define _mm_cmp_ss_mask(X, Y, P) \ 8545 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8546 (__v4sf)(__m128)(Y), (int)(P), \ 8547 (__mmask8)-1, \ 8548 _MM_FROUND_CUR_DIRECTION) 8549 8550 #define _mm_mask_cmp_ss_mask(M, X, Y, P) \ 8551 (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ 8552 (__v4sf)(__m128)(Y), (int)(P), \ 8553 (__mmask8)(M), \ 8554 _MM_FROUND_CUR_DIRECTION) 8555 8556 #define _mm_cmp_round_sd_mask(X, Y, P, R) \ 8557 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8558 (__v2df)(__m128d)(Y), (int)(P), \ 8559 (__mmask8)-1, (int)(R)) 8560 8561 #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ 8562 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8563 (__v2df)(__m128d)(Y), (int)(P), \ 8564 (__mmask8)(M), (int)(R)) 8565 8566 #define _mm_cmp_sd_mask(X, Y, P) \ 8567 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8568 (__v2df)(__m128d)(Y), (int)(P), \ 8569 (__mmask8)-1, \ 8570 _MM_FROUND_CUR_DIRECTION) 8571 8572 #define _mm_mask_cmp_sd_mask(M, X, Y, P) \ 8573 (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ 8574 (__v2df)(__m128d)(Y), (int)(P), \ 8575 (__mmask8)(M), \ 8576 _MM_FROUND_CUR_DIRECTION) 8577 8578 /* Bit Test */ 8579 8580 static __inline __mmask16 __DEFAULT_FN_ATTRS512 8581 _mm512_test_epi32_mask (__m512i __A, __m512i __B) 8582 { 8583 return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), 8584 _mm512_setzero_si512()); 8585 } 8586 8587 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8588 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8589 { 8590 return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8591 _mm512_setzero_si512()); 8592 } 8593 8594 static __inline __mmask8 __DEFAULT_FN_ATTRS512 8595 _mm512_test_epi64_mask (__m512i __A, __m512i __B) 8596 { 8597 return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), 8598 _mm512_setzero_si512()); 8599 } 8600 8601 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8602 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) 8603 { 8604 return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8605 _mm512_setzero_si512()); 8606 } 8607 8608 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8609 _mm512_testn_epi32_mask (__m512i __A, __m512i __B) 8610 { 8611 return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), 8612 _mm512_setzero_si512()); 8613 } 8614 8615 static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 8616 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) 8617 { 8618 return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), 8619 _mm512_setzero_si512()); 8620 } 8621 8622 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8623 _mm512_testn_epi64_mask (__m512i __A, __m512i __B) 8624 { 8625 return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), 8626 _mm512_setzero_si512()); 8627 } 8628 8629 static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 8630 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) 8631 { 8632 return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), 8633 _mm512_setzero_si512()); 8634 } 8635 8636 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8637 _mm512_movehdup_ps (__m512 __A) 8638 { 8639 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8640 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); 8641 } 8642 8643 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8644 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8645 { 8646 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8647 (__v16sf)_mm512_movehdup_ps(__A), 8648 (__v16sf)__W); 8649 } 8650 8651 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8652 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) 8653 { 8654 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8655 (__v16sf)_mm512_movehdup_ps(__A), 8656 (__v16sf)_mm512_setzero_ps()); 8657 } 8658 8659 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8660 _mm512_moveldup_ps (__m512 __A) 8661 { 8662 return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 8663 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); 8664 } 8665 8666 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8667 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) 8668 { 8669 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8670 (__v16sf)_mm512_moveldup_ps(__A), 8671 (__v16sf)__W); 8672 } 8673 8674 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8675 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) 8676 { 8677 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 8678 (__v16sf)_mm512_moveldup_ps(__A), 8679 (__v16sf)_mm512_setzero_ps()); 8680 } 8681 8682 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8683 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 8684 { 8685 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); 8686 } 8687 8688 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8689 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) 8690 { 8691 return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), 8692 _mm_setzero_ps()); 8693 } 8694 8695 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8696 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 8697 { 8698 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); 8699 } 8700 8701 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8702 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) 8703 { 8704 return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), 8705 _mm_setzero_pd()); 8706 } 8707 8708 static __inline__ void __DEFAULT_FN_ATTRS128 8709 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) 8710 { 8711 __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); 8712 } 8713 8714 static __inline__ void __DEFAULT_FN_ATTRS128 8715 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) 8716 { 8717 __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); 8718 } 8719 8720 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8721 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) 8722 { 8723 __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, 8724 (__v4sf)_mm_setzero_ps(), 8725 0, 4, 4, 4); 8726 8727 return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1); 8728 } 8729 8730 static __inline__ __m128 __DEFAULT_FN_ATTRS128 8731 _mm_maskz_load_ss (__mmask8 __U, const float* __A) 8732 { 8733 return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A, 8734 (__v4sf) _mm_setzero_ps(), 8735 __U & 1); 8736 } 8737 8738 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8739 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) 8740 { 8741 __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, 8742 (__v2df)_mm_setzero_pd(), 8743 0, 2); 8744 8745 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1); 8746 } 8747 8748 static __inline__ __m128d __DEFAULT_FN_ATTRS128 8749 _mm_maskz_load_sd (__mmask8 __U, const double* __A) 8750 { 8751 return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, 8752 (__v2df) _mm_setzero_pd(), 8753 __U & 1); 8754 } 8755 8756 #define _mm512_shuffle_epi32(A, I) \ 8757 (__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)) 8758 8759 #define _mm512_mask_shuffle_epi32(W, U, A, I) \ 8760 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8761 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8762 (__v16si)(__m512i)(W)) 8763 8764 #define _mm512_maskz_shuffle_epi32(U, A, I) \ 8765 (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 8766 (__v16si)_mm512_shuffle_epi32((A), (I)), \ 8767 (__v16si)_mm512_setzero_si512()) 8768 8769 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8770 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) 8771 { 8772 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8773 (__v8df) __W, 8774 (__mmask8) __U); 8775 } 8776 8777 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8778 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) 8779 { 8780 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, 8781 (__v8df) _mm512_setzero_pd (), 8782 (__mmask8) __U); 8783 } 8784 8785 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8786 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 8787 { 8788 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8789 (__v8di) __W, 8790 (__mmask8) __U); 8791 } 8792 8793 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8794 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) 8795 { 8796 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, 8797 (__v8di) _mm512_setzero_si512 (), 8798 (__mmask8) __U); 8799 } 8800 8801 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8802 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) 8803 { 8804 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8805 (__v8df) __W, 8806 (__mmask8) __U); 8807 } 8808 8809 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8810 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) 8811 { 8812 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, 8813 (__v8df) _mm512_setzero_pd(), 8814 (__mmask8) __U); 8815 } 8816 8817 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8818 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) 8819 { 8820 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, 8821 (__v8di) __W, 8822 (__mmask8) __U); 8823 } 8824 8825 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8826 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) 8827 { 8828 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, 8829 (__v8di) _mm512_setzero_si512(), 8830 (__mmask8) __U); 8831 } 8832 8833 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8834 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) 8835 { 8836 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, 8837 (__v16sf) __W, 8838 (__mmask16) __U); 8839 } 8840 8841 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8842 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) 8843 { 8844 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, 8845 (__v16sf) _mm512_setzero_ps(), 8846 (__mmask16) __U); 8847 } 8848 8849 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8850 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) 8851 { 8852 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, 8853 (__v16si) __W, 8854 (__mmask16) __U); 8855 } 8856 8857 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8858 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) 8859 { 8860 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, 8861 (__v16si) _mm512_setzero_si512(), 8862 (__mmask16) __U); 8863 } 8864 8865 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8866 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) 8867 { 8868 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, 8869 (__v16sf) __W, 8870 (__mmask16) __U); 8871 } 8872 8873 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8874 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) 8875 { 8876 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, 8877 (__v16sf) _mm512_setzero_ps(), 8878 (__mmask16) __U); 8879 } 8880 8881 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8882 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 8883 { 8884 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, 8885 (__v16si) __W, 8886 (__mmask16) __U); 8887 } 8888 8889 static __inline__ __m512i __DEFAULT_FN_ATTRS512 8890 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) 8891 { 8892 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, 8893 (__v16si) _mm512_setzero_si512(), 8894 (__mmask16) __U); 8895 } 8896 8897 #define _mm512_cvt_roundps_pd(A, R) \ 8898 (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8899 (__v8df)_mm512_undefined_pd(), \ 8900 (__mmask8)-1, (int)(R)) 8901 8902 #define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ 8903 (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8904 (__v8df)(__m512d)(W), \ 8905 (__mmask8)(U), (int)(R)) 8906 8907 #define _mm512_maskz_cvt_roundps_pd(U, A, R) \ 8908 (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ 8909 (__v8df)_mm512_setzero_pd(), \ 8910 (__mmask8)(U), (int)(R)) 8911 8912 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8913 _mm512_cvtps_pd (__m256 __A) 8914 { 8915 return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); 8916 } 8917 8918 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8919 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) 8920 { 8921 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8922 (__v8df)_mm512_cvtps_pd(__A), 8923 (__v8df)__W); 8924 } 8925 8926 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8927 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) 8928 { 8929 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 8930 (__v8df)_mm512_cvtps_pd(__A), 8931 (__v8df)_mm512_setzero_pd()); 8932 } 8933 8934 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8935 _mm512_cvtpslo_pd (__m512 __A) 8936 { 8937 return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); 8938 } 8939 8940 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8941 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) 8942 { 8943 return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); 8944 } 8945 8946 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8947 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) 8948 { 8949 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 8950 (__v8df) __A, 8951 (__v8df) __W); 8952 } 8953 8954 static __inline__ __m512d __DEFAULT_FN_ATTRS512 8955 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) 8956 { 8957 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 8958 (__v8df) __A, 8959 (__v8df) _mm512_setzero_pd ()); 8960 } 8961 8962 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8963 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) 8964 { 8965 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 8966 (__v16sf) __A, 8967 (__v16sf) __W); 8968 } 8969 8970 static __inline__ __m512 __DEFAULT_FN_ATTRS512 8971 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) 8972 { 8973 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 8974 (__v16sf) __A, 8975 (__v16sf) _mm512_setzero_ps ()); 8976 } 8977 8978 static __inline__ void __DEFAULT_FN_ATTRS512 8979 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) 8980 { 8981 __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, 8982 (__mmask8) __U); 8983 } 8984 8985 static __inline__ void __DEFAULT_FN_ATTRS512 8986 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) 8987 { 8988 __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, 8989 (__mmask8) __U); 8990 } 8991 8992 static __inline__ void __DEFAULT_FN_ATTRS512 8993 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) 8994 { 8995 __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, 8996 (__mmask16) __U); 8997 } 8998 8999 static __inline__ void __DEFAULT_FN_ATTRS512 9000 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) 9001 { 9002 __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, 9003 (__mmask16) __U); 9004 } 9005 9006 #define _mm_cvt_roundsd_ss(A, B, R) \ 9007 (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9008 (__v2df)(__m128d)(B), \ 9009 (__v4sf)_mm_undefined_ps(), \ 9010 (__mmask8)-1, (int)(R)) 9011 9012 #define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ 9013 (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9014 (__v2df)(__m128d)(B), \ 9015 (__v4sf)(__m128)(W), \ 9016 (__mmask8)(U), (int)(R)) 9017 9018 #define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ 9019 (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ 9020 (__v2df)(__m128d)(B), \ 9021 (__v4sf)_mm_setzero_ps(), \ 9022 (__mmask8)(U), (int)(R)) 9023 9024 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9025 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) 9026 { 9027 return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, 9028 (__v2df)__B, 9029 (__v4sf)__W, 9030 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9031 } 9032 9033 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9034 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) 9035 { 9036 return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, 9037 (__v2df)__B, 9038 (__v4sf)_mm_setzero_ps(), 9039 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9040 } 9041 9042 #define _mm_cvtss_i32 _mm_cvtss_si32 9043 #define _mm_cvtsd_i32 _mm_cvtsd_si32 9044 #define _mm_cvti32_sd _mm_cvtsi32_sd 9045 #define _mm_cvti32_ss _mm_cvtsi32_ss 9046 #ifdef __x86_64__ 9047 #define _mm_cvtss_i64 _mm_cvtss_si64 9048 #define _mm_cvtsd_i64 _mm_cvtsd_si64 9049 #define _mm_cvti64_sd _mm_cvtsi64_sd 9050 #define _mm_cvti64_ss _mm_cvtsi64_ss 9051 #endif 9052 9053 #ifdef __x86_64__ 9054 #define _mm_cvt_roundi64_sd(A, B, R) \ 9055 (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ 9056 (int)(R)) 9057 9058 #define _mm_cvt_roundsi64_sd(A, B, R) \ 9059 (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ 9060 (int)(R)) 9061 #endif 9062 9063 #define _mm_cvt_roundsi32_ss(A, B, R) \ 9064 (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) 9065 9066 #define _mm_cvt_roundi32_ss(A, B, R) \ 9067 (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) 9068 9069 #ifdef __x86_64__ 9070 #define _mm_cvt_roundsi64_ss(A, B, R) \ 9071 (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ 9072 (int)(R)) 9073 9074 #define _mm_cvt_roundi64_ss(A, B, R) \ 9075 (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ 9076 (int)(R)) 9077 #endif 9078 9079 #define _mm_cvt_roundss_sd(A, B, R) \ 9080 (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9081 (__v4sf)(__m128)(B), \ 9082 (__v2df)_mm_undefined_pd(), \ 9083 (__mmask8)-1, (int)(R)) 9084 9085 #define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ 9086 (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9087 (__v4sf)(__m128)(B), \ 9088 (__v2df)(__m128d)(W), \ 9089 (__mmask8)(U), (int)(R)) 9090 9091 #define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ 9092 (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ 9093 (__v4sf)(__m128)(B), \ 9094 (__v2df)_mm_setzero_pd(), \ 9095 (__mmask8)(U), (int)(R)) 9096 9097 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9098 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) 9099 { 9100 return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, 9101 (__v4sf)__B, 9102 (__v2df)__W, 9103 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9104 } 9105 9106 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9107 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) 9108 { 9109 return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, 9110 (__v4sf)__B, 9111 (__v2df)_mm_setzero_pd(), 9112 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 9113 } 9114 9115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9116 _mm_cvtu32_sd (__m128d __A, unsigned __B) 9117 { 9118 __A[0] = __B; 9119 return __A; 9120 } 9121 9122 #ifdef __x86_64__ 9123 #define _mm_cvt_roundu64_sd(A, B, R) \ 9124 (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ 9125 (unsigned long long)(B), (int)(R)) 9126 9127 static __inline__ __m128d __DEFAULT_FN_ATTRS128 9128 _mm_cvtu64_sd (__m128d __A, unsigned long long __B) 9129 { 9130 __A[0] = __B; 9131 return __A; 9132 } 9133 #endif 9134 9135 #define _mm_cvt_roundu32_ss(A, B, R) \ 9136 (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ 9137 (int)(R)) 9138 9139 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9140 _mm_cvtu32_ss (__m128 __A, unsigned __B) 9141 { 9142 __A[0] = __B; 9143 return __A; 9144 } 9145 9146 #ifdef __x86_64__ 9147 #define _mm_cvt_roundu64_ss(A, B, R) \ 9148 (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ 9149 (unsigned long long)(B), (int)(R)) 9150 9151 static __inline__ __m128 __DEFAULT_FN_ATTRS128 9152 _mm_cvtu64_ss (__m128 __A, unsigned long long __B) 9153 { 9154 __A[0] = __B; 9155 return __A; 9156 } 9157 #endif 9158 9159 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9160 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) 9161 { 9162 return (__m512i) __builtin_ia32_selectd_512(__M, 9163 (__v16si) _mm512_set1_epi32(__A), 9164 (__v16si) __O); 9165 } 9166 9167 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9168 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) 9169 { 9170 return (__m512i) __builtin_ia32_selectq_512(__M, 9171 (__v8di) _mm512_set1_epi64(__A), 9172 (__v8di) __O); 9173 } 9174 9175 static __inline __m512i __DEFAULT_FN_ATTRS512 9176 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, 9177 char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, 9178 char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, 9179 char __e46, char __e45, char __e44, char __e43, char __e42, char __e41, 9180 char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, 9181 char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, 9182 char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, 9183 char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, 9184 char __e16, char __e15, char __e14, char __e13, char __e12, char __e11, 9185 char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, 9186 char __e4, char __e3, char __e2, char __e1, char __e0) { 9187 9188 return __extension__ (__m512i)(__v64qi) 9189 {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, 9190 __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, 9191 __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, 9192 __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31, 9193 __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39, 9194 __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47, 9195 __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55, 9196 __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; 9197 } 9198 9199 static __inline __m512i __DEFAULT_FN_ATTRS512 9200 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, 9201 short __e27, short __e26, short __e25, short __e24, short __e23, 9202 short __e22, short __e21, short __e20, short __e19, short __e18, 9203 short __e17, short __e16, short __e15, short __e14, short __e13, 9204 short __e12, short __e11, short __e10, short __e9, short __e8, 9205 short __e7, short __e6, short __e5, short __e4, short __e3, 9206 short __e2, short __e1, short __e0) { 9207 return __extension__ (__m512i)(__v32hi) 9208 {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, 9209 __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, 9210 __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, 9211 __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; 9212 } 9213 9214 static __inline __m512i __DEFAULT_FN_ATTRS512 9215 _mm512_set_epi32 (int __A, int __B, int __C, int __D, 9216 int __E, int __F, int __G, int __H, 9217 int __I, int __J, int __K, int __L, 9218 int __M, int __N, int __O, int __P) 9219 { 9220 return __extension__ (__m512i)(__v16si) 9221 { __P, __O, __N, __M, __L, __K, __J, __I, 9222 __H, __G, __F, __E, __D, __C, __B, __A }; 9223 } 9224 9225 #define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ 9226 e8,e9,e10,e11,e12,e13,e14,e15) \ 9227 _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ 9228 (e5),(e4),(e3),(e2),(e1),(e0)) 9229 9230 static __inline__ __m512i __DEFAULT_FN_ATTRS512 9231 _mm512_set_epi64 (long long __A, long long __B, long long __C, 9232 long long __D, long long __E, long long __F, 9233 long long __G, long long __H) 9234 { 9235 return __extension__ (__m512i) (__v8di) 9236 { __H, __G, __F, __E, __D, __C, __B, __A }; 9237 } 9238 9239 #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ 9240 _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) 9241 9242 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9243 _mm512_set_pd (double __A, double __B, double __C, double __D, 9244 double __E, double __F, double __G, double __H) 9245 { 9246 return __extension__ (__m512d) 9247 { __H, __G, __F, __E, __D, __C, __B, __A }; 9248 } 9249 9250 #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ 9251 _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) 9252 9253 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9254 _mm512_set_ps (float __A, float __B, float __C, float __D, 9255 float __E, float __F, float __G, float __H, 9256 float __I, float __J, float __K, float __L, 9257 float __M, float __N, float __O, float __P) 9258 { 9259 return __extension__ (__m512) 9260 { __P, __O, __N, __M, __L, __K, __J, __I, 9261 __H, __G, __F, __E, __D, __C, __B, __A }; 9262 } 9263 9264 #define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ 9265 _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ 9266 (e4),(e3),(e2),(e1),(e0)) 9267 9268 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9269 _mm512_abs_ps(__m512 __A) 9270 { 9271 return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; 9272 } 9273 9274 static __inline__ __m512 __DEFAULT_FN_ATTRS512 9275 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) 9276 { 9277 return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; 9278 } 9279 9280 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9281 _mm512_abs_pd(__m512d __A) 9282 { 9283 return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; 9284 } 9285 9286 static __inline__ __m512d __DEFAULT_FN_ATTRS512 9287 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) 9288 { 9289 return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); 9290 } 9291 9292 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as 9293 * outputs. This class of vector operation forms the basis of many scientific 9294 * computations. In vector-reduction arithmetic, the evaluation off is 9295 * independent of the order of the input elements of V. 9296 9297 * Used bisection method. At each step, we partition the vector with previous 9298 * step in half, and the operation is performed on its two halves. 9299 * This takes log2(n) steps where n is the number of elements in the vector. 9300 */ 9301 9302 #define _mm512_mask_reduce_operator(op) \ 9303 __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \ 9304 __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \ 9305 __m256i __t3 = (__m256i)(__t1 op __t2); \ 9306 __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \ 9307 __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \ 9308 __v2du __t6 = __t4 op __t5; \ 9309 __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ 9310 __v2du __t8 = __t6 op __t7; \ 9311 return __t8[0] 9312 9313 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { 9314 _mm512_mask_reduce_operator(+); 9315 } 9316 9317 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { 9318 _mm512_mask_reduce_operator(*); 9319 } 9320 9321 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { 9322 _mm512_mask_reduce_operator(&); 9323 } 9324 9325 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { 9326 _mm512_mask_reduce_operator(|); 9327 } 9328 9329 static __inline__ long long __DEFAULT_FN_ATTRS512 9330 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { 9331 __W = _mm512_maskz_mov_epi64(__M, __W); 9332 _mm512_mask_reduce_operator(+); 9333 } 9334 9335 static __inline__ long long __DEFAULT_FN_ATTRS512 9336 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { 9337 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); 9338 _mm512_mask_reduce_operator(*); 9339 } 9340 9341 static __inline__ long long __DEFAULT_FN_ATTRS512 9342 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { 9343 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); 9344 _mm512_mask_reduce_operator(&); 9345 } 9346 9347 static __inline__ long long __DEFAULT_FN_ATTRS512 9348 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { 9349 __W = _mm512_maskz_mov_epi64(__M, __W); 9350 _mm512_mask_reduce_operator(|); 9351 } 9352 #undef _mm512_mask_reduce_operator 9353 9354 #define _mm512_mask_reduce_operator(op) \ 9355 __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ 9356 __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ 9357 __m256d __t3 = __t1 op __t2; \ 9358 __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ 9359 __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ 9360 __m128d __t6 = __t4 op __t5; \ 9361 __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ 9362 __m128d __t8 = __t6 op __t7; \ 9363 return __t8[0] 9364 9365 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { 9366 _mm512_mask_reduce_operator(+); 9367 } 9368 9369 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { 9370 _mm512_mask_reduce_operator(*); 9371 } 9372 9373 static __inline__ double __DEFAULT_FN_ATTRS512 9374 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { 9375 __W = _mm512_maskz_mov_pd(__M, __W); 9376 _mm512_mask_reduce_operator(+); 9377 } 9378 9379 static __inline__ double __DEFAULT_FN_ATTRS512 9380 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { 9381 __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); 9382 _mm512_mask_reduce_operator(*); 9383 } 9384 #undef _mm512_mask_reduce_operator 9385 9386 #define _mm512_mask_reduce_operator(op) \ 9387 __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \ 9388 __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \ 9389 __m256i __t3 = (__m256i)(__t1 op __t2); \ 9390 __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \ 9391 __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \ 9392 __v4su __t6 = __t4 op __t5; \ 9393 __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ 9394 __v4su __t8 = __t6 op __t7; \ 9395 __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ 9396 __v4su __t10 = __t8 op __t9; \ 9397 return __t10[0] 9398 9399 static __inline__ int __DEFAULT_FN_ATTRS512 9400 _mm512_reduce_add_epi32(__m512i __W) { 9401 _mm512_mask_reduce_operator(+); 9402 } 9403 9404 static __inline__ int __DEFAULT_FN_ATTRS512 9405 _mm512_reduce_mul_epi32(__m512i __W) { 9406 _mm512_mask_reduce_operator(*); 9407 } 9408 9409 static __inline__ int __DEFAULT_FN_ATTRS512 9410 _mm512_reduce_and_epi32(__m512i __W) { 9411 _mm512_mask_reduce_operator(&); 9412 } 9413 9414 static __inline__ int __DEFAULT_FN_ATTRS512 9415 _mm512_reduce_or_epi32(__m512i __W) { 9416 _mm512_mask_reduce_operator(|); 9417 } 9418 9419 static __inline__ int __DEFAULT_FN_ATTRS512 9420 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { 9421 __W = _mm512_maskz_mov_epi32(__M, __W); 9422 _mm512_mask_reduce_operator(+); 9423 } 9424 9425 static __inline__ int __DEFAULT_FN_ATTRS512 9426 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { 9427 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); 9428 _mm512_mask_reduce_operator(*); 9429 } 9430 9431 static __inline__ int __DEFAULT_FN_ATTRS512 9432 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { 9433 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); 9434 _mm512_mask_reduce_operator(&); 9435 } 9436 9437 static __inline__ int __DEFAULT_FN_ATTRS512 9438 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { 9439 __W = _mm512_maskz_mov_epi32(__M, __W); 9440 _mm512_mask_reduce_operator(|); 9441 } 9442 #undef _mm512_mask_reduce_operator 9443 9444 #define _mm512_mask_reduce_operator(op) \ 9445 __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ 9446 __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ 9447 __m256 __t3 = __t1 op __t2; \ 9448 __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ 9449 __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ 9450 __m128 __t6 = __t4 op __t5; \ 9451 __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ 9452 __m128 __t8 = __t6 op __t7; \ 9453 __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ 9454 __m128 __t10 = __t8 op __t9; \ 9455 return __t10[0] 9456 9457 static __inline__ float __DEFAULT_FN_ATTRS512 9458 _mm512_reduce_add_ps(__m512 __W) { 9459 _mm512_mask_reduce_operator(+); 9460 } 9461 9462 static __inline__ float __DEFAULT_FN_ATTRS512 9463 _mm512_reduce_mul_ps(__m512 __W) { 9464 _mm512_mask_reduce_operator(*); 9465 } 9466 9467 static __inline__ float __DEFAULT_FN_ATTRS512 9468 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { 9469 __W = _mm512_maskz_mov_ps(__M, __W); 9470 _mm512_mask_reduce_operator(+); 9471 } 9472 9473 static __inline__ float __DEFAULT_FN_ATTRS512 9474 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { 9475 __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); 9476 _mm512_mask_reduce_operator(*); 9477 } 9478 #undef _mm512_mask_reduce_operator 9479 9480 #define _mm512_mask_reduce_operator(op) \ 9481 __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \ 9482 __m512i __t2 = _mm512_##op(__V, __t1); \ 9483 __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \ 9484 __m512i __t4 = _mm512_##op(__t2, __t3); \ 9485 __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ 9486 __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ 9487 return __t6[0] 9488 9489 static __inline__ long long __DEFAULT_FN_ATTRS512 9490 _mm512_reduce_max_epi64(__m512i __V) { 9491 _mm512_mask_reduce_operator(max_epi64); 9492 } 9493 9494 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9495 _mm512_reduce_max_epu64(__m512i __V) { 9496 _mm512_mask_reduce_operator(max_epu64); 9497 } 9498 9499 static __inline__ long long __DEFAULT_FN_ATTRS512 9500 _mm512_reduce_min_epi64(__m512i __V) { 9501 _mm512_mask_reduce_operator(min_epi64); 9502 } 9503 9504 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9505 _mm512_reduce_min_epu64(__m512i __V) { 9506 _mm512_mask_reduce_operator(min_epu64); 9507 } 9508 9509 static __inline__ long long __DEFAULT_FN_ATTRS512 9510 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { 9511 __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); 9512 _mm512_mask_reduce_operator(max_epi64); 9513 } 9514 9515 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9516 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { 9517 __V = _mm512_maskz_mov_epi64(__M, __V); 9518 _mm512_mask_reduce_operator(max_epu64); 9519 } 9520 9521 static __inline__ long long __DEFAULT_FN_ATTRS512 9522 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { 9523 __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); 9524 _mm512_mask_reduce_operator(min_epi64); 9525 } 9526 9527 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 9528 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { 9529 __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); 9530 _mm512_mask_reduce_operator(min_epu64); 9531 } 9532 #undef _mm512_mask_reduce_operator 9533 9534 #define _mm512_mask_reduce_operator(op) \ 9535 __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \ 9536 __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \ 9537 __m256i __t3 = _mm256_##op(__t1, __t2); \ 9538 __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \ 9539 __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \ 9540 __m128i __t6 = _mm_##op(__t4, __t5); \ 9541 __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \ 9542 __m128i __t8 = _mm_##op(__t6, __t7); \ 9543 __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ 9544 __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ 9545 return __t10[0] 9546 9547 static __inline__ int __DEFAULT_FN_ATTRS512 9548 _mm512_reduce_max_epi32(__m512i __V) { 9549 _mm512_mask_reduce_operator(max_epi32); 9550 } 9551 9552 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9553 _mm512_reduce_max_epu32(__m512i __V) { 9554 _mm512_mask_reduce_operator(max_epu32); 9555 } 9556 9557 static __inline__ int __DEFAULT_FN_ATTRS512 9558 _mm512_reduce_min_epi32(__m512i __V) { 9559 _mm512_mask_reduce_operator(min_epi32); 9560 } 9561 9562 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9563 _mm512_reduce_min_epu32(__m512i __V) { 9564 _mm512_mask_reduce_operator(min_epu32); 9565 } 9566 9567 static __inline__ int __DEFAULT_FN_ATTRS512 9568 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { 9569 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); 9570 _mm512_mask_reduce_operator(max_epi32); 9571 } 9572 9573 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9574 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { 9575 __V = _mm512_maskz_mov_epi32(__M, __V); 9576 _mm512_mask_reduce_operator(max_epu32); 9577 } 9578 9579 static __inline__ int __DEFAULT_FN_ATTRS512 9580 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { 9581 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); 9582 _mm512_mask_reduce_operator(min_epi32); 9583 } 9584 9585 static __inline__ unsigned int __DEFAULT_FN_ATTRS512 9586 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { 9587 __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); 9588 _mm512_mask_reduce_operator(min_epu32); 9589 } 9590 #undef _mm512_mask_reduce_operator 9591 9592 #define _mm512_mask_reduce_operator(op) \ 9593 __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \ 9594 __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \ 9595 __m256d __t3 = _mm256_##op(__t1, __t2); \ 9596 __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ 9597 __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ 9598 __m128d __t6 = _mm_##op(__t4, __t5); \ 9599 __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ 9600 __m128d __t8 = _mm_##op(__t6, __t7); \ 9601 return __t8[0] 9602 9603 static __inline__ double __DEFAULT_FN_ATTRS512 9604 _mm512_reduce_max_pd(__m512d __V) { 9605 _mm512_mask_reduce_operator(max_pd); 9606 } 9607 9608 static __inline__ double __DEFAULT_FN_ATTRS512 9609 _mm512_reduce_min_pd(__m512d __V) { 9610 _mm512_mask_reduce_operator(min_pd); 9611 } 9612 9613 static __inline__ double __DEFAULT_FN_ATTRS512 9614 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { 9615 __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); 9616 _mm512_mask_reduce_operator(max_pd); 9617 } 9618 9619 static __inline__ double __DEFAULT_FN_ATTRS512 9620 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { 9621 __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); 9622 _mm512_mask_reduce_operator(min_pd); 9623 } 9624 #undef _mm512_mask_reduce_operator 9625 9626 #define _mm512_mask_reduce_operator(op) \ 9627 __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \ 9628 __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \ 9629 __m256 __t3 = _mm256_##op(__t1, __t2); \ 9630 __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ 9631 __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ 9632 __m128 __t6 = _mm_##op(__t4, __t5); \ 9633 __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ 9634 __m128 __t8 = _mm_##op(__t6, __t7); \ 9635 __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ 9636 __m128 __t10 = _mm_##op(__t8, __t9); \ 9637 return __t10[0] 9638 9639 static __inline__ float __DEFAULT_FN_ATTRS512 9640 _mm512_reduce_max_ps(__m512 __V) { 9641 _mm512_mask_reduce_operator(max_ps); 9642 } 9643 9644 static __inline__ float __DEFAULT_FN_ATTRS512 9645 _mm512_reduce_min_ps(__m512 __V) { 9646 _mm512_mask_reduce_operator(min_ps); 9647 } 9648 9649 static __inline__ float __DEFAULT_FN_ATTRS512 9650 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { 9651 __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); 9652 _mm512_mask_reduce_operator(max_ps); 9653 } 9654 9655 static __inline__ float __DEFAULT_FN_ATTRS512 9656 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { 9657 __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); 9658 _mm512_mask_reduce_operator(min_ps); 9659 } 9660 #undef _mm512_mask_reduce_operator 9661 9662 /// Moves the least significant 32 bits of a vector of [16 x i32] to a 9663 /// 32-bit signed integer value. 9664 /// 9665 /// \headerfile <x86intrin.h> 9666 /// 9667 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 9668 /// 9669 /// \param __A 9670 /// A vector of [16 x i32]. The least significant 32 bits are moved to the 9671 /// destination. 9672 /// \returns A 32-bit signed integer containing the moved value. 9673 static __inline__ int __DEFAULT_FN_ATTRS512 9674 _mm512_cvtsi512_si32(__m512i __A) { 9675 __v16si __b = (__v16si)__A; 9676 return __b[0]; 9677 } 9678 9679 #undef __DEFAULT_FN_ATTRS512 9680 #undef __DEFAULT_FN_ATTRS128 9681 #undef __DEFAULT_FN_ATTRS 9682 9683 #endif /* __AVX512FINTRIN_H */ 9684