/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H

typedef char __v64qi __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

/* Unsigned types */
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

typedef unsigned char __mmask8;
typedef unsigned short __mmask16;

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT   0x00
#define _MM_FROUND_TO_NEG_INF       0x01
#define _MM_FROUND_TO_POS_INF       0x02
#define _MM_FROUND_TO_ZERO          0x03
#define _MM_FROUND_CUR_DIRECTION    0x04

/* Constants for integer comparison predicates */
typedef enum {
    _MM_CMPINT_EQ,      /* Equal */
    _MM_CMPINT_LT,      /* Less than */
    _MM_CMPINT_LE,      /* Less than or Equal */
    _MM_CMPINT_UNUSED,
    _MM_CMPINT_NE,      /* Not Equal */
    _MM_CMPINT_NLT,     /* Not Less than */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
    _MM_CMPINT_NLE      /* Not Less than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
} _MM_CMPINT_ENUM;
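
/* Illustrative note (not from the original header): these predicates form the
 * immediate argument of the integer compare intrinsics defined later in this
 * header, e.g.
 *   __mmask16 __k = _mm512_cmp_epi32_mask(__a, __b, _MM_CMPINT_LE);
 * sets bit i of __k when element i of __a is <= element i of __b. */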

typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
  _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;

typedef enum
{
  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;

typedef enum
{
  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero,   /* sign = 0             */
  _MM_MANT_SIGN_nan     /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
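
/* Illustrative note (not from the original header): _MM_PERM_ENUM packs four
 * 2-bit element selectors (A=0, B=1, C=2, D=3), named from the high bits down,
 * so _MM_PERM_DCBA == 0xE4 selects 3,2,1,0 and is the identity immediate for
 * _mm512_shuffle_epi32, defined later in this header. The mantissa enums are
 * the immediates of the getmant intrinsics, e.g.
 *   _mm512_getmant_pd(__a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 * normalizes each mantissa into [1, 2) while keeping the source sign. */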

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

/* Create vectors with repeated elements */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_setzero_si512(void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

#define _mm512_setzero_epi32 _mm512_setzero_si512

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_undefined_pd(void)
{
  return (__m512d)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined_ps(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_undefined_epi32(void)
{
  return (__m512i)__builtin_ia32_undef512();
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastd_epi32 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) _mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastq_epi64 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) _mm512_setzero_si512());
}
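
/* Illustrative note (not from the original header): the mask_/maskz_ variants
 * above follow the convention used throughout this file: merge-masking keeps
 * the passthrough lane where the mask bit is 0, zero-masking writes 0, e.g.
 *   __m512i __r = _mm512_mask_broadcastd_epi32(__old, 0x00FF, __x);
 * broadcasts __x into the low 8 dwords and keeps __old in the high 8. */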

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

#define _mm512_setzero _mm512_setzero_ps

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_setzero_pd(void)
{
  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set1_ps(float __w)
{
  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                 __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set1_pd(double __w)
{
  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi8(char __w)
{
  return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi16(short __w)
{
  return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi32(int __s)
{
  return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si)_mm512_set1_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi64(long long __d)
{
  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_set1_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcastss_ps(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                         0, 0, 0, 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
{
  return __extension__ (__m512i)(__v16si)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi64 (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m512i) (__v8di)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set4_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m512d)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set4_ps (float __A, float __B, float __C, float __D)
{
  return __extension__ (__m512)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

#define _mm512_setr4_epi32(e0,e1,e2,e3) \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3) \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3) \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3) \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
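
/* Illustrative note (not from the original header): _mm512_set4_* takes the
 * highest-index element first and _mm512_setr4_* takes ascending (memory)
 * order, so
 *   _mm512_setr4_epi32(0, 1, 2, 3);
 * yields the repeating dword pattern 0,1,2,3 starting at element 0. */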

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcastsd_pd(__m128d __A)
{
  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

/* Cast between vector types */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7,
                                 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline __m128d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m256d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd256 (__m512d __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
}

static __inline __m128 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

static __inline __m256 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps256 (__m512 __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castpd_ps (__m512d __A)
{
  return (__m512) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castpd_si512 (__m512d __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd128_pd512 (__m128d __A)
{
  __m256d __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castps_pd (__m512 __A)
{
  return (__m512d) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castps_si512 (__m512 __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castps128_ps512 (__m128 __A)
{
  __m256 __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi128_si512 (__m128i __A)
{
  __m256i __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi256_si512 (__m256i __A)
{
  return __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A),
                                 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castsi512_ps (__m512i __A)
{
  return (__m512) (__A);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castsi512_pd (__m512i __A)
{
  return (__m512d) (__A);
}

static __inline __m128i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si128 (__m512i __A)
{
  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
}

static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si256 (__m512i __A)
{
  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}
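
/* Illustrative note (not from the original header): the casts above are free
 * reinterpretations and emit no instructions. The 128/256-to-512 casts leave
 * the upper bits unspecified (hence __builtin_nondeterministic_value); use
 * the _mm512_zext* intrinsics below when zeroed upper bits are required. */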

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_int2mask(int __a)
{
  return (__mmask16)__a;
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask2int(__mmask16 __a)
{
  return (int)__a;
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    128-bit floating-point vector of [2 x double]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd128_pd512(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(),
                                 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    256-bit floating-point vector of [4 x double]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(),
                                 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
///    the value of the source vector. The upper 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps128_ps512(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(),
                                 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
///    the value of the source vector. The upper 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps256_ps512(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(),
                                 0, 1, 2, 3, 4, 5, 6, 7,
                                 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit integer vector from a 128-bit integer vector.
///    The lower 128 bits contain the value of the source vector. The upper
///    384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
///    the parameter. The upper 384 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi128_si512(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(),
                                 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit integer vector from a 256-bit integer vector.
///    The lower 256 bits contain the value of the source vector. The upper
///    256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
///    the parameter. The upper 256 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi256_si512(__m256i __a)
{
  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(),
                                 0, 1, 2, 3, 4, 5, 6, 7);
}

/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a & (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si) _mm512_and_epi32(__a, __b),
                                             (__v16si) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
                                               (__v8di) _mm512_and_epi64(__a, __b),
                                               (__v8di) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v16su)__A & (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_andnot_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_andnot_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a | (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si)_mm512_or_epi32(__a, __b),
                                             (__v16si)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_or_epi64(__a, __b),
                                             (__v8di)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a ^ (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si)_mm512_xor_epi32(__a, __b),
                                             (__v16si)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_xor_epi64(__a, __b),
                                             (__v8di)__src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}
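
/* Illustrative note (not from the original header): the andnot forms compute
 * ~__A & __B, matching the VPANDN operand order, so
 *   _mm512_andnot_si512(__mask, __v);
 * clears exactly those bits of __v that are set in __mask. The epi32/epi64
 * flavors differ only in the lane width of their mask_/maskz_ forms. */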

/* Arithmetic */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_add_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a + (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_add_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a + (__v16sf)__b);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mul_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a * (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mul_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a * (__v16sf)__b);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_sub_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a - (__v8df)__b);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_sub_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a - (__v16sf)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A + (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A - (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A + (__v16su) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A - (__v16su) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}
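
/* Illustrative note (not from the original header): the integer add/sub
 * bodies above cast to the unsigned vector types (__v16su, __v8du) so that
 * overflow wraps with defined behavior rather than being signed-overflow
 * undefined behavior; the generated VPADDD/VPSUBQ etc. are sign-agnostic. */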

#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)(W)))

#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_max_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)(W)))

#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_max_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}
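
/* Illustrative note (not from the original header): the *_round_* macro forms
 * expose the instruction's rounding/SAE immediate; passing
 * _MM_FROUND_CUR_DIRECTION reproduces the plain form, e.g.
 *   _mm512_max_round_pd(__a, __b, _MM_FROUND_CUR_DIRECTION);
 * is equivalent to _mm512_max_pd(__a, __b). */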

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) __W,
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) _mm_setzero_ps (),
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) __W,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) _mm_setzero_pd (),
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_max_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_max_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_max_epu32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_max_epu32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epu64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epu64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
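
/* Illustrative note (not from the original header): unlike the 32-bit forms,
 * 64-bit integer max (and the min forms below) have no SSE/AVX2 counterpart;
 * AVX-512F introduces them as VPMAXSQ/VPMAXUQ and VPMINSQ/VPMINUQ. */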

#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)(W)))

#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_min_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_min_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_min_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)(W)))

#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_min_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_min_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_min_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) __W,
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) _mm_setzero_ps (),
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) __W,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) _mm_setzero_pd (),
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epu32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epu32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)__W);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512 ());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)__W);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512 ());
}
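
/* Illustrative note (not from the original header): _mm512_mul_epi32 and
 * _mm512_mul_epu32 are the widening multiplies (VPMULDQ/VPMULUDQ): they
 * multiply the low dword of each qword element into eight 64-bit products,
 * whereas _mm512_mullo_epi32 below keeps only the low 32 bits per lane. */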

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A * (__v16su) __B);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
  return (__m512i) ((__v8du) __A * (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_mullox_epi64(__A, __B),
                                             (__v8di)__W);
}

#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_sqrt_pd(__m512d __A)
{
  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_sqrt_ps(__m512 __A)
{
  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}
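
/* Illustrative note (not from the original header): the rsqrt14/rcp14
 * intrinsics below are hardware approximations of 1/sqrt(x) and 1/x with a
 * maximum relative error of 2^-14 (hence the "14"), in contrast to the
 * correctly rounded _mm512_sqrt_* above. */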

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_rsqrt14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) -1);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_rsqrt14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) -1);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) __W,
                                                    (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_rsqrt14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) _mm_setzero_ps (),
                                                 (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) __W,
                                                 (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) _mm_setzero_ps (),
                                                 (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_rsqrt14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
                                                  (__v2df) __B,
                                                  (__v2df) _mm_setzero_pd (),
                                                  (__mmask8) -1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
                                                   (__v2df) __B,
                                                   (__v2df) __W,
                                                   (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
                                                   (__v2df) __B,
                                                   (__v2df) _mm_setzero_pd (),
                                                   (__mmask8) __U);
}
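
/* Illustrative note (not from the original header): in the scalar ss/sd forms
 * above (and the rcp14 ones below), the approximation is taken from the low
 * element of __B while the upper elements of the result come from __A. */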
__m512d __DEFAULT_FN_ATTRS512 1681 _mm512_rcp14_pd(__m512d __A) 1682 { 1683 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1684 (__v8df) 1685 _mm512_setzero_pd (), 1686 (__mmask8) -1); 1687 } 1688 1689 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1690 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) 1691 { 1692 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1693 (__v8df) __W, 1694 (__mmask8) __U); 1695 } 1696 1697 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1698 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) 1699 { 1700 return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, 1701 (__v8df) 1702 _mm512_setzero_pd (), 1703 (__mmask8) __U); 1704 } 1705 1706 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1707 _mm512_rcp14_ps(__m512 __A) 1708 { 1709 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1710 (__v16sf) 1711 _mm512_setzero_ps (), 1712 (__mmask16) -1); 1713 } 1714 1715 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1716 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) 1717 { 1718 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1719 (__v16sf) __W, 1720 (__mmask16) __U); 1721 } 1722 1723 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1724 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) 1725 { 1726 return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, 1727 (__v16sf) 1728 _mm512_setzero_ps (), 1729 (__mmask16) __U); 1730 } 1731 1732 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1733 _mm_rcp14_ss(__m128 __A, __m128 __B) 1734 { 1735 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1736 (__v4sf) __B, 1737 (__v4sf) 1738 _mm_setzero_ps (), 1739 (__mmask8) -1); 1740 } 1741 1742 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1743 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) 1744 { 1745 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1746 (__v4sf) __B, 1747 (__v4sf) __W, 1748 (__mmask8) __U); 1749 } 1750 1751 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1752 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) 1753 { 1754 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, 1755 (__v4sf) __B, 1756 (__v4sf) _mm_setzero_ps (), 1757 (__mmask8) __U); 1758 } 1759 1760 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1761 _mm_rcp14_sd(__m128d __A, __m128d __B) 1762 { 1763 return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, 1764 (__v2df) __B, 1765 (__v2df) 1766 _mm_setzero_pd (), 1767 (__mmask8) -1); 1768 } 1769 1770 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1771 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) 1772 { 1773 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1774 (__v2df) __B, 1775 (__v2df) __W, 1776 (__mmask8) __U); 1777 } 1778 1779 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1780 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) 1781 { 1782 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, 1783 (__v2df) __B, 1784 (__v2df) _mm_setzero_pd (), 1785 (__mmask8) __U); 1786 } 1787 1788 static __inline __m512 __DEFAULT_FN_ATTRS512 1789 _mm512_floor_ps(__m512 __A) 1790 { 1791 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1792 _MM_FROUND_FLOOR, 1793 (__v16sf) __A, (unsigned short)-1, 1794 _MM_FROUND_CUR_DIRECTION); 1795 } 1796 1797 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1798 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) 1799 { 1800 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1801 _MM_FROUND_FLOOR, 1802 
(__v16sf) __W, __U, 1803 _MM_FROUND_CUR_DIRECTION); 1804 } 1805 1806 static __inline __m512d __DEFAULT_FN_ATTRS512 1807 _mm512_floor_pd(__m512d __A) 1808 { 1809 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1810 _MM_FROUND_FLOOR, 1811 (__v8df) __A, (unsigned char)-1, 1812 _MM_FROUND_CUR_DIRECTION); 1813 } 1814 1815 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1816 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) 1817 { 1818 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1819 _MM_FROUND_FLOOR, 1820 (__v8df) __W, __U, 1821 _MM_FROUND_CUR_DIRECTION); 1822 } 1823 1824 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1825 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) 1826 { 1827 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1828 _MM_FROUND_CEIL, 1829 (__v16sf) __W, __U, 1830 _MM_FROUND_CUR_DIRECTION); 1831 } 1832 1833 static __inline __m512 __DEFAULT_FN_ATTRS512 1834 _mm512_ceil_ps(__m512 __A) 1835 { 1836 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, 1837 _MM_FROUND_CEIL, 1838 (__v16sf) __A, (unsigned short)-1, 1839 _MM_FROUND_CUR_DIRECTION); 1840 } 1841 1842 static __inline __m512d __DEFAULT_FN_ATTRS512 1843 _mm512_ceil_pd(__m512d __A) 1844 { 1845 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1846 _MM_FROUND_CEIL, 1847 (__v8df) __A, (unsigned char)-1, 1848 _MM_FROUND_CUR_DIRECTION); 1849 } 1850 1851 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1852 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) 1853 { 1854 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, 1855 _MM_FROUND_CEIL, 1856 (__v8df) __W, __U, 1857 _MM_FROUND_CUR_DIRECTION); 1858 } 1859 1860 static __inline __m512i __DEFAULT_FN_ATTRS512 1861 _mm512_abs_epi64(__m512i __A) 1862 { 1863 return (__m512i)__builtin_elementwise_abs((__v8di)__A); 1864 } 1865 1866 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1867 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) 1868 { 1869 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1870 (__v8di)_mm512_abs_epi64(__A), 1871 (__v8di)__W); 1872 } 1873 1874 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1875 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) 1876 { 1877 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 1878 (__v8di)_mm512_abs_epi64(__A), 1879 (__v8di)_mm512_setzero_si512()); 1880 } 1881 1882 static __inline __m512i __DEFAULT_FN_ATTRS512 1883 _mm512_abs_epi32(__m512i __A) 1884 { 1885 return (__m512i)__builtin_elementwise_abs((__v16si) __A); 1886 } 1887 1888 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1889 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) 1890 { 1891 return (__m512i)__builtin_ia32_selectd_512(__U, 1892 (__v16si)_mm512_abs_epi32(__A), 1893 (__v16si)__W); 1894 } 1895 1896 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1897 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) 1898 { 1899 return (__m512i)__builtin_ia32_selectd_512(__U, 1900 (__v16si)_mm512_abs_epi32(__A), 1901 (__v16si)_mm512_setzero_si512()); 1902 } 1903 1904 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1905 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 1906 __A = _mm_add_ss(__A, __B); 1907 return __builtin_ia32_selectss_128(__U, __A, __W); 1908 } 1909 1910 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1911 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { 1912 __A = _mm_add_ss(__A, __B); 1913 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 1914 } 1915 1916 #define 
_mm_add_round_ss(A, B, R) \ 1917 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1918 (__v4sf)(__m128)(B), \ 1919 (__v4sf)_mm_setzero_ps(), \ 1920 (__mmask8)-1, (int)(R))) 1921 1922 #define _mm_mask_add_round_ss(W, U, A, B, R) \ 1923 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1924 (__v4sf)(__m128)(B), \ 1925 (__v4sf)(__m128)(W), (__mmask8)(U), \ 1926 (int)(R))) 1927 1928 #define _mm_maskz_add_round_ss(U, A, B, R) \ 1929 ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ 1930 (__v4sf)(__m128)(B), \ 1931 (__v4sf)_mm_setzero_ps(), \ 1932 (__mmask8)(U), (int)(R))) 1933 1934 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1935 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 1936 __A = _mm_add_sd(__A, __B); 1937 return __builtin_ia32_selectsd_128(__U, __A, __W); 1938 } 1939 1940 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1941 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { 1942 __A = _mm_add_sd(__A, __B); 1943 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 1944 } 1945 #define _mm_add_round_sd(A, B, R) \ 1946 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1947 (__v2df)(__m128d)(B), \ 1948 (__v2df)_mm_setzero_pd(), \ 1949 (__mmask8)-1, (int)(R))) 1950 1951 #define _mm_mask_add_round_sd(W, U, A, B, R) \ 1952 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1953 (__v2df)(__m128d)(B), \ 1954 (__v2df)(__m128d)(W), \ 1955 (__mmask8)(U), (int)(R))) 1956 1957 #define _mm_maskz_add_round_sd(U, A, B, R) \ 1958 ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ 1959 (__v2df)(__m128d)(B), \ 1960 (__v2df)_mm_setzero_pd(), \ 1961 (__mmask8)(U), (int)(R))) 1962 1963 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1964 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 1965 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1966 (__v8df)_mm512_add_pd(__A, __B), 1967 (__v8df)__W); 1968 } 1969 1970 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1971 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { 1972 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 1973 (__v8df)_mm512_add_pd(__A, __B), 1974 (__v8df)_mm512_setzero_pd()); 1975 } 1976 1977 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1978 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 1979 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1980 (__v16sf)_mm512_add_ps(__A, __B), 1981 (__v16sf)__W); 1982 } 1983 1984 static __inline__ __m512 __DEFAULT_FN_ATTRS512 1985 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { 1986 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 1987 (__v16sf)_mm512_add_ps(__A, __B), 1988 (__v16sf)_mm512_setzero_ps()); 1989 } 1990 1991 #define _mm512_add_round_pd(A, B, R) \ 1992 ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ 1993 (__v8df)(__m512d)(B), (int)(R))) 1994 1995 #define _mm512_mask_add_round_pd(W, U, A, B, R) \ 1996 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1997 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 1998 (__v8df)(__m512d)(W))) 1999 2000 #define _mm512_maskz_add_round_pd(U, A, B, R) \ 2001 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2002 (__v8df)_mm512_add_round_pd((A), (B), (R)), \ 2003 (__v8df)_mm512_setzero_pd())) 2004 2005 #define _mm512_add_round_ps(A, B, R) \ 2006 ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ 2007 (__v16sf)(__m512)(B), (int)(R))) 2008 2009 #define _mm512_mask_add_round_ps(W, U, A, B, R) \ 2010 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2011 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2012 (__v16sf)(__m512)(W))) 2013 2014 #define _mm512_maskz_add_round_ps(U, A, B, R) \ 2015 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2016 (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ 2017 (__v16sf)_mm512_setzero_ps())) 2018 2019 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2020 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2021 __A = _mm_sub_ss(__A, __B); 2022 return __builtin_ia32_selectss_128(__U, __A, __W); 2023 } 2024 2025 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2026 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2027 __A = _mm_sub_ss(__A, __B); 2028 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2029 } 2030 #define _mm_sub_round_ss(A, B, R) \ 2031 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2032 (__v4sf)(__m128)(B), \ 2033 (__v4sf)_mm_setzero_ps(), \ 2034 (__mmask8)-1, (int)(R))) 2035 2036 #define _mm_mask_sub_round_ss(W, U, A, B, R) \ 2037 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2038 (__v4sf)(__m128)(B), \ 2039 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2040 (int)(R))) 2041 2042 #define _mm_maskz_sub_round_ss(U, A, B, R) \ 2043 ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ 2044 (__v4sf)(__m128)(B), \ 2045 (__v4sf)_mm_setzero_ps(), \ 2046 (__mmask8)(U), (int)(R))) 2047 2048 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2049 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2050 __A = _mm_sub_sd(__A, __B); 2051 return __builtin_ia32_selectsd_128(__U, __A, __W); 2052 } 2053 2054 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2055 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2056 __A = _mm_sub_sd(__A, __B); 2057 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2058 } 2059 2060 #define _mm_sub_round_sd(A, B, R) \ 2061 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2062 (__v2df)(__m128d)(B), \ 2063 (__v2df)_mm_setzero_pd(), \ 2064 (__mmask8)-1, (int)(R))) 2065 2066 #define _mm_mask_sub_round_sd(W, U, A, B, R) \ 2067 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2068 (__v2df)(__m128d)(B), \ 2069 (__v2df)(__m128d)(W), \ 2070 (__mmask8)(U), (int)(R))) 2071 2072 #define _mm_maskz_sub_round_sd(U, A, B, R) \ 2073 ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ 2074 (__v2df)(__m128d)(B), \ 2075 (__v2df)_mm_setzero_pd(), \ 2076 (__mmask8)(U), (int)(R))) 2077 2078 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2079 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2080 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2081 (__v8df)_mm512_sub_pd(__A, __B), 2082 (__v8df)__W); 2083 } 2084 2085 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2086 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2087 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2088 (__v8df)_mm512_sub_pd(__A, __B), 2089 (__v8df)_mm512_setzero_pd()); 2090 } 2091 2092 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2093 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2094 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2095 (__v16sf)_mm512_sub_ps(__A, __B), 2096 (__v16sf)__W); 2097 } 2098 2099 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2100 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2101 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2102 (__v16sf)_mm512_sub_ps(__A, __B), 
2103 (__v16sf)_mm512_setzero_ps()); 2104 } 2105 2106 #define _mm512_sub_round_pd(A, B, R) \ 2107 ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ 2108 (__v8df)(__m512d)(B), (int)(R))) 2109 2110 #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ 2111 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2112 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2113 (__v8df)(__m512d)(W))) 2114 2115 #define _mm512_maskz_sub_round_pd(U, A, B, R) \ 2116 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2117 (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ 2118 (__v8df)_mm512_setzero_pd())) 2119 2120 #define _mm512_sub_round_ps(A, B, R) \ 2121 ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ 2122 (__v16sf)(__m512)(B), (int)(R))) 2123 2124 #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ 2125 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2126 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2127 (__v16sf)(__m512)(W))) 2128 2129 #define _mm512_maskz_sub_round_ps(U, A, B, R) \ 2130 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2131 (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ 2132 (__v16sf)_mm512_setzero_ps())) 2133 2134 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2135 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2136 __A = _mm_mul_ss(__A, __B); 2137 return __builtin_ia32_selectss_128(__U, __A, __W); 2138 } 2139 2140 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2141 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2142 __A = _mm_mul_ss(__A, __B); 2143 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2144 } 2145 #define _mm_mul_round_ss(A, B, R) \ 2146 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2147 (__v4sf)(__m128)(B), \ 2148 (__v4sf)_mm_setzero_ps(), \ 2149 (__mmask8)-1, (int)(R))) 2150 2151 #define _mm_mask_mul_round_ss(W, U, A, B, R) \ 2152 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2153 (__v4sf)(__m128)(B), \ 2154 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2155 (int)(R))) 2156 2157 #define _mm_maskz_mul_round_ss(U, A, B, R) \ 2158 ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ 2159 (__v4sf)(__m128)(B), \ 2160 (__v4sf)_mm_setzero_ps(), \ 2161 (__mmask8)(U), (int)(R))) 2162 2163 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2164 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2165 __A = _mm_mul_sd(__A, __B); 2166 return __builtin_ia32_selectsd_128(__U, __A, __W); 2167 } 2168 2169 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2170 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2171 __A = _mm_mul_sd(__A, __B); 2172 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2173 } 2174 2175 #define _mm_mul_round_sd(A, B, R) \ 2176 ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2177 (__v2df)(__m128d)(B), \ 2178 (__v2df)_mm_setzero_pd(), \ 2179 (__mmask8)-1, (int)(R))) 2180 2181 #define _mm_mask_mul_round_sd(W, U, A, B, R) \ 2182 ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2183 (__v2df)(__m128d)(B), \ 2184 (__v2df)(__m128d)(W), \ 2185 (__mmask8)(U), (int)(R))) 2186 2187 #define _mm_maskz_mul_round_sd(U, A, B, R) \ 2188 ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ 2189 (__v2df)(__m128d)(B), \ 2190 (__v2df)_mm_setzero_pd(), \ 2191 (__mmask8)(U), (int)(R))) 2192 2193 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2194 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2195 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2196 
(__v8df)_mm512_mul_pd(__A, __B), 2197 (__v8df)__W); 2198 } 2199 2200 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2201 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2202 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2203 (__v8df)_mm512_mul_pd(__A, __B), 2204 (__v8df)_mm512_setzero_pd()); 2205 } 2206 2207 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2208 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2209 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2210 (__v16sf)_mm512_mul_ps(__A, __B), 2211 (__v16sf)__W); 2212 } 2213 2214 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2215 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2216 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2217 (__v16sf)_mm512_mul_ps(__A, __B), 2218 (__v16sf)_mm512_setzero_ps()); 2219 } 2220 2221 #define _mm512_mul_round_pd(A, B, R) \ 2222 ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ 2223 (__v8df)(__m512d)(B), (int)(R))) 2224 2225 #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ 2226 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2227 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2228 (__v8df)(__m512d)(W))) 2229 2230 #define _mm512_maskz_mul_round_pd(U, A, B, R) \ 2231 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2232 (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ 2233 (__v8df)_mm512_setzero_pd())) 2234 2235 #define _mm512_mul_round_ps(A, B, R) \ 2236 ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ 2237 (__v16sf)(__m512)(B), (int)(R))) 2238 2239 #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ 2240 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2241 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2242 (__v16sf)(__m512)(W))) 2243 2244 #define _mm512_maskz_mul_round_ps(U, A, B, R) \ 2245 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2246 (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ 2247 (__v16sf)_mm512_setzero_ps())) 2248 2249 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2250 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { 2251 __A = _mm_div_ss(__A, __B); 2252 return __builtin_ia32_selectss_128(__U, __A, __W); 2253 } 2254 2255 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2256 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { 2257 __A = _mm_div_ss(__A, __B); 2258 return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); 2259 } 2260 2261 #define _mm_div_round_ss(A, B, R) \ 2262 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2263 (__v4sf)(__m128)(B), \ 2264 (__v4sf)_mm_setzero_ps(), \ 2265 (__mmask8)-1, (int)(R))) 2266 2267 #define _mm_mask_div_round_ss(W, U, A, B, R) \ 2268 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2269 (__v4sf)(__m128)(B), \ 2270 (__v4sf)(__m128)(W), (__mmask8)(U), \ 2271 (int)(R))) 2272 2273 #define _mm_maskz_div_round_ss(U, A, B, R) \ 2274 ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 2275 (__v4sf)(__m128)(B), \ 2276 (__v4sf)_mm_setzero_ps(), \ 2277 (__mmask8)(U), (int)(R))) 2278 2279 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2280 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { 2281 __A = _mm_div_sd(__A, __B); 2282 return __builtin_ia32_selectsd_128(__U, __A, __W); 2283 } 2284 2285 static __inline__ __m128d __DEFAULT_FN_ATTRS128 2286 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { 2287 __A = _mm_div_sd(__A, __B); 2288 return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); 2289 } 2290 2291 #define _mm_div_round_sd(A, 
B, R) \ 2292 ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2293 (__v2df)(__m128d)(B), \ 2294 (__v2df)_mm_setzero_pd(), \ 2295 (__mmask8)-1, (int)(R))) 2296 2297 #define _mm_mask_div_round_sd(W, U, A, B, R) \ 2298 ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2299 (__v2df)(__m128d)(B), \ 2300 (__v2df)(__m128d)(W), \ 2301 (__mmask8)(U), (int)(R))) 2302 2303 #define _mm_maskz_div_round_sd(U, A, B, R) \ 2304 ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ 2305 (__v2df)(__m128d)(B), \ 2306 (__v2df)_mm_setzero_pd(), \ 2307 (__mmask8)(U), (int)(R))) 2308 2309 static __inline __m512d __DEFAULT_FN_ATTRS512 2310 _mm512_div_pd(__m512d __a, __m512d __b) 2311 { 2312 return (__m512d)((__v8df)__a/(__v8df)__b); 2313 } 2314 2315 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2316 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 2317 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2318 (__v8df)_mm512_div_pd(__A, __B), 2319 (__v8df)__W); 2320 } 2321 2322 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2323 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { 2324 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 2325 (__v8df)_mm512_div_pd(__A, __B), 2326 (__v8df)_mm512_setzero_pd()); 2327 } 2328 2329 static __inline __m512 __DEFAULT_FN_ATTRS512 2330 _mm512_div_ps(__m512 __a, __m512 __b) 2331 { 2332 return (__m512)((__v16sf)__a/(__v16sf)__b); 2333 } 2334 2335 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2336 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 2337 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2338 (__v16sf)_mm512_div_ps(__A, __B), 2339 (__v16sf)__W); 2340 } 2341 2342 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2343 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { 2344 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2345 (__v16sf)_mm512_div_ps(__A, __B), 2346 (__v16sf)_mm512_setzero_ps()); 2347 } 2348 2349 #define _mm512_div_round_pd(A, B, R) \ 2350 ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ 2351 (__v8df)(__m512d)(B), (int)(R))) 2352 2353 #define _mm512_mask_div_round_pd(W, U, A, B, R) \ 2354 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2355 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2356 (__v8df)(__m512d)(W))) 2357 2358 #define _mm512_maskz_div_round_pd(U, A, B, R) \ 2359 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 2360 (__v8df)_mm512_div_round_pd((A), (B), (R)), \ 2361 (__v8df)_mm512_setzero_pd())) 2362 2363 #define _mm512_div_round_ps(A, B, R) \ 2364 ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ 2365 (__v16sf)(__m512)(B), (int)(R))) 2366 2367 #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 2368 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2369 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2370 (__v16sf)(__m512)(W))) 2371 2372 #define _mm512_maskz_div_round_ps(U, A, B, R) \ 2373 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 2374 (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ 2375 (__v16sf)_mm512_setzero_ps())) 2376 2377 #define _mm512_roundscale_ps(A, B) \ 2378 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ 2379 (__v16sf)_mm512_undefined_ps(), \ 2380 (__mmask16)-1, \ 2381 _MM_FROUND_CUR_DIRECTION)) 2382 2383 #define _mm512_mask_roundscale_ps(A, B, C, imm) \ 2384 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2385 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2386 _MM_FROUND_CUR_DIRECTION)) 2387 2388 
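/* Note: in the roundscale immediate, bits [1:0] select the rounding mode
 * (the _MM_FROUND_TO_* macros) and bits [7:4] hold a scale M, so each lane
 * is rounded, in that mode, to a multiple of 2^-M. A minimal sketch,
 * assuming a hypothetical input vector v:
 *
 *   __m512 r0 = _mm512_roundscale_ps(v, _MM_FROUND_TO_NEAREST_INT);
 *   __m512 r4 = _mm512_roundscale_ps(v,               // nearest multiple
 *                   (4 << 4) | _MM_FROUND_TO_NEAREST_INT);  // of 1/16
 */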
#define _mm512_maskz_roundscale_ps(A, B, imm) \ 2389 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2390 (__v16sf)_mm512_setzero_ps(), \ 2391 (__mmask16)(A), \ 2392 _MM_FROUND_CUR_DIRECTION)) 2393 2394 #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ 2395 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ 2396 (__v16sf)(__m512)(A), (__mmask16)(B), \ 2397 (int)(R))) 2398 2399 #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ 2400 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ 2401 (__v16sf)_mm512_setzero_ps(), \ 2402 (__mmask16)(A), (int)(R))) 2403 2404 #define _mm512_roundscale_round_ps(A, imm, R) \ 2405 ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ 2406 (__v16sf)_mm512_undefined_ps(), \ 2407 (__mmask16)-1, (int)(R))) 2408 2409 #define _mm512_roundscale_pd(A, B) \ 2410 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ 2411 (__v8df)_mm512_undefined_pd(), \ 2412 (__mmask8)-1, \ 2413 _MM_FROUND_CUR_DIRECTION)) 2414 2415 #define _mm512_mask_roundscale_pd(A, B, C, imm) \ 2416 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2417 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2418 _MM_FROUND_CUR_DIRECTION)) 2419 2420 #define _mm512_maskz_roundscale_pd(A, B, imm) \ 2421 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2422 (__v8df)_mm512_setzero_pd(), \ 2423 (__mmask8)(A), \ 2424 _MM_FROUND_CUR_DIRECTION)) 2425 2426 #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ 2427 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ 2428 (__v8df)(__m512d)(A), (__mmask8)(B), \ 2429 (int)(R))) 2430 2431 #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ 2432 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ 2433 (__v8df)_mm512_setzero_pd(), \ 2434 (__mmask8)(A), (int)(R))) 2435 2436 #define _mm512_roundscale_round_pd(A, imm, R) \ 2437 ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ 2438 (__v8df)_mm512_undefined_pd(), \ 2439 (__mmask8)-1, (int)(R))) 2440 2441 #define _mm512_fmadd_round_pd(A, B, C, R) \ 2442 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2443 (__v8df)(__m512d)(B), \ 2444 (__v8df)(__m512d)(C), \ 2445 (__mmask8)-1, (int)(R))) 2446 2447 2448 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ 2449 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2450 (__v8df)(__m512d)(B), \ 2451 (__v8df)(__m512d)(C), \ 2452 (__mmask8)(U), (int)(R))) 2453 2454 2455 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ 2456 ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ 2457 (__v8df)(__m512d)(B), \ 2458 (__v8df)(__m512d)(C), \ 2459 (__mmask8)(U), (int)(R))) 2460 2461 2462 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ 2463 ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2464 (__v8df)(__m512d)(B), \ 2465 (__v8df)(__m512d)(C), \ 2466 (__mmask8)(U), (int)(R))) 2467 2468 2469 #define _mm512_fmsub_round_pd(A, B, C, R) \ 2470 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2471 (__v8df)(__m512d)(B), \ 2472 -(__v8df)(__m512d)(C), \ 2473 (__mmask8)-1, (int)(R))) 2474 2475 2476 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ 2477 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 2478 (__v8df)(__m512d)(B), \ 2479 -(__v8df)(__m512d)(C), \ 2480 (__mmask8)(U), (int)(R))) 2481 2482 2483 #define 
_mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ 2484 ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ 2485 (__v8df)(__m512d)(B), \ 2486 -(__v8df)(__m512d)(C), \ 2487 (__mmask8)(U), (int)(R))) 2488 2489 2490 #define _mm512_fnmadd_round_pd(A, B, C, R) \ 2491 ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2492 (__v8df)(__m512d)(B), \ 2493 (__v8df)(__m512d)(C), \ 2494 (__mmask8)-1, (int)(R))) 2495 2496 2497 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ 2498 ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ 2499 (__v8df)(__m512d)(B), \ 2500 (__v8df)(__m512d)(C), \ 2501 (__mmask8)(U), (int)(R))) 2502 2503 2504 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ 2505 ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2506 (__v8df)(__m512d)(B), \ 2507 (__v8df)(__m512d)(C), \ 2508 (__mmask8)(U), (int)(R))) 2509 2510 2511 #define _mm512_fnmsub_round_pd(A, B, C, R) \ 2512 ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ 2513 (__v8df)(__m512d)(B), \ 2514 -(__v8df)(__m512d)(C), \ 2515 (__mmask8)-1, (int)(R))) 2516 2517 2518 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ 2519 ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ 2520 (__v8df)(__m512d)(B), \ 2521 -(__v8df)(__m512d)(C), \ 2522 (__mmask8)(U), (int)(R))) 2523 2524 2525 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2526 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2527 { 2528 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2529 (__v8df) __B, 2530 (__v8df) __C, 2531 (__mmask8) -1, 2532 _MM_FROUND_CUR_DIRECTION); 2533 } 2534 2535 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2536 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2537 { 2538 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2539 (__v8df) __B, 2540 (__v8df) __C, 2541 (__mmask8) __U, 2542 _MM_FROUND_CUR_DIRECTION); 2543 } 2544 2545 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2546 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2547 { 2548 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, 2549 (__v8df) __B, 2550 (__v8df) __C, 2551 (__mmask8) __U, 2552 _MM_FROUND_CUR_DIRECTION); 2553 } 2554 2555 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2556 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2557 { 2558 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2559 (__v8df) __B, 2560 (__v8df) __C, 2561 (__mmask8) __U, 2562 _MM_FROUND_CUR_DIRECTION); 2563 } 2564 2565 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2566 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2567 { 2568 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2569 (__v8df) __B, 2570 -(__v8df) __C, 2571 (__mmask8) -1, 2572 _MM_FROUND_CUR_DIRECTION); 2573 } 2574 2575 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2576 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2577 { 2578 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2579 (__v8df) __B, 2580 -(__v8df) __C, 2581 (__mmask8) __U, 2582 _MM_FROUND_CUR_DIRECTION); 2583 } 2584 2585 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2586 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2587 { 2588 return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, 2589 (__v8df) __B, 2590 -(__v8df) __C, 2591 (__mmask8) __U, 2592 _MM_FROUND_CUR_DIRECTION); 2593 } 2594 2595 static __inline__ __m512d __DEFAULT_FN_ATTRS512 
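/* Note: all four fused forms funnel into the same vfmadd builtins with
 * operand negation, computing per lane, with a single rounding at the end:
 *   fmadd:  A*B + C      fmsub:   A*B - C
 *   fnmadd: -(A*B) + C   fnmsub: -(A*B) - C
 * The _round variants accept either _MM_FROUND_CUR_DIRECTION or one of the
 * _MM_FROUND_TO_* modes ORed with _MM_FROUND_NO_EXC. A minimal sketch of a
 * fused accumulation, assuming hypothetical vectors x, y and accumulator
 * acc:
 *
 *   acc = _mm512_fmadd_pd(x, y, acc);   // acc += x*y, one rounding
 */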
2596 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) 2597 { 2598 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2599 -(__v8df) __B, 2600 (__v8df) __C, 2601 (__mmask8) -1, 2602 _MM_FROUND_CUR_DIRECTION); 2603 } 2604 2605 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2606 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2607 { 2608 return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, 2609 (__v8df) __B, 2610 (__v8df) __C, 2611 (__mmask8) __U, 2612 _MM_FROUND_CUR_DIRECTION); 2613 } 2614 2615 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2616 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2617 { 2618 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2619 (__v8df) __B, 2620 (__v8df) __C, 2621 (__mmask8) __U, 2622 _MM_FROUND_CUR_DIRECTION); 2623 } 2624 2625 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2626 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) 2627 { 2628 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 2629 -(__v8df) __B, 2630 -(__v8df) __C, 2631 (__mmask8) -1, 2632 _MM_FROUND_CUR_DIRECTION); 2633 } 2634 2635 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2636 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2637 { 2638 return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, 2639 (__v8df) __B, 2640 -(__v8df) __C, 2641 (__mmask8) __U, 2642 _MM_FROUND_CUR_DIRECTION); 2643 } 2644 2645 #define _mm512_fmadd_round_ps(A, B, C, R) \ 2646 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2647 (__v16sf)(__m512)(B), \ 2648 (__v16sf)(__m512)(C), \ 2649 (__mmask16)-1, (int)(R))) 2650 2651 2652 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ 2653 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2654 (__v16sf)(__m512)(B), \ 2655 (__v16sf)(__m512)(C), \ 2656 (__mmask16)(U), (int)(R))) 2657 2658 2659 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ 2660 ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ 2661 (__v16sf)(__m512)(B), \ 2662 (__v16sf)(__m512)(C), \ 2663 (__mmask16)(U), (int)(R))) 2664 2665 2666 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ 2667 ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2668 (__v16sf)(__m512)(B), \ 2669 (__v16sf)(__m512)(C), \ 2670 (__mmask16)(U), (int)(R))) 2671 2672 2673 #define _mm512_fmsub_round_ps(A, B, C, R) \ 2674 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2675 (__v16sf)(__m512)(B), \ 2676 -(__v16sf)(__m512)(C), \ 2677 (__mmask16)-1, (int)(R))) 2678 2679 2680 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ 2681 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2682 (__v16sf)(__m512)(B), \ 2683 -(__v16sf)(__m512)(C), \ 2684 (__mmask16)(U), (int)(R))) 2685 2686 2687 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ 2688 ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ 2689 (__v16sf)(__m512)(B), \ 2690 -(__v16sf)(__m512)(C), \ 2691 (__mmask16)(U), (int)(R))) 2692 2693 2694 #define _mm512_fnmadd_round_ps(A, B, C, R) \ 2695 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2696 -(__v16sf)(__m512)(B), \ 2697 (__v16sf)(__m512)(C), \ 2698 (__mmask16)-1, (int)(R))) 2699 2700 2701 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ 2702 ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ 2703 (__v16sf)(__m512)(B), \ 2704 (__v16sf)(__m512)(C), \ 2705 (__mmask16)(U), (int)(R))) 2706 2707 2708 #define 
_mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ 2709 ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2710 (__v16sf)(__m512)(B), \ 2711 (__v16sf)(__m512)(C), \ 2712 (__mmask16)(U), (int)(R))) 2713 2714 2715 #define _mm512_fnmsub_round_ps(A, B, C, R) \ 2716 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 2717 -(__v16sf)(__m512)(B), \ 2718 -(__v16sf)(__m512)(C), \ 2719 (__mmask16)-1, (int)(R))) 2720 2721 2722 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ 2723 ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ 2724 (__v16sf)(__m512)(B), \ 2725 -(__v16sf)(__m512)(C), \ 2726 (__mmask16)(U), (int)(R))) 2727 2728 2729 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2730 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2731 { 2732 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2733 (__v16sf) __B, 2734 (__v16sf) __C, 2735 (__mmask16) -1, 2736 _MM_FROUND_CUR_DIRECTION); 2737 } 2738 2739 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2740 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2741 { 2742 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2743 (__v16sf) __B, 2744 (__v16sf) __C, 2745 (__mmask16) __U, 2746 _MM_FROUND_CUR_DIRECTION); 2747 } 2748 2749 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2750 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2751 { 2752 return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, 2753 (__v16sf) __B, 2754 (__v16sf) __C, 2755 (__mmask16) __U, 2756 _MM_FROUND_CUR_DIRECTION); 2757 } 2758 2759 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2760 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2761 { 2762 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2763 (__v16sf) __B, 2764 (__v16sf) __C, 2765 (__mmask16) __U, 2766 _MM_FROUND_CUR_DIRECTION); 2767 } 2768 2769 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2770 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2771 { 2772 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2773 (__v16sf) __B, 2774 -(__v16sf) __C, 2775 (__mmask16) -1, 2776 _MM_FROUND_CUR_DIRECTION); 2777 } 2778 2779 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2780 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 2781 { 2782 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2783 (__v16sf) __B, 2784 -(__v16sf) __C, 2785 (__mmask16) __U, 2786 _MM_FROUND_CUR_DIRECTION); 2787 } 2788 2789 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2790 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2791 { 2792 return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, 2793 (__v16sf) __B, 2794 -(__v16sf) __C, 2795 (__mmask16) __U, 2796 _MM_FROUND_CUR_DIRECTION); 2797 } 2798 2799 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2800 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) 2801 { 2802 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2803 -(__v16sf) __B, 2804 (__v16sf) __C, 2805 (__mmask16) -1, 2806 _MM_FROUND_CUR_DIRECTION); 2807 } 2808 2809 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2810 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 2811 { 2812 return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, 2813 (__v16sf) __B, 2814 (__v16sf) __C, 2815 (__mmask16) __U, 2816 _MM_FROUND_CUR_DIRECTION); 2817 } 2818 2819 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2820 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 
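/* Note: throughout this file the _mm512_mask_* forms merge (lanes whose
 * mask bit is clear keep the value of the pass-through operand), while the
 * _mm512_maskz_* forms zero those lanes. A minimal sketch, assuming
 * hypothetical vectors a, b, c:
 *
 *   __mmask16 k = 0x00FF;                         // low 8 lanes active
 *   __m512 m = _mm512_mask_fmadd_ps(a, k, b, c);  // high lanes keep a
 *   __m512 z = _mm512_maskz_fmadd_ps(k, a, b, c); // high lanes are 0.0f
 */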
2821 { 2822 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2823 (__v16sf) __B, 2824 (__v16sf) __C, 2825 (__mmask16) __U, 2826 _MM_FROUND_CUR_DIRECTION); 2827 } 2828 2829 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2830 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) 2831 { 2832 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 2833 -(__v16sf) __B, 2834 -(__v16sf) __C, 2835 (__mmask16) -1, 2836 _MM_FROUND_CUR_DIRECTION); 2837 } 2838 2839 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2840 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 2841 { 2842 return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, 2843 (__v16sf) __B, 2844 -(__v16sf) __C, 2845 (__mmask16) __U, 2846 _MM_FROUND_CUR_DIRECTION); 2847 } 2848 2849 #define _mm512_fmaddsub_round_pd(A, B, C, R) \ 2850 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2851 (__v8df)(__m512d)(B), \ 2852 (__v8df)(__m512d)(C), \ 2853 (__mmask8)-1, (int)(R))) 2854 2855 2856 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ 2857 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2858 (__v8df)(__m512d)(B), \ 2859 (__v8df)(__m512d)(C), \ 2860 (__mmask8)(U), (int)(R))) 2861 2862 2863 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ 2864 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ 2865 (__v8df)(__m512d)(B), \ 2866 (__v8df)(__m512d)(C), \ 2867 (__mmask8)(U), (int)(R))) 2868 2869 2870 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ 2871 ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2872 (__v8df)(__m512d)(B), \ 2873 (__v8df)(__m512d)(C), \ 2874 (__mmask8)(U), (int)(R))) 2875 2876 2877 #define _mm512_fmsubadd_round_pd(A, B, C, R) \ 2878 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2879 (__v8df)(__m512d)(B), \ 2880 -(__v8df)(__m512d)(C), \ 2881 (__mmask8)-1, (int)(R))) 2882 2883 2884 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ 2885 ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ 2886 (__v8df)(__m512d)(B), \ 2887 -(__v8df)(__m512d)(C), \ 2888 (__mmask8)(U), (int)(R))) 2889 2890 2891 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ 2892 ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ 2893 (__v8df)(__m512d)(B), \ 2894 -(__v8df)(__m512d)(C), \ 2895 (__mmask8)(U), (int)(R))) 2896 2897 2898 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2899 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) 2900 { 2901 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2902 (__v8df) __B, 2903 (__v8df) __C, 2904 (__mmask8) -1, 2905 _MM_FROUND_CUR_DIRECTION); 2906 } 2907 2908 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2909 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2910 { 2911 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2912 (__v8df) __B, 2913 (__v8df) __C, 2914 (__mmask8) __U, 2915 _MM_FROUND_CUR_DIRECTION); 2916 } 2917 2918 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2919 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 2920 { 2921 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, 2922 (__v8df) __B, 2923 (__v8df) __C, 2924 (__mmask8) __U, 2925 _MM_FROUND_CUR_DIRECTION); 2926 } 2927 2928 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2929 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2930 { 2931 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz 
((__v8df) __A, 2932 (__v8df) __B, 2933 (__v8df) __C, 2934 (__mmask8) __U, 2935 _MM_FROUND_CUR_DIRECTION); 2936 } 2937 2938 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2939 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) 2940 { 2941 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2942 (__v8df) __B, 2943 -(__v8df) __C, 2944 (__mmask8) -1, 2945 _MM_FROUND_CUR_DIRECTION); 2946 } 2947 2948 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2949 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 2950 { 2951 return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, 2952 (__v8df) __B, 2953 -(__v8df) __C, 2954 (__mmask8) __U, 2955 _MM_FROUND_CUR_DIRECTION); 2956 } 2957 2958 static __inline__ __m512d __DEFAULT_FN_ATTRS512 2959 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) 2960 { 2961 return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, 2962 (__v8df) __B, 2963 -(__v8df) __C, 2964 (__mmask8) __U, 2965 _MM_FROUND_CUR_DIRECTION); 2966 } 2967 2968 #define _mm512_fmaddsub_round_ps(A, B, C, R) \ 2969 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2970 (__v16sf)(__m512)(B), \ 2971 (__v16sf)(__m512)(C), \ 2972 (__mmask16)-1, (int)(R))) 2973 2974 2975 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ 2976 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2977 (__v16sf)(__m512)(B), \ 2978 (__v16sf)(__m512)(C), \ 2979 (__mmask16)(U), (int)(R))) 2980 2981 2982 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ 2983 ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ 2984 (__v16sf)(__m512)(B), \ 2985 (__v16sf)(__m512)(C), \ 2986 (__mmask16)(U), (int)(R))) 2987 2988 2989 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ 2990 ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 2991 (__v16sf)(__m512)(B), \ 2992 (__v16sf)(__m512)(C), \ 2993 (__mmask16)(U), (int)(R))) 2994 2995 2996 #define _mm512_fmsubadd_round_ps(A, B, C, R) \ 2997 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 2998 (__v16sf)(__m512)(B), \ 2999 -(__v16sf)(__m512)(C), \ 3000 (__mmask16)-1, (int)(R))) 3001 3002 3003 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ 3004 ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ 3005 (__v16sf)(__m512)(B), \ 3006 -(__v16sf)(__m512)(C), \ 3007 (__mmask16)(U), (int)(R))) 3008 3009 3010 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ 3011 ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ 3012 (__v16sf)(__m512)(B), \ 3013 -(__v16sf)(__m512)(C), \ 3014 (__mmask16)(U), (int)(R))) 3015 3016 3017 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3018 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) 3019 { 3020 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3021 (__v16sf) __B, 3022 (__v16sf) __C, 3023 (__mmask16) -1, 3024 _MM_FROUND_CUR_DIRECTION); 3025 } 3026 3027 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3028 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3029 { 3030 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3031 (__v16sf) __B, 3032 (__v16sf) __C, 3033 (__mmask16) __U, 3034 _MM_FROUND_CUR_DIRECTION); 3035 } 3036 3037 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3038 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3039 { 3040 return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, 3041 (__v16sf) __B, 3042 (__v16sf) __C, 3043 
(__mmask16) __U, 3044 _MM_FROUND_CUR_DIRECTION); 3045 } 3046 3047 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3048 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3049 { 3050 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3051 (__v16sf) __B, 3052 (__v16sf) __C, 3053 (__mmask16) __U, 3054 _MM_FROUND_CUR_DIRECTION); 3055 } 3056 3057 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3058 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) 3059 { 3060 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3061 (__v16sf) __B, 3062 -(__v16sf) __C, 3063 (__mmask16) -1, 3064 _MM_FROUND_CUR_DIRECTION); 3065 } 3066 3067 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3068 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3069 { 3070 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, 3071 (__v16sf) __B, 3072 -(__v16sf) __C, 3073 (__mmask16) __U, 3074 _MM_FROUND_CUR_DIRECTION); 3075 } 3076 3077 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3078 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) 3079 { 3080 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, 3081 (__v16sf) __B, 3082 -(__v16sf) __C, 3083 (__mmask16) __U, 3084 _MM_FROUND_CUR_DIRECTION); 3085 } 3086 3087 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ 3088 ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ 3089 (__v8df)(__m512d)(B), \ 3090 (__v8df)(__m512d)(C), \ 3091 (__mmask8)(U), (int)(R))) 3092 3093 3094 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3095 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3096 { 3097 return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, 3098 (__v8df) __B, 3099 (__v8df) __C, 3100 (__mmask8) __U, 3101 _MM_FROUND_CUR_DIRECTION); 3102 } 3103 3104 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ 3105 ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ 3106 (__v16sf)(__m512)(B), \ 3107 (__v16sf)(__m512)(C), \ 3108 (__mmask16)(U), (int)(R))) 3109 3110 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3111 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3112 { 3113 return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, 3114 (__v16sf) __B, 3115 (__v16sf) __C, 3116 (__mmask16) __U, 3117 _MM_FROUND_CUR_DIRECTION); 3118 } 3119 3120 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ 3121 ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ 3122 (__v8df)(__m512d)(B), \ 3123 (__v8df)(__m512d)(C), \ 3124 (__mmask8)(U), (int)(R))) 3125 3126 3127 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3128 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3129 { 3130 return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, 3131 (__v8df) __B, 3132 (__v8df) __C, 3133 (__mmask8) __U, 3134 _MM_FROUND_CUR_DIRECTION); 3135 } 3136 3137 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ 3138 ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ 3139 (__v16sf)(__m512)(B), \ 3140 (__v16sf)(__m512)(C), \ 3141 (__mmask16)(U), (int)(R))) 3142 3143 3144 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3145 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3146 { 3147 return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, 3148 (__v16sf) __B, 3149 (__v16sf) __C, 3150 (__mmask16) __U, 3151 _MM_FROUND_CUR_DIRECTION); 3152 } 3153 3154 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) 
\ 3155 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3156 -(__v8df)(__m512d)(B), \ 3157 (__v8df)(__m512d)(C), \ 3158 (__mmask8)(U), (int)(R))) 3159 3160 3161 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3162 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3163 { 3164 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3165 -(__v8df) __B, 3166 (__v8df) __C, 3167 (__mmask8) __U, 3168 _MM_FROUND_CUR_DIRECTION); 3169 } 3170 3171 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ 3172 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3173 -(__v16sf)(__m512)(B), \ 3174 (__v16sf)(__m512)(C), \ 3175 (__mmask16)(U), (int)(R))) 3176 3177 3178 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3179 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3180 { 3181 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3182 -(__v16sf) __B, 3183 (__v16sf) __C, 3184 (__mmask16) __U, 3185 _MM_FROUND_CUR_DIRECTION); 3186 } 3187 3188 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ 3189 ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ 3190 -(__v8df)(__m512d)(B), \ 3191 -(__v8df)(__m512d)(C), \ 3192 (__mmask8)(U), (int)(R))) 3193 3194 3195 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ 3196 ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ 3197 (__v8df)(__m512d)(B), \ 3198 (__v8df)(__m512d)(C), \ 3199 (__mmask8)(U), (int)(R))) 3200 3201 3202 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3203 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) 3204 { 3205 return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, 3206 -(__v8df) __B, 3207 -(__v8df) __C, 3208 (__mmask8) __U, 3209 _MM_FROUND_CUR_DIRECTION); 3210 } 3211 3212 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3213 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) 3214 { 3215 return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, 3216 (__v8df) __B, 3217 (__v8df) __C, 3218 (__mmask8) __U, 3219 _MM_FROUND_CUR_DIRECTION); 3220 } 3221 3222 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ 3223 ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ 3224 -(__v16sf)(__m512)(B), \ 3225 -(__v16sf)(__m512)(C), \ 3226 (__mmask16)(U), (int)(R))) 3227 3228 3229 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ 3230 ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ 3231 (__v16sf)(__m512)(B), \ 3232 (__v16sf)(__m512)(C), \ 3233 (__mmask16)(U), (int)(R))) 3234 3235 3236 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3237 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) 3238 { 3239 return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, 3240 -(__v16sf) __B, 3241 -(__v16sf) __C, 3242 (__mmask16) __U, 3243 _MM_FROUND_CUR_DIRECTION); 3244 } 3245 3246 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3247 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) 3248 { 3249 return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, 3250 (__v16sf) __B, 3251 (__v16sf) __C, 3252 (__mmask16) __U, 3253 _MM_FROUND_CUR_DIRECTION); 3254 } 3255 3256 3257 3258 /* Vector permutations */ 3259 3260 static __inline __m512i __DEFAULT_FN_ATTRS512 3261 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) 3262 { 3263 return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, 3264 (__v16si) __B); 3265 } 3266 3267 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3268 
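/* Note: permutex2var treats __A and __B as a single 32-entry dword table
 * (16 qwords for the epi64 form): bits [3:0] of each index lane pick an
 * element, and bit 4 picks __A (0) or __B (1). A minimal sketch that
 * gathers the even-indexed dwords of the concatenated pair a:b, assuming
 * hypothetical vectors a and b:
 *
 *   __m512i idx  = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16,
 *                                   14, 12, 10,  8,  6,  4,  2,  0);
 *   __m512i even = _mm512_permutex2var_epi32(a, idx, b);
 */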
_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, 3269 __m512i __B) 3270 { 3271 return (__m512i)__builtin_ia32_selectd_512(__U, 3272 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3273 (__v16si)__A); 3274 } 3275 3276 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3277 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, 3278 __m512i __B) 3279 { 3280 return (__m512i)__builtin_ia32_selectd_512(__U, 3281 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3282 (__v16si)__I); 3283 } 3284 3285 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3286 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, 3287 __m512i __B) 3288 { 3289 return (__m512i)__builtin_ia32_selectd_512(__U, 3290 (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), 3291 (__v16si)_mm512_setzero_si512()); 3292 } 3293 3294 static __inline __m512i __DEFAULT_FN_ATTRS512 3295 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) 3296 { 3297 return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, 3298 (__v8di) __B); 3299 } 3300 3301 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3302 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, 3303 __m512i __B) 3304 { 3305 return (__m512i)__builtin_ia32_selectq_512(__U, 3306 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3307 (__v8di)__A); 3308 } 3309 3310 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3311 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, 3312 __m512i __B) 3313 { 3314 return (__m512i)__builtin_ia32_selectq_512(__U, 3315 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3316 (__v8di)__I); 3317 } 3318 3319 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3320 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, 3321 __m512i __B) 3322 { 3323 return (__m512i)__builtin_ia32_selectq_512(__U, 3324 (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), 3325 (__v8di)_mm512_setzero_si512()); 3326 } 3327 3328 #define _mm512_alignr_epi64(A, B, I) \ 3329 ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ 3330 (__v8di)(__m512i)(B), (int)(I))) 3331 3332 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ 3333 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3334 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3335 (__v8di)(__m512i)(W))) 3336 3337 #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ 3338 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 3339 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ 3340 (__v8di)_mm512_setzero_si512())) 3341 3342 #define _mm512_alignr_epi32(A, B, I) \ 3343 ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ 3344 (__v16si)(__m512i)(B), (int)(I))) 3345 3346 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ 3347 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3348 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3349 (__v16si)(__m512i)(W))) 3350 3351 #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ 3352 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 3353 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ 3354 (__v16si)_mm512_setzero_si512())) 3355 /* Vector Extract */ 3356 3357 #define _mm512_extractf64x4_pd(A, I) \ 3358 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ 3359 (__v4df)_mm256_undefined_pd(), \ 3360 (__mmask8)-1)) 3361 3362 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ 3363 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3364 (__v4df)(__m256d)(W), \ 3365 (__mmask8)(U))) 3366 3367 #define 
_mm512_maskz_extractf64x4_pd(U, A, imm) \ 3368 ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ 3369 (__v4df)_mm256_setzero_pd(), \ 3370 (__mmask8)(U))) 3371 3372 #define _mm512_extractf32x4_ps(A, I) \ 3373 ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ 3374 (__v4sf)_mm_undefined_ps(), \ 3375 (__mmask8)-1)) 3376 3377 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ 3378 ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3379 (__v4sf)(__m128)(W), \ 3380 (__mmask8)(U))) 3381 3382 #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ 3383 ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ 3384 (__v4sf)_mm_setzero_ps(), \ 3385 (__mmask8)(U))) 3386 3387 /* Vector Blend */ 3388 3389 static __inline __m512d __DEFAULT_FN_ATTRS512 3390 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) 3391 { 3392 return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, 3393 (__v8df) __W, 3394 (__v8df) __A); 3395 } 3396 3397 static __inline __m512 __DEFAULT_FN_ATTRS512 3398 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) 3399 { 3400 return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, 3401 (__v16sf) __W, 3402 (__v16sf) __A); 3403 } 3404 3405 static __inline __m512i __DEFAULT_FN_ATTRS512 3406 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) 3407 { 3408 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, 3409 (__v8di) __W, 3410 (__v8di) __A); 3411 } 3412 3413 static __inline __m512i __DEFAULT_FN_ATTRS512 3414 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) 3415 { 3416 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, 3417 (__v16si) __W, 3418 (__v16si) __A); 3419 } 3420 3421 /* Compare */ 3422 3423 #define _mm512_cmp_round_ps_mask(A, B, P, R) \ 3424 ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3425 (__v16sf)(__m512)(B), (int)(P), \ 3426 (__mmask16)-1, (int)(R))) 3427 3428 #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ 3429 ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ 3430 (__v16sf)(__m512)(B), (int)(P), \ 3431 (__mmask16)(U), (int)(R))) 3432 3433 #define _mm512_cmp_ps_mask(A, B, P) \ 3434 _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3435 #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ 3436 _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3437 3438 #define _mm512_cmpeq_ps_mask(A, B) \ 3439 _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) 3440 #define _mm512_mask_cmpeq_ps_mask(k, A, B) \ 3441 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) 3442 3443 #define _mm512_cmplt_ps_mask(A, B) \ 3444 _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) 3445 #define _mm512_mask_cmplt_ps_mask(k, A, B) \ 3446 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) 3447 3448 #define _mm512_cmple_ps_mask(A, B) \ 3449 _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) 3450 #define _mm512_mask_cmple_ps_mask(k, A, B) \ 3451 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) 3452 3453 #define _mm512_cmpunord_ps_mask(A, B) \ 3454 _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) 3455 #define _mm512_mask_cmpunord_ps_mask(k, A, B) \ 3456 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) 3457 3458 #define _mm512_cmpneq_ps_mask(A, B) \ 3459 _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) 3460 #define _mm512_mask_cmpneq_ps_mask(k, A, B) \ 3461 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) 3462 3463 #define _mm512_cmpnlt_ps_mask(A, B) \ 3464 _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) 3465 
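/* Note: these comparisons produce a bitmask rather than a vector, so the
 * result can feed any masked operation or be tested directly. A minimal
 * sketch that clamps negative lanes to zero and counts them, assuming a
 * hypothetical input vector v:
 *
 *   __mmask16 neg  = _mm512_cmplt_ps_mask(v, _mm512_setzero_ps());
 *   __m512 clamped = _mm512_mask_blend_ps(neg, v, _mm512_setzero_ps());
 *   int nneg       = __builtin_popcount((unsigned)neg);  // lanes < 0
 */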
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ 3466 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) 3467 3468 #define _mm512_cmpnle_ps_mask(A, B) \ 3469 _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) 3470 #define _mm512_mask_cmpnle_ps_mask(k, A, B) \ 3471 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) 3472 3473 #define _mm512_cmpord_ps_mask(A, B) \ 3474 _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) 3475 #define _mm512_mask_cmpord_ps_mask(k, A, B) \ 3476 _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) 3477 3478 #define _mm512_cmp_round_pd_mask(A, B, P, R) \ 3479 ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3480 (__v8df)(__m512d)(B), (int)(P), \ 3481 (__mmask8)-1, (int)(R))) 3482 3483 #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ 3484 ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ 3485 (__v8df)(__m512d)(B), (int)(P), \ 3486 (__mmask8)(U), (int)(R))) 3487 3488 #define _mm512_cmp_pd_mask(A, B, P) \ 3489 _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3490 #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ 3491 _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 3492 3493 #define _mm512_cmpeq_pd_mask(A, B) \ 3494 _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) 3495 #define _mm512_mask_cmpeq_pd_mask(k, A, B) \ 3496 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) 3497 3498 #define _mm512_cmplt_pd_mask(A, B) \ 3499 _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) 3500 #define _mm512_mask_cmplt_pd_mask(k, A, B) \ 3501 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) 3502 3503 #define _mm512_cmple_pd_mask(A, B) \ 3504 _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) 3505 #define _mm512_mask_cmple_pd_mask(k, A, B) \ 3506 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) 3507 3508 #define _mm512_cmpunord_pd_mask(A, B) \ 3509 _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) 3510 #define _mm512_mask_cmpunord_pd_mask(k, A, B) \ 3511 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) 3512 3513 #define _mm512_cmpneq_pd_mask(A, B) \ 3514 _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) 3515 #define _mm512_mask_cmpneq_pd_mask(k, A, B) \ 3516 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) 3517 3518 #define _mm512_cmpnlt_pd_mask(A, B) \ 3519 _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) 3520 #define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ 3521 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) 3522 3523 #define _mm512_cmpnle_pd_mask(A, B) \ 3524 _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) 3525 #define _mm512_mask_cmpnle_pd_mask(k, A, B) \ 3526 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) 3527 3528 #define _mm512_cmpord_pd_mask(A, B) \ 3529 _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) 3530 #define _mm512_mask_cmpord_pd_mask(k, A, B) \ 3531 _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) 3532 3533 /* Conversion */ 3534 3535 #define _mm512_cvtt_roundps_epu32(A, R) \ 3536 ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3537 (__v16si)_mm512_undefined_epi32(), \ 3538 (__mmask16)-1, (int)(R))) 3539 3540 #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ 3541 ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3542 (__v16si)(__m512i)(W), \ 3543 (__mmask16)(U), (int)(R))) 3544 3545 #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ 3546 ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ 3547 (__v16si)_mm512_setzero_si512(), \ 3548 (__mmask16)(U), (int)(R))) 3549 3550 3551 static __inline __m512i __DEFAULT_FN_ATTRS512 3552 _mm512_cvttps_epu32(__m512 __A) 3553 { 3554 return (__m512i) 
__builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3555 (__v16si) 3556 _mm512_setzero_si512 (), 3557 (__mmask16) -1, 3558 _MM_FROUND_CUR_DIRECTION); 3559 } 3560 3561 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3562 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 3563 { 3564 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3565 (__v16si) __W, 3566 (__mmask16) __U, 3567 _MM_FROUND_CUR_DIRECTION); 3568 } 3569 3570 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3571 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) 3572 { 3573 return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, 3574 (__v16si) _mm512_setzero_si512 (), 3575 (__mmask16) __U, 3576 _MM_FROUND_CUR_DIRECTION); 3577 } 3578 3579 #define _mm512_cvt_roundepi32_ps(A, R) \ 3580 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3581 (__v16sf)_mm512_setzero_ps(), \ 3582 (__mmask16)-1, (int)(R))) 3583 3584 #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ 3585 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3586 (__v16sf)(__m512)(W), \ 3587 (__mmask16)(U), (int)(R))) 3588 3589 #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ 3590 ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ 3591 (__v16sf)_mm512_setzero_ps(), \ 3592 (__mmask16)(U), (int)(R))) 3593 3594 #define _mm512_cvt_roundepu32_ps(A, R) \ 3595 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3596 (__v16sf)_mm512_setzero_ps(), \ 3597 (__mmask16)-1, (int)(R))) 3598 3599 #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ 3600 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3601 (__v16sf)(__m512)(W), \ 3602 (__mmask16)(U), (int)(R))) 3603 3604 #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ 3605 ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ 3606 (__v16sf)_mm512_setzero_ps(), \ 3607 (__mmask16)(U), (int)(R))) 3608 3609 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3610 _mm512_cvtepu32_ps (__m512i __A) 3611 { 3612 return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); 3613 } 3614 3615 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3616 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3617 { 3618 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3619 (__v16sf)_mm512_cvtepu32_ps(__A), 3620 (__v16sf)__W); 3621 } 3622 3623 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3624 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) 3625 { 3626 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3627 (__v16sf)_mm512_cvtepu32_ps(__A), 3628 (__v16sf)_mm512_setzero_ps()); 3629 } 3630 3631 static __inline __m512d __DEFAULT_FN_ATTRS512 3632 _mm512_cvtepi32_pd(__m256i __A) 3633 { 3634 return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); 3635 } 3636 3637 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3638 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3639 { 3640 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3641 (__v8df)_mm512_cvtepi32_pd(__A), 3642 (__v8df)__W); 3643 } 3644 3645 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3646 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) 3647 { 3648 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3649 (__v8df)_mm512_cvtepi32_pd(__A), 3650 (__v8df)_mm512_setzero_pd()); 3651 } 3652 3653 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3654 _mm512_cvtepi32lo_pd(__m512i __A) 3655 { 3656 return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); 3657 } 3658 3659 static __inline__ __m512d 
__DEFAULT_FN_ATTRS512 3660 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U, __m512i __A) 3661 { 3662 return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3663 } 3664 3665 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3666 _mm512_cvtepi32_ps (__m512i __A) 3667 { 3668 return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); 3669 } 3670 3671 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3672 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) 3673 { 3674 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3675 (__v16sf)_mm512_cvtepi32_ps(__A), 3676 (__v16sf)__W); 3677 } 3678 3679 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3680 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) 3681 { 3682 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 3683 (__v16sf)_mm512_cvtepi32_ps(__A), 3684 (__v16sf)_mm512_setzero_ps()); 3685 } 3686 3687 static __inline __m512d __DEFAULT_FN_ATTRS512 3688 _mm512_cvtepu32_pd(__m256i __A) 3689 { 3690 return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); 3691 } 3692 3693 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3694 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) 3695 { 3696 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3697 (__v8df)_mm512_cvtepu32_pd(__A), 3698 (__v8df)__W); 3699 } 3700 3701 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3702 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) 3703 { 3704 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 3705 (__v8df)_mm512_cvtepu32_pd(__A), 3706 (__v8df)_mm512_setzero_pd()); 3707 } 3708 3709 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3710 _mm512_cvtepu32lo_pd(__m512i __A) 3711 { 3712 return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); 3713 } 3714 3715 static __inline__ __m512d __DEFAULT_FN_ATTRS512 3716 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A) 3717 { 3718 return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); 3719 } 3720 3721 #define _mm512_cvt_roundpd_ps(A, R) \ 3722 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3723 (__v8sf)_mm256_setzero_ps(), \ 3724 (__mmask8)-1, (int)(R))) 3725 3726 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ 3727 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3728 (__v8sf)(__m256)(W), (__mmask8)(U), \ 3729 (int)(R))) 3730 3731 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ 3732 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ 3733 (__v8sf)_mm256_setzero_ps(), \ 3734 (__mmask8)(U), (int)(R))) 3735 3736 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3737 _mm512_cvtpd_ps (__m512d __A) 3738 { 3739 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3740 (__v8sf) _mm256_undefined_ps (), 3741 (__mmask8) -1, 3742 _MM_FROUND_CUR_DIRECTION); 3743 } 3744 3745 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3746 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) 3747 { 3748 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3749 (__v8sf) __W, 3750 (__mmask8) __U, 3751 _MM_FROUND_CUR_DIRECTION); 3752 } 3753 3754 static __inline__ __m256 __DEFAULT_FN_ATTRS512 3755 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) 3756 { 3757 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, 3758 (__v8sf) _mm256_setzero_ps (), 3759 (__mmask8) __U, 3760 _MM_FROUND_CUR_DIRECTION); 3761 } 3762 3763 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3764 _mm512_cvtpd_pslo (__m512d __A) 3765 { 3766 return (__m512)
__builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), 3767 (__v8sf) _mm256_setzero_ps (), 3768 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3769 } 3770 3771 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3772 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U, __m512d __A) 3773 { 3774 return (__m512) __builtin_shufflevector ( 3775 (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), 3776 __U, __A), 3777 (__v8sf) _mm256_setzero_ps (), 3778 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3779 } 3780 3781 #define _mm512_cvt_roundps_ph(A, I) \ 3782 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3783 (__v16hi)_mm256_undefined_si256(), \ 3784 (__mmask16)-1)) 3785 3786 #define _mm512_mask_cvt_roundps_ph(W, U, A, I) \ 3787 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3788 (__v16hi)(__m256i)(W), \ 3789 (__mmask16)(U))) 3790 3791 #define _mm512_maskz_cvt_roundps_ph(U, A, I) \ 3792 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ 3793 (__v16hi)_mm256_setzero_si256(), \ 3794 (__mmask16)(U))) 3795 3796 #define _mm512_cvtps_ph _mm512_cvt_roundps_ph 3797 #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph 3798 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph 3799 3800 #define _mm512_cvt_roundph_ps(A, R) \ 3801 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3802 (__v16sf)_mm512_undefined_ps(), \ 3803 (__mmask16)-1, (int)(R))) 3804 3805 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ 3806 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3807 (__v16sf)(__m512)(W), \ 3808 (__mmask16)(U), (int)(R))) 3809 3810 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ 3811 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ 3812 (__v16sf)_mm512_setzero_ps(), \ 3813 (__mmask16)(U), (int)(R))) 3814 3815 3816 static __inline __m512 __DEFAULT_FN_ATTRS512 3817 _mm512_cvtph_ps(__m256i __A) 3818 { 3819 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3820 (__v16sf) 3821 _mm512_setzero_ps (), 3822 (__mmask16) -1, 3823 _MM_FROUND_CUR_DIRECTION); 3824 } 3825 3826 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3827 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) 3828 { 3829 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3830 (__v16sf) __W, 3831 (__mmask16) __U, 3832 _MM_FROUND_CUR_DIRECTION); 3833 } 3834 3835 static __inline__ __m512 __DEFAULT_FN_ATTRS512 3836 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) 3837 { 3838 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, 3839 (__v16sf) _mm512_setzero_ps (), 3840 (__mmask16) __U, 3841 _MM_FROUND_CUR_DIRECTION); 3842 } 3843 3844 #define _mm512_cvtt_roundpd_epi32(A, R) \ 3845 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3846 (__v8si)_mm256_setzero_si256(), \ 3847 (__mmask8)-1, (int)(R))) 3848 3849 #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ 3850 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3851 (__v8si)(__m256i)(W), \ 3852 (__mmask8)(U), (int)(R))) 3853 3854 #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ 3855 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ 3856 (__v8si)_mm256_setzero_si256(), \ 3857 (__mmask8)(U), (int)(R))) 3858
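/* Usage sketch (illustrative, not part of this header's API surface; `d'
 * and `t' are hypothetical caller names, compiled with -mavx512f):
 * truncate eight doubles toward zero while suppressing FP exceptions via
 * SAE (_MM_FROUND_NO_EXC is provided via <immintrin.h>).
 *
 *   __m512d d = _mm512_set1_pd(3.7);
 *   __m256i t = _mm512_cvtt_roundpd_epi32(d, _MM_FROUND_NO_EXC);
 *   // Each lane of t is 3; the maskz variant would zero unselected lanes.
 */
3859 static __inline __m256i __DEFAULT_FN_ATTRS512 3860 _mm512_cvttpd_epi32(__m512d __a) 3861 { 3862 return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, 3863 (__v8si)_mm256_setzero_si256(), 3864 (__mmask8) -1, 3865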
_MM_FROUND_CUR_DIRECTION); 3866 } 3867 3868 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3869 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3870 { 3871 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3872 (__v8si) __W, 3873 (__mmask8) __U, 3874 _MM_FROUND_CUR_DIRECTION); 3875 } 3876 3877 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3878 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) 3879 { 3880 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, 3881 (__v8si) _mm256_setzero_si256 (), 3882 (__mmask8) __U, 3883 _MM_FROUND_CUR_DIRECTION); 3884 } 3885 3886 #define _mm512_cvtt_roundps_epi32(A, R) \ 3887 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3888 (__v16si)_mm512_setzero_si512(), \ 3889 (__mmask16)-1, (int)(R))) 3890 3891 #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ 3892 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3893 (__v16si)(__m512i)(W), \ 3894 (__mmask16)(U), (int)(R))) 3895 3896 #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ 3897 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ 3898 (__v16si)_mm512_setzero_si512(), \ 3899 (__mmask16)(U), (int)(R))) 3900 3901 static __inline __m512i __DEFAULT_FN_ATTRS512 3902 _mm512_cvttps_epi32(__m512 __a) 3903 { 3904 return (__m512i) 3905 __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, 3906 (__v16si) _mm512_setzero_si512 (), 3907 (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); 3908 } 3909 3910 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3911 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3912 { 3913 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3914 (__v16si) __W, 3915 (__mmask16) __U, 3916 _MM_FROUND_CUR_DIRECTION); 3917 } 3918 3919 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3920 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) 3921 { 3922 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, 3923 (__v16si) _mm512_setzero_si512 (), 3924 (__mmask16) __U, 3925 _MM_FROUND_CUR_DIRECTION); 3926 } 3927 3928 #define _mm512_cvt_roundps_epi32(A, R) \ 3929 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3930 (__v16si)_mm512_setzero_si512(), \ 3931 (__mmask16)-1, (int)(R))) 3932 3933 #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ 3934 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3935 (__v16si)(__m512i)(W), \ 3936 (__mmask16)(U), (int)(R))) 3937 3938 #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ 3939 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ 3940 (__v16si)_mm512_setzero_si512(), \ 3941 (__mmask16)(U), (int)(R))) 3942 3943 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3944 _mm512_cvtps_epi32 (__m512 __A) 3945 { 3946 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3947 (__v16si) _mm512_undefined_epi32 (), 3948 (__mmask16) -1, 3949 _MM_FROUND_CUR_DIRECTION); 3950 } 3951 3952 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3953 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) 3954 { 3955 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3956 (__v16si) __W, 3957 (__mmask16) __U, 3958 _MM_FROUND_CUR_DIRECTION); 3959 } 3960 3961 static __inline__ __m512i __DEFAULT_FN_ATTRS512 3962 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) 3963 { 3964 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, 3965 (__v16si) 3966 _mm512_setzero_si512 (), 3967 (__mmask16) __U, 3968 _MM_FROUND_CUR_DIRECTION); 3969 } 3970 3971 #define 
_mm512_cvt_roundpd_epi32(A, R) \ 3972 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3973 (__v8si)_mm256_setzero_si256(), \ 3974 (__mmask8)-1, (int)(R))) 3975 3976 #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ 3977 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3978 (__v8si)(__m256i)(W), \ 3979 (__mmask8)(U), (int)(R))) 3980 3981 #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ 3982 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ 3983 (__v8si)_mm256_setzero_si256(), \ 3984 (__mmask8)(U), (int)(R))) 3985 3986 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3987 _mm512_cvtpd_epi32 (__m512d __A) 3988 { 3989 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 3990 (__v8si) 3991 _mm256_undefined_si256 (), 3992 (__mmask8) -1, 3993 _MM_FROUND_CUR_DIRECTION); 3994 } 3995 3996 static __inline__ __m256i __DEFAULT_FN_ATTRS512 3997 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) 3998 { 3999 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 4000 (__v8si) __W, 4001 (__mmask8) __U, 4002 _MM_FROUND_CUR_DIRECTION); 4003 } 4004 4005 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4006 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) 4007 { 4008 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, 4009 (__v8si) 4010 _mm256_setzero_si256 (), 4011 (__mmask8) __U, 4012 _MM_FROUND_CUR_DIRECTION); 4013 } 4014 4015 #define _mm512_cvt_roundps_epu32(A, R) \ 4016 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4017 (__v16si)_mm512_setzero_si512(), \ 4018 (__mmask16)-1, (int)(R))) 4019 4020 #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ 4021 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4022 (__v16si)(__m512i)(W), \ 4023 (__mmask16)(U), (int)(R))) 4024 4025 #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ 4026 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ 4027 (__v16si)_mm512_setzero_si512(), \ 4028 (__mmask16)(U), (int)(R))) 4029 4030 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4031 _mm512_cvtps_epu32 (__m512 __A) 4032 { 4033 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4034 (__v16si) 4035 _mm512_undefined_epi32 (), 4036 (__mmask16) -1, 4037 _MM_FROUND_CUR_DIRECTION); 4038 } 4039 4040 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4041 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) 4042 { 4043 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4044 (__v16si) __W, 4045 (__mmask16) __U, 4046 _MM_FROUND_CUR_DIRECTION); 4047 } 4048 4049 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4050 _mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A) 4051 { 4052 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, 4053 (__v16si) 4054 _mm512_setzero_si512 (), 4055 (__mmask16) __U, 4056 _MM_FROUND_CUR_DIRECTION); 4057 } 4058 4059 #define _mm512_cvt_roundpd_epu32(A, R) \ 4060 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4061 (__v8si)_mm256_setzero_si256(), \ 4062 (__mmask8)-1, (int)(R))) 4063 4064 #define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ 4065 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4066 (__v8si)(__m256i)(W), \ 4067 (__mmask8)(U), (int)(R))) 4068 4069 #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ 4070 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ 4071 (__v8si)_mm256_setzero_si256(), \ 4072 (__mmask8)(U), (int)(R))) 4073
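/* Usage sketch (illustrative only; `d', `src', and `u' are hypothetical
 * caller names, compiled with -mavx512f): convert eight doubles to
 * unsigned 32-bit lanes with an explicit rounding mode, keeping `src'
 * in the masked-off lanes.
 *
 *   __m512d d = _mm512_set1_pd(2.5);
 *   __m256i src = _mm256_setzero_si256();
 *   __m256i u = _mm512_mask_cvt_roundpd_epu32(src, (__mmask8)0x0F, d,
 *                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *   // Lanes 0-3 hold 2 (round-to-nearest-even); lanes 4-7 stay 0.
 */
4074 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4075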
_mm512_cvtpd_epu32 (__m512d __A) 4076 { 4077 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4078 (__v8si) 4079 _mm256_undefined_si256 (), 4080 (__mmask8) -1, 4081 _MM_FROUND_CUR_DIRECTION); 4082 } 4083 4084 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4085 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) 4086 { 4087 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4088 (__v8si) __W, 4089 (__mmask8) __U, 4090 _MM_FROUND_CUR_DIRECTION); 4091 } 4092 4093 static __inline__ __m256i __DEFAULT_FN_ATTRS512 4094 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) 4095 { 4096 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, 4097 (__v8si) 4098 _mm256_setzero_si256 (), 4099 (__mmask8) __U, 4100 _MM_FROUND_CUR_DIRECTION); 4101 } 4102 4103 static __inline__ double __DEFAULT_FN_ATTRS512 4104 _mm512_cvtsd_f64(__m512d __a) 4105 { 4106 return __a[0]; 4107 } 4108 4109 static __inline__ float __DEFAULT_FN_ATTRS512 4110 _mm512_cvtss_f32(__m512 __a) 4111 { 4112 return __a[0]; 4113 } 4114 4115 /* Unpack and Interleave */ 4116 4117 static __inline __m512d __DEFAULT_FN_ATTRS512 4118 _mm512_unpackhi_pd(__m512d __a, __m512d __b) 4119 { 4120 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4121 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4122 } 4123 4124 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4125 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4126 { 4127 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4128 (__v8df)_mm512_unpackhi_pd(__A, __B), 4129 (__v8df)__W); 4130 } 4131 4132 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4133 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) 4134 { 4135 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4136 (__v8df)_mm512_unpackhi_pd(__A, __B), 4137 (__v8df)_mm512_setzero_pd()); 4138 } 4139 4140 static __inline __m512d __DEFAULT_FN_ATTRS512 4141 _mm512_unpacklo_pd(__m512d __a, __m512d __b) 4142 { 4143 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 4144 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4145 } 4146 4147 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4148 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) 4149 { 4150 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4151 (__v8df)_mm512_unpacklo_pd(__A, __B), 4152 (__v8df)__W); 4153 } 4154 4155 static __inline__ __m512d __DEFAULT_FN_ATTRS512 4156 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) 4157 { 4158 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, 4159 (__v8df)_mm512_unpacklo_pd(__A, __B), 4160 (__v8df)_mm512_setzero_pd()); 4161 } 4162 4163 static __inline __m512 __DEFAULT_FN_ATTRS512 4164 _mm512_unpackhi_ps(__m512 __a, __m512 __b) 4165 { 4166 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4167 2, 18, 3, 19, 4168 2+4, 18+4, 3+4, 19+4, 4169 2+8, 18+8, 3+8, 19+8, 4170 2+12, 18+12, 3+12, 19+12); 4171 } 4172 4173 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4174 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4175 { 4176 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4177 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4178 (__v16sf)__W); 4179 } 4180 4181 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4182 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) 4183 { 4184 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4185 (__v16sf)_mm512_unpackhi_ps(__A, __B), 4186 (__v16sf)_mm512_setzero_ps()); 4187 } 4188 4189 
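/* Usage sketch (illustrative, not part of this header; `a', `b', `w',
 * and `r' are hypothetical caller names, compiled with -mavx512f):
 * interleave the high halves of each 128-bit lane under a writemask.
 *
 *   __m512 a = _mm512_set1_ps(1.0f), b = _mm512_set1_ps(2.0f);
 *   __m512 w = _mm512_setzero_ps();
 *   // Lanes whose mask bit is set receive the interleaved result;
 *   // the remaining lanes keep the corresponding element of w.
 *   __m512 r = _mm512_mask_unpackhi_ps(w, (__mmask16)0x5555, a, b);
 */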
static __inline __m512 __DEFAULT_FN_ATTRS512 4190 _mm512_unpacklo_ps(__m512 __a, __m512 __b) 4191 { 4192 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 4193 0, 16, 1, 17, 4194 0+4, 16+4, 1+4, 17+4, 4195 0+8, 16+8, 1+8, 17+8, 4196 0+12, 16+12, 1+12, 17+12); 4197 } 4198 4199 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4200 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) 4201 { 4202 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4203 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4204 (__v16sf)__W); 4205 } 4206 4207 static __inline__ __m512 __DEFAULT_FN_ATTRS512 4208 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) 4209 { 4210 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, 4211 (__v16sf)_mm512_unpacklo_ps(__A, __B), 4212 (__v16sf)_mm512_setzero_ps()); 4213 } 4214 4215 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4216 _mm512_unpackhi_epi32(__m512i __A, __m512i __B) 4217 { 4218 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4219 2, 18, 3, 19, 4220 2+4, 18+4, 3+4, 19+4, 4221 2+8, 18+8, 3+8, 19+8, 4222 2+12, 18+12, 3+12, 19+12); 4223 } 4224 4225 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4226 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4227 { 4228 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4229 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4230 (__v16si)__W); 4231 } 4232 4233 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4234 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4235 { 4236 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4237 (__v16si)_mm512_unpackhi_epi32(__A, __B), 4238 (__v16si)_mm512_setzero_si512()); 4239 } 4240 4241 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4242 _mm512_unpacklo_epi32(__m512i __A, __m512i __B) 4243 { 4244 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 4245 0, 16, 1, 17, 4246 0+4, 16+4, 1+4, 17+4, 4247 0+8, 16+8, 1+8, 17+8, 4248 0+12, 16+12, 1+12, 17+12); 4249 } 4250 4251 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4252 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4253 { 4254 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4255 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4256 (__v16si)__W); 4257 } 4258 4259 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4260 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) 4261 { 4262 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, 4263 (__v16si)_mm512_unpacklo_epi32(__A, __B), 4264 (__v16si)_mm512_setzero_si512()); 4265 } 4266 4267 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4268 _mm512_unpackhi_epi64(__m512i __A, __m512i __B) 4269 { 4270 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4271 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); 4272 } 4273 4274 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4275 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4276 { 4277 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4278 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4279 (__v8di)__W); 4280 } 4281 4282 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4283 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) 4284 { 4285 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4286 (__v8di)_mm512_unpackhi_epi64(__A, __B), 4287 (__v8di)_mm512_setzero_si512()); 4288 } 4289 4290 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4291 _mm512_unpacklo_epi64 (__m512i __A, __m512i 
__B) 4292 { 4293 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 4294 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); 4295 } 4296 4297 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4298 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4299 { 4300 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4301 (__v8di)_mm512_unpacklo_epi64(__A, __B), 4302 (__v8di)__W); 4303 } 4304 4305 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4306 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4307 { 4308 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, 4309 (__v8di)_mm512_unpacklo_epi64(__A, __B), 4310 (__v8di)_mm512_setzero_si512()); 4311 } 4312 4313 4314 /* SIMD load ops */ 4315 4316 static __inline __m512i __DEFAULT_FN_ATTRS512 4317 _mm512_loadu_si512 (void const *__P) 4318 { 4319 struct __loadu_si512 { 4320 __m512i_u __v; 4321 } __attribute__((__packed__, __may_alias__)); 4322 return ((const struct __loadu_si512*)__P)->__v; 4323 } 4324 4325 static __inline __m512i __DEFAULT_FN_ATTRS512 4326 _mm512_loadu_epi32 (void const *__P) 4327 { 4328 struct __loadu_epi32 { 4329 __m512i_u __v; 4330 } __attribute__((__packed__, __may_alias__)); 4331 return ((const struct __loadu_epi32*)__P)->__v; 4332 } 4333 4334 static __inline __m512i __DEFAULT_FN_ATTRS512 4335 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) 4336 { 4337 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, 4338 (__v16si) __W, 4339 (__mmask16) __U); 4340 } 4341 4342 4343 static __inline __m512i __DEFAULT_FN_ATTRS512 4344 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) 4345 { 4346 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, 4347 (__v16si) 4348 _mm512_setzero_si512 (), 4349 (__mmask16) __U); 4350 } 4351 4352 static __inline __m512i __DEFAULT_FN_ATTRS512 4353 _mm512_loadu_epi64 (void const *__P) 4354 { 4355 struct __loadu_epi64 { 4356 __m512i_u __v; 4357 } __attribute__((__packed__, __may_alias__)); 4358 return ((const struct __loadu_epi64*)__P)->__v; 4359 } 4360 4361 static __inline __m512i __DEFAULT_FN_ATTRS512 4362 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) 4363 { 4364 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, 4365 (__v8di) __W, 4366 (__mmask8) __U); 4367 } 4368 4369 static __inline __m512i __DEFAULT_FN_ATTRS512 4370 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) 4371 { 4372 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, 4373 (__v8di) 4374 _mm512_setzero_si512 (), 4375 (__mmask8) __U); 4376 } 4377 4378 static __inline __m512 __DEFAULT_FN_ATTRS512 4379 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) 4380 { 4381 return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, 4382 (__v16sf) __W, 4383 (__mmask16) __U); 4384 } 4385 4386 static __inline __m512 __DEFAULT_FN_ATTRS512 4387 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) 4388 { 4389 return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, 4390 (__v16sf) 4391 _mm512_setzero_ps (), 4392 (__mmask16) __U); 4393 } 4394 4395 static __inline __m512d __DEFAULT_FN_ATTRS512 4396 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) 4397 { 4398 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, 4399 (__v8df) __W, 4400 (__mmask8) __U); 4401 } 4402 4403 static __inline __m512d __DEFAULT_FN_ATTRS512 4404 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) 4405 { 4406 return (__m512d) 
__builtin_ia32_loadupd512_mask ((const double *)__P, 4407 (__v8df) 4408 _mm512_setzero_pd (), 4409 (__mmask8) __U); 4410 } 4411 4412 static __inline __m512d __DEFAULT_FN_ATTRS512 4413 _mm512_loadu_pd(void const *__p) 4414 { 4415 struct __loadu_pd { 4416 __m512d_u __v; 4417 } __attribute__((__packed__, __may_alias__)); 4418 return ((const struct __loadu_pd*)__p)->__v; 4419 } 4420 4421 static __inline __m512 __DEFAULT_FN_ATTRS512 4422 _mm512_loadu_ps(void const *__p) 4423 { 4424 struct __loadu_ps { 4425 __m512_u __v; 4426 } __attribute__((__packed__, __may_alias__)); 4427 return ((const struct __loadu_ps*)__p)->__v; 4428 } 4429 4430 static __inline __m512 __DEFAULT_FN_ATTRS512 4431 _mm512_load_ps(void const *__p) 4432 { 4433 return *(const __m512*)__p; 4434 } 4435 4436 static __inline __m512 __DEFAULT_FN_ATTRS512 4437 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) 4438 { 4439 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, 4440 (__v16sf) __W, 4441 (__mmask16) __U); 4442 } 4443 4444 static __inline __m512 __DEFAULT_FN_ATTRS512 4445 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) 4446 { 4447 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, 4448 (__v16sf) 4449 _mm512_setzero_ps (), 4450 (__mmask16) __U); 4451 } 4452 4453 static __inline __m512d __DEFAULT_FN_ATTRS512 4454 _mm512_load_pd(void const *__p) 4455 { 4456 return *(const __m512d*)__p; 4457 } 4458 4459 static __inline __m512d __DEFAULT_FN_ATTRS512 4460 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) 4461 { 4462 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, 4463 (__v8df) __W, 4464 (__mmask8) __U); 4465 } 4466 4467 static __inline __m512d __DEFAULT_FN_ATTRS512 4468 _mm512_maskz_load_pd(__mmask8 __U, void const *__P) 4469 { 4470 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, 4471 (__v8df) 4472 _mm512_setzero_pd (), 4473 (__mmask8) __U); 4474 } 4475 4476 static __inline __m512i __DEFAULT_FN_ATTRS512 4477 _mm512_load_si512 (void const *__P) 4478 { 4479 return *(const __m512i *) __P; 4480 } 4481 4482 static __inline __m512i __DEFAULT_FN_ATTRS512 4483 _mm512_load_epi32 (void const *__P) 4484 { 4485 return *(const __m512i *) __P; 4486 } 4487 4488 static __inline __m512i __DEFAULT_FN_ATTRS512 4489 _mm512_load_epi64 (void const *__P) 4490 { 4491 return *(const __m512i *) __P; 4492 } 4493 4494 /* SIMD store ops */ 4495 4496 static __inline void __DEFAULT_FN_ATTRS512 4497 _mm512_storeu_epi64 (void *__P, __m512i __A) 4498 { 4499 struct __storeu_epi64 { 4500 __m512i_u __v; 4501 } __attribute__((__packed__, __may_alias__)); 4502 ((struct __storeu_epi64*)__P)->__v = __A; 4503 } 4504 4505 static __inline void __DEFAULT_FN_ATTRS512 4506 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) 4507 { 4508 __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, 4509 (__mmask8) __U); 4510 } 4511 4512 static __inline void __DEFAULT_FN_ATTRS512 4513 _mm512_storeu_si512 (void *__P, __m512i __A) 4514 { 4515 struct __storeu_si512 { 4516 __m512i_u __v; 4517 } __attribute__((__packed__, __may_alias__)); 4518 ((struct __storeu_si512*)__P)->__v = __A; 4519 } 4520 4521 static __inline void __DEFAULT_FN_ATTRS512 4522 _mm512_storeu_epi32 (void *__P, __m512i __A) 4523 { 4524 struct __storeu_epi32 { 4525 __m512i_u __v; 4526 } __attribute__((__packed__, __may_alias__)); 4527 ((struct __storeu_epi32*)__P)->__v = __A; 4528 } 4529 4530 static __inline void __DEFAULT_FN_ATTRS512 4531 _mm512_mask_storeu_epi32(void *__P, __mmask16 
__U, __m512i __A) 4532 { 4533 __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, 4534 (__mmask16) __U); 4535 } 4536 4537 static __inline void __DEFAULT_FN_ATTRS512 4538 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) 4539 { 4540 __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); 4541 } 4542 4543 static __inline void __DEFAULT_FN_ATTRS512 4544 _mm512_storeu_pd(void *__P, __m512d __A) 4545 { 4546 struct __storeu_pd { 4547 __m512d_u __v; 4548 } __attribute__((__packed__, __may_alias__)); 4549 ((struct __storeu_pd*)__P)->__v = __A; 4550 } 4551 4552 static __inline void __DEFAULT_FN_ATTRS512 4553 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) 4554 { 4555 __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, 4556 (__mmask16) __U); 4557 } 4558 4559 static __inline void __DEFAULT_FN_ATTRS512 4560 _mm512_storeu_ps(void *__P, __m512 __A) 4561 { 4562 struct __storeu_ps { 4563 __m512_u __v; 4564 } __attribute__((__packed__, __may_alias__)); 4565 ((struct __storeu_ps*)__P)->__v = __A; 4566 } 4567 4568 static __inline void __DEFAULT_FN_ATTRS512 4569 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) 4570 { 4571 __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); 4572 } 4573 4574 static __inline void __DEFAULT_FN_ATTRS512 4575 _mm512_store_pd(void *__P, __m512d __A) 4576 { 4577 *(__m512d*)__P = __A; 4578 } 4579 4580 static __inline void __DEFAULT_FN_ATTRS512 4581 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) 4582 { 4583 __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, 4584 (__mmask16) __U); 4585 } 4586 4587 static __inline void __DEFAULT_FN_ATTRS512 4588 _mm512_store_ps(void *__P, __m512 __A) 4589 { 4590 *(__m512*)__P = __A; 4591 } 4592 4593 static __inline void __DEFAULT_FN_ATTRS512 4594 _mm512_store_si512 (void *__P, __m512i __A) 4595 { 4596 *(__m512i *) __P = __A; 4597 } 4598 4599 static __inline void __DEFAULT_FN_ATTRS512 4600 _mm512_store_epi32 (void *__P, __m512i __A) 4601 { 4602 *(__m512i *) __P = __A; 4603 } 4604 4605 static __inline void __DEFAULT_FN_ATTRS512 4606 _mm512_store_epi64 (void *__P, __m512i __A) 4607 { 4608 *(__m512i *) __P = __A; 4609 } 4610 4611 /* Mask ops */ 4612 4613 static __inline __mmask16 __DEFAULT_FN_ATTRS 4614 _mm512_knot(__mmask16 __M) 4615 { 4616 return __builtin_ia32_knothi(__M); 4617 } 4618 4619 /* Integer compare */ 4620 4621 #define _mm512_cmpeq_epi32_mask(A, B) \ 4622 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) 4623 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ 4624 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) 4625 #define _mm512_cmpge_epi32_mask(A, B) \ 4626 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) 4627 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \ 4628 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) 4629 #define _mm512_cmpgt_epi32_mask(A, B) \ 4630 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) 4631 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \ 4632 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) 4633 #define _mm512_cmple_epi32_mask(A, B) \ 4634 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) 4635 #define _mm512_mask_cmple_epi32_mask(k, A, B) \ 4636 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) 4637 #define _mm512_cmplt_epi32_mask(A, B) \ 4638 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) 4639 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \ 4640 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) 4641 #define _mm512_cmpneq_epi32_mask(A, B) \ 4642 
_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) 4643 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \ 4644 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) 4645 4646 #define _mm512_cmpeq_epu32_mask(A, B) \ 4647 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) 4648 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \ 4649 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) 4650 #define _mm512_cmpge_epu32_mask(A, B) \ 4651 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) 4652 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \ 4653 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) 4654 #define _mm512_cmpgt_epu32_mask(A, B) \ 4655 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) 4656 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \ 4657 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) 4658 #define _mm512_cmple_epu32_mask(A, B) \ 4659 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) 4660 #define _mm512_mask_cmple_epu32_mask(k, A, B) \ 4661 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) 4662 #define _mm512_cmplt_epu32_mask(A, B) \ 4663 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) 4664 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \ 4665 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) 4666 #define _mm512_cmpneq_epu32_mask(A, B) \ 4667 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) 4668 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \ 4669 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) 4670 4671 #define _mm512_cmpeq_epi64_mask(A, B) \ 4672 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) 4673 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \ 4674 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) 4675 #define _mm512_cmpge_epi64_mask(A, B) \ 4676 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) 4677 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \ 4678 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) 4679 #define _mm512_cmpgt_epi64_mask(A, B) \ 4680 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) 4681 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \ 4682 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) 4683 #define _mm512_cmple_epi64_mask(A, B) \ 4684 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) 4685 #define _mm512_mask_cmple_epi64_mask(k, A, B) \ 4686 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) 4687 #define _mm512_cmplt_epi64_mask(A, B) \ 4688 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) 4689 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \ 4690 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) 4691 #define _mm512_cmpneq_epi64_mask(A, B) \ 4692 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) 4693 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \ 4694 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) 4695 4696 #define _mm512_cmpeq_epu64_mask(A, B) \ 4697 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) 4698 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \ 4699 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) 4700 #define _mm512_cmpge_epu64_mask(A, B) \ 4701 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) 4702 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \ 4703 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) 4704 #define _mm512_cmpgt_epu64_mask(A, B) \ 4705 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) 4706 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \ 4707 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) 4708 #define _mm512_cmple_epu64_mask(A, B) \ 4709 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) 4710 #define _mm512_mask_cmple_epu64_mask(k, A, B) \ 4711 _mm512_mask_cmp_epu64_mask((k), (A), 
(B), _MM_CMPINT_LE) 4712 #define _mm512_cmplt_epu64_mask(A, B) \ 4713 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) 4714 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \ 4715 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) 4716 #define _mm512_cmpneq_epu64_mask(A, B) \ 4717 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) 4718 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ 4719 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) 4720 4721 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4722 _mm512_cvtepi8_epi32(__m128i __A) 4723 { 4724 /* This function always performs a signed extension, but __v16qi is a char 4725 which may be signed or unsigned, so use __v16qs. */ 4726 return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); 4727 } 4728 4729 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4730 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4731 { 4732 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4733 (__v16si)_mm512_cvtepi8_epi32(__A), 4734 (__v16si)__W); 4735 } 4736 4737 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4738 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) 4739 { 4740 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4741 (__v16si)_mm512_cvtepi8_epi32(__A), 4742 (__v16si)_mm512_setzero_si512()); 4743 } 4744 4745 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4746 _mm512_cvtepi8_epi64(__m128i __A) 4747 { 4748 /* This function always performs a signed extension, but __v16qi is a char 4749 which may be signed or unsigned, so use __v16qs. */ 4750 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4751 } 4752 4753 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4754 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4755 { 4756 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4757 (__v8di)_mm512_cvtepi8_epi64(__A), 4758 (__v8di)__W); 4759 } 4760 4761 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4762 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) 4763 { 4764 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4765 (__v8di)_mm512_cvtepi8_epi64(__A), 4766 (__v8di)_mm512_setzero_si512 ()); 4767 } 4768 4769 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4770 _mm512_cvtepi32_epi64(__m256i __X) 4771 { 4772 return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); 4773 } 4774 4775 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4776 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4777 { 4778 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4779 (__v8di)_mm512_cvtepi32_epi64(__X), 4780 (__v8di)__W); 4781 } 4782 4783 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4784 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) 4785 { 4786 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4787 (__v8di)_mm512_cvtepi32_epi64(__X), 4788 (__v8di)_mm512_setzero_si512()); 4789 } 4790 4791 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4792 _mm512_cvtepi16_epi32(__m256i __A) 4793 { 4794 return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); 4795 } 4796 4797 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4798 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4799 { 4800 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4801 (__v16si)_mm512_cvtepi16_epi32(__A), 4802 (__v16si)__W); 4803 } 4804 4805 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4806 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) 4807 { 4808 return 
(__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4809 (__v16si)_mm512_cvtepi16_epi32(__A), 4810 (__v16si)_mm512_setzero_si512 ()); 4811 } 4812 4813 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4814 _mm512_cvtepi16_epi64(__m128i __A) 4815 { 4816 return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); 4817 } 4818 4819 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4820 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4821 { 4822 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4823 (__v8di)_mm512_cvtepi16_epi64(__A), 4824 (__v8di)__W); 4825 } 4826 4827 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4828 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) 4829 { 4830 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4831 (__v8di)_mm512_cvtepi16_epi64(__A), 4832 (__v8di)_mm512_setzero_si512()); 4833 } 4834 4835 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4836 _mm512_cvtepu8_epi32(__m128i __A) 4837 { 4838 return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); 4839 } 4840 4841 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4842 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) 4843 { 4844 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4845 (__v16si)_mm512_cvtepu8_epi32(__A), 4846 (__v16si)__W); 4847 } 4848 4849 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4850 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) 4851 { 4852 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4853 (__v16si)_mm512_cvtepu8_epi32(__A), 4854 (__v16si)_mm512_setzero_si512()); 4855 } 4856 4857 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4858 _mm512_cvtepu8_epi64(__m128i __A) 4859 { 4860 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); 4861 } 4862 4863 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4864 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4865 { 4866 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4867 (__v8di)_mm512_cvtepu8_epi64(__A), 4868 (__v8di)__W); 4869 } 4870 4871 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4872 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) 4873 { 4874 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4875 (__v8di)_mm512_cvtepu8_epi64(__A), 4876 (__v8di)_mm512_setzero_si512()); 4877 } 4878 4879 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4880 _mm512_cvtepu32_epi64(__m256i __X) 4881 { 4882 return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); 4883 } 4884 4885 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4886 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) 4887 { 4888 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4889 (__v8di)_mm512_cvtepu32_epi64(__X), 4890 (__v8di)__W); 4891 } 4892 4893 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4894 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) 4895 { 4896 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4897 (__v8di)_mm512_cvtepu32_epi64(__X), 4898 (__v8di)_mm512_setzero_si512()); 4899 } 4900 4901 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4902 _mm512_cvtepu16_epi32(__m256i __A) 4903 { 4904 return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); 4905 } 4906 4907 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4908 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) 4909 { 4910 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4911 (__v16si)_mm512_cvtepu16_epi32(__A), 4912 (__v16si)__W); 4913 } 4914 4915 static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 4916 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) 4917 { 4918 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 4919 (__v16si)_mm512_cvtepu16_epi32(__A), 4920 (__v16si)_mm512_setzero_si512()); 4921 } 4922 4923 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4924 _mm512_cvtepu16_epi64(__m128i __A) 4925 { 4926 return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); 4927 } 4928 4929 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4930 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) 4931 { 4932 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4933 (__v8di)_mm512_cvtepu16_epi64(__A), 4934 (__v8di)__W); 4935 } 4936 4937 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4938 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) 4939 { 4940 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 4941 (__v8di)_mm512_cvtepu16_epi64(__A), 4942 (__v8di)_mm512_setzero_si512()); 4943 } 4944 4945 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4946 _mm512_rorv_epi32 (__m512i __A, __m512i __B) 4947 { 4948 return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); 4949 } 4950 4951 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4952 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 4953 { 4954 return (__m512i)__builtin_ia32_selectd_512(__U, 4955 (__v16si)_mm512_rorv_epi32(__A, __B), 4956 (__v16si)__W); 4957 } 4958 4959 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4960 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 4961 { 4962 return (__m512i)__builtin_ia32_selectd_512(__U, 4963 (__v16si)_mm512_rorv_epi32(__A, __B), 4964 (__v16si)_mm512_setzero_si512()); 4965 } 4966 4967 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4968 _mm512_rorv_epi64 (__m512i __A, __m512i __B) 4969 { 4970 return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); 4971 } 4972 4973 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4974 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 4975 { 4976 return (__m512i)__builtin_ia32_selectq_512(__U, 4977 (__v8di)_mm512_rorv_epi64(__A, __B), 4978 (__v8di)__W); 4979 } 4980 4981 static __inline__ __m512i __DEFAULT_FN_ATTRS512 4982 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 4983 { 4984 return (__m512i)__builtin_ia32_selectq_512(__U, 4985 (__v8di)_mm512_rorv_epi64(__A, __B), 4986 (__v8di)_mm512_setzero_si512()); 4987 } 4988 4989 4990 4991 #define _mm512_cmp_epi32_mask(a, b, p) \ 4992 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 4993 (__v16si)(__m512i)(b), (int)(p), \ 4994 (__mmask16)-1)) 4995 4996 #define _mm512_cmp_epu32_mask(a, b, p) \ 4997 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 4998 (__v16si)(__m512i)(b), (int)(p), \ 4999 (__mmask16)-1)) 5000 5001 #define _mm512_cmp_epi64_mask(a, b, p) \ 5002 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 5003 (__v8di)(__m512i)(b), (int)(p), \ 5004 (__mmask8)-1)) 5005 5006 #define _mm512_cmp_epu64_mask(a, b, p) \ 5007 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 5008 (__v8di)(__m512i)(b), (int)(p), \ 5009 (__mmask8)-1)) 5010 5011 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ 5012 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ 5013 (__v16si)(__m512i)(b), (int)(p), \ 5014 (__mmask16)(m))) 5015 5016 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ 5017 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ 5018 (__v16si)(__m512i)(b), 
(int)(p), \ 5019 (__mmask16)(m))) 5020 5021 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ 5022 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ 5023 (__v8di)(__m512i)(b), (int)(p), \ 5024 (__mmask8)(m))) 5025 5026 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ 5027 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ 5028 (__v8di)(__m512i)(b), (int)(p), \ 5029 (__mmask8)(m))) 5030 5031 #define _mm512_rol_epi32(a, b) \ 5032 ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) 5033 5034 #define _mm512_mask_rol_epi32(W, U, a, b) \ 5035 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5036 (__v16si)_mm512_rol_epi32((a), (b)), \ 5037 (__v16si)(__m512i)(W))) 5038 5039 #define _mm512_maskz_rol_epi32(U, a, b) \ 5040 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5041 (__v16si)_mm512_rol_epi32((a), (b)), \ 5042 (__v16si)_mm512_setzero_si512())) 5043 5044 #define _mm512_rol_epi64(a, b) \ 5045 ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) 5046 5047 #define _mm512_mask_rol_epi64(W, U, a, b) \ 5048 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5049 (__v8di)_mm512_rol_epi64((a), (b)), \ 5050 (__v8di)(__m512i)(W))) 5051 5052 #define _mm512_maskz_rol_epi64(U, a, b) \ 5053 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5054 (__v8di)_mm512_rol_epi64((a), (b)), \ 5055 (__v8di)_mm512_setzero_si512())) 5056 5057 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5058 _mm512_rolv_epi32 (__m512i __A, __m512i __B) 5059 { 5060 return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); 5061 } 5062 5063 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5064 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) 5065 { 5066 return (__m512i)__builtin_ia32_selectd_512(__U, 5067 (__v16si)_mm512_rolv_epi32(__A, __B), 5068 (__v16si)__W); 5069 } 5070 5071 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5072 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) 5073 { 5074 return (__m512i)__builtin_ia32_selectd_512(__U, 5075 (__v16si)_mm512_rolv_epi32(__A, __B), 5076 (__v16si)_mm512_setzero_si512()); 5077 } 5078 5079 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5080 _mm512_rolv_epi64 (__m512i __A, __m512i __B) 5081 { 5082 return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); 5083 } 5084 5085 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5086 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) 5087 { 5088 return (__m512i)__builtin_ia32_selectq_512(__U, 5089 (__v8di)_mm512_rolv_epi64(__A, __B), 5090 (__v8di)__W); 5091 } 5092 5093 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5094 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) 5095 { 5096 return (__m512i)__builtin_ia32_selectq_512(__U, 5097 (__v8di)_mm512_rolv_epi64(__A, __B), 5098 (__v8di)_mm512_setzero_si512()); 5099 } 5100 5101 #define _mm512_ror_epi32(A, B) \ 5102 ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) 5103 5104 #define _mm512_mask_ror_epi32(W, U, A, B) \ 5105 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5106 (__v16si)_mm512_ror_epi32((A), (B)), \ 5107 (__v16si)(__m512i)(W))) 5108 5109 #define _mm512_maskz_ror_epi32(U, A, B) \ 5110 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 5111 (__v16si)_mm512_ror_epi32((A), (B)), \ 5112 (__v16si)_mm512_setzero_si512())) 5113 5114 #define _mm512_ror_epi64(A, B) \ 5115 ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) 5116 5117 #define _mm512_mask_ror_epi64(W, U, A, B) \ 5118 
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5119 (__v8di)_mm512_ror_epi64((A), (B)), \ 5120 (__v8di)(__m512i)(W))) 5121 5122 #define _mm512_maskz_ror_epi64(U, A, B) \ 5123 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 5124 (__v8di)_mm512_ror_epi64((A), (B)), \ 5125 (__v8di)_mm512_setzero_si512())) 5126 5127 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5128 _mm512_slli_epi32(__m512i __A, unsigned int __B) 5129 { 5130 return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B); 5131 } 5132 5133 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5134 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, 5135 unsigned int __B) 5136 { 5137 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5138 (__v16si)_mm512_slli_epi32(__A, __B), 5139 (__v16si)__W); 5140 } 5141 5142 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5143 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { 5144 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5145 (__v16si)_mm512_slli_epi32(__A, __B), 5146 (__v16si)_mm512_setzero_si512()); 5147 } 5148 5149 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5150 _mm512_slli_epi64(__m512i __A, unsigned int __B) 5151 { 5152 return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B); 5153 } 5154 5155 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5156 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) 5157 { 5158 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5159 (__v8di)_mm512_slli_epi64(__A, __B), 5160 (__v8di)__W); 5161 } 5162 5163 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5164 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) 5165 { 5166 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5167 (__v8di)_mm512_slli_epi64(__A, __B), 5168 (__v8di)_mm512_setzero_si512()); 5169 } 5170 5171 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5172 _mm512_srli_epi32(__m512i __A, unsigned int __B) 5173 { 5174 return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B); 5175 } 5176 5177 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5178 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, 5179 unsigned int __B) 5180 { 5181 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5182 (__v16si)_mm512_srli_epi32(__A, __B), 5183 (__v16si)__W); 5184 } 5185 5186 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5187 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { 5188 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 5189 (__v16si)_mm512_srli_epi32(__A, __B), 5190 (__v16si)_mm512_setzero_si512()); 5191 } 5192 5193 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5194 _mm512_srli_epi64(__m512i __A, unsigned int __B) 5195 { 5196 return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B); 5197 } 5198 5199 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5200 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, 5201 unsigned int __B) 5202 { 5203 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5204 (__v8di)_mm512_srli_epi64(__A, __B), 5205 (__v8di)__W); 5206 } 5207 5208 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5209 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, 5210 unsigned int __B) 5211 { 5212 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 5213 (__v8di)_mm512_srli_epi64(__A, __B), 5214 (__v8di)_mm512_setzero_si512()); 5215 } 5216 5217 static __inline__ __m512i __DEFAULT_FN_ATTRS512 5218 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) 5219 { 
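  /* Masked aligned load (vmovdqa32): lanes whose bit in __U is set are
     read from the 64-byte-aligned address __P; clear lanes keep the
     corresponding element of __W. */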
  return (__m512i)__builtin_ia32_movdqa32load512_mask((const __v16si *)__P,
      (__v16si)__W, (__mmask16)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i)__builtin_ia32_movdqa32load512_mask((const __v16si *)__P,
      (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_movdqa32store512_mask((__v16si *)__P, (__v16si)__A,
      (__mmask16)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)__A, (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)__A, (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)__A, (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)__A, (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i)__builtin_ia32_movdqa64load512_mask((const __v8di *)__P,
      (__v8di)__W, (__mmask8)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i)__builtin_ia32_movdqa64load512_mask((const __v8di *)__P,
      (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_movdqa64store512_mask((__v8di *)__P, (__v8di)__A,
      (__mmask8)__U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_movedup_pd(__m512d __A)
{
  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
      0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
      (__v8df)_mm512_movedup_pd(__A), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
      (__v8df)_mm512_movedup_pd(__A), (__v8df)_mm512_setzero_pd());
}

#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm512_fixupimm_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8di)(__m512i)(C), (int)(imm), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)(U), (int)(R)))

#define _mm512_fixupimm_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16si)(__m512i)(C), (int)(imm), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_sd(A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2di)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_ss(A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4si)(__m128i)(C), (int)(imm), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_getexp_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_getexp_sd(__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_getexp_ss(__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))

#define _mm_getmant_round_sd(A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sd(A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(((D)<<2) | (C)), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_getmant_round_ss(A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_ss(A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(((D)<<2) | (C)), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kmov(__mmask16 __A)
{
  return __A;
}

#define _mm_comi_round_sd(A, B, P, R) \
  ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
      (int)(P), (int)(R)))

#define _mm_comi_round_ss(A, B, P, R) \
  ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
      (int)(P), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_pslld512((__v16si)__A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
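/* The sllv/srlv/srav family shifts each element by the count held in the
   corresponding element of a count vector, rather than by one scalar count;
   out-of-range counts (>= the element width) produce zero for the logical
   shifts. A minimal sketch (illustrative values only; `v` is an assumed
   vector):

     __m512i counts = _mm512_set1_epi32(3);
     __m512i r = _mm512_sllv_epi32(v, counts);   // r[i] = v[i] << 3
*/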
_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sllv_epi32(__X, __Y), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sllv_epi32(__X, __Y), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sllv_epi64(__X, __Y), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sllv_epi64(__X, __Y), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrad512((__v16si)__A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srav_epi32(__X, __Y), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srav_epi32(__X, __Y), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srav_epi64(__X, __Y), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srav_epi64(__X, __Y), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrld512((__v16si)__A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srlv_epi32(__X, __Y), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srlv_epi32(__X, __Y), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srlv_epi64(__X, __Y), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
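/* Zero-masking convention, as used below: result elements whose bit in the
   mask is clear are forced to zero instead of being taken from a
   pass-through operand. A minimal sketch (illustrative; `v` and `counts`
   are assumed vectors):

     // keep the low four shifted quadwords, zero the high four
     __m512i r = _mm512_maskz_srlv_epi64(0x0F, v, counts);
*/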
_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srlv_epi64(__X, __Y), (__v8di)_mm512_setzero_si512());
}

/// \enum _MM_TERNLOG_ENUM
/// A helper to represent the ternary logic operations among vector \a A,
/// \a B and \a C. The representation is passed to \a imm.
typedef enum {
  _MM_TERNLOG_A = 0xF0,
  _MM_TERNLOG_B = 0xCC,
  _MM_TERNLOG_C = 0xAA
} _MM_TERNLOG_ENUM;

#define _mm512_ternarylogic_epi32(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)-1))

#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_maskz( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_ternarylogic_epi64(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)-1))

#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))

#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_maskz( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))

#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

#define _mm_cvt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtsd_u32(__m128d __A)
{
  return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
      (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsd_u64(__m128d __A)
{
  return (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
#endif

#define _mm_cvt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtss_u32(__m128 __A)
{
  return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
      (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtss_u64(__m128 __A)
{
  return (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttsd_i32(__m128d __A)
{
  return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_i64(__m128d __A)
{
  return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttsd_u32(__m128d __A)
{
  return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
      (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_u64(__m128d __A)
{
  return (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttss_i32(__m128 __A)
{
  return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttss_i64(__m128 __A)
{
  return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttss_u32(__m128 __A)
{
  return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
      (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttss_u64(__m128 __A)
{
  return (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)__A,
      _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm512_permute_pd(X, C) \
  ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permute_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_permute_pd((X), (C)), (__v8df)(__m512d)(W)))

#define _mm512_maskz_permute_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_permute_pd((X), (C)), (__v8df)_mm512_setzero_pd()))

#define _mm512_permute_ps(X, C) \
  ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))

#define _mm512_mask_permute_ps(W, U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_permute_ps((X), (C)), (__v16sf)(__m512)(W)))

#define _mm512_maskz_permute_ps(U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_permute_ps((X), (C)), (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
      (__v8df)_mm512_permutevar_pd(__A, __C), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
      (__v8df)_mm512_permutevar_pd(__A, __C), (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_permutevar_ps(__A, __C), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_permutevar_ps(__A, __C), (__v16sf)_mm512_setzero_ps());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
      (__v8df)__B);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
      (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
      (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
      (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
      (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
      (__v16sf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
      (__v16sf)_mm512_setzero_ps());
}

#define _mm512_cvtt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
      (__v8si)_mm256_undefined_si256(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
      (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
      (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu32(__m512d __A)
{
  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A,
      (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A,
      (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A)
{
  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A,
      (__v8si)_mm256_setzero_si256(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_roundscale_round_sd(A, B, imm, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sd(A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sd(U, A, B, I) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_roundscale_round_ss(A, B, imm, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_ss(A, B, imm) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_ss(W, U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_ss(U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8df)_mm512_undefined_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_scalef_pd(__m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A,
      (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A,
      (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A,
      (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_scalef_ps(__m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A,
      (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A,
      (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A,
      (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_scalef_sd(__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B)
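/* VSCALEFSD computes __A[0] * 2^floor(__B[0]) in the low element and copies
   the upper element from __A; the maskz form below zeroes the low element
   when the mask bit is clear. A minimal sketch (illustrative values only):

     __m128d x = _mm_set_sd(3.0), e = _mm_set_sd(4.0);
     __m128d r = _mm_scalef_sd(x, e);   // low element: 3.0 * 2^4 = 48.0
*/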
{
  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_scalef_ss(__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}

#define _mm512_shuffle_f32x4(A, B, imm) \
  ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (int)(imm)))

#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
      (__v16sf)(__m512)(W)))

#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
      (__v16sf)_mm512_setzero_ps()))

#define _mm512_shuffle_f64x2(A, B, imm) \
  ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (int)(imm)))

#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
      (__v8df)(__m512d)(W)))

#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
      (__v8df)_mm512_setzero_pd()))

#define _mm512_shuffle_i32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
      (__v16si)(__m512i)(B), (int)(imm)))

#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
      (__v16si)(__m512i)(W)))

#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
      (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
      (__v16si)_mm512_setzero_si512()))

#define _mm512_shuffle_i64x2(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
      (__v8di)(__m512i)(B), (int)(imm)))

#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
      (__v8di)(__m512i)(W)))

#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
      (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
      (__v8di)_mm512_setzero_si512()))

#define _mm512_shuffle_pd(A, B, M) \
  ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
      (__v8df)(__m512d)(B), (int)(M)))

#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), (__v8df)(__m512d)(W)))

#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
      (__v8df)_mm512_shuffle_pd((A), (B), (M)), (__v8df)_mm512_setzero_pd()))

#define _mm512_shuffle_ps(A, B, M) \
  ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
      (__v16sf)(__m512)(B), (int)(M)))

#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), (__v16sf)(__m512)(W)))

#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), (__v16sf)_mm512_setzero_ps()))

#define _mm_sqrt_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)__A,
      (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm_sqrt_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)__A,
      (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcast_f32x4(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
      (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
      (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcast_f64x4(__m256d __A)
{
  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
      0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
      (__v8df)_mm512_broadcast_f64x4(__A), (__v8df)__O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
      (__v8df)_mm512_broadcast_f64x4(__A), (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i32x4(__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
      (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
      (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i64x4(__m256i __A)
{
  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
      0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
      (__v8di)_mm512_broadcast_i64x4(__A), (__v8di)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
      (__v8di)_mm512_broadcast_i64x4(__A), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
      (__v8df)_mm512_broadcastsd_pd(__A), (__v8df)__O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
      (__v8df)_mm512_broadcastsd_pd(__A), (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
      (__v16sf)_mm512_broadcastss_ps(__A), (__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
      (__v16sf)_mm512_broadcastss_ps(__A), (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi8(__m512i __A)
{
  return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A,
      (__v16qi)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A,
      (__v16qi)__O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A)
{
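  /* Signed saturating down-conversion: each 32-bit element is clamped to
     the epi8 range [-128, 127] and narrowed; with zero-masking, elements
     whose bit in __M is clear become 0. A minimal sketch (illustrative
     values only):

       __m512i v = _mm512_set1_epi32(1000);
       __m128i b = _mm512_cvtsepi32_epi8(v);   // every byte saturates to 127
  */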
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) _mm_undefined_si128 (),
               (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
               (__v16qi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) _mm256_undefined_si256 (),
               (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
               (__v16hi) _mm256_setzero_si256 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) _mm_undefined_si128 (),
               (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
               (__v16qi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) _mm256_undefined_si256 (),
               (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
               (__v8si) _mm256_setzero_si256 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) _mm_undefined_si128 (),
               (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
               (__v8hi) _mm_setzero_si128 (),
               __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

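/* Saturating unsigned down-conversions (VPMOVUS*): each element is
 * interpreted as unsigned and clamped to the maximum of the narrower type,
 * e.g. a 32-bit 300 becomes the 8-bit maximum 255. */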
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
                (__v16qi) _mm_undefined_si128 (),
                (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
                (__v16qi) __O,
                __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
                (__v16qi) _mm_setzero_si128 (),
                __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
                (__v16hi) _mm256_undefined_si256 (),
                (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
                (__v16hi) __O,
                __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
                (__v16hi) _mm256_setzero_si256 (),
                __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
                (__v16qi) _mm_undefined_si128 (),
                (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
                (__v16qi) __O,
                __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
                (__v16qi) _mm_setzero_si128 (),
                __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
                (__v8si) _mm256_undefined_si256 (),
                (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
                (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
                (__v8si) _mm256_setzero_si256 (),
                __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
                (__v8hi) _mm_undefined_si128 (),
                (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
                (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
                (__v8hi) _mm_setzero_si128 (),
                __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
}

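/* Truncating down-conversions (VPMOV*): the upper bits of each element are
 * simply discarded, so a 32-bit 0x12345678 becomes the 8-bit 0x78. */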
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_undefined_si256 (),
              (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_setzero_si256 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_undefined_si256 (),
              (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_setzero_si256 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_setzero_si128 (),
              __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

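/* 128/256-bit lane extract and insert. The immediate selects the lane and
 * must be a compile-time constant: 0..3 for 128-bit lanes, 0..1 for 256-bit
 * halves. For example, _mm512_extracti64x4_epi64(v, 1) returns the upper
 * 256 bits of v. */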
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_undefined_si128(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)(__m128i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_setzero_si128(), \
                                             (__mmask8)(U)))

#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_undefined_si256(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)(__m256i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_setzero_si256(), \
                                             (__mmask8)(U)))

#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
                                       (__v4df)(__m256d)(B), (int)(imm)))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
                                       (__v4di)(__m256i)(B), (int)(imm)))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                       (__v8di)_mm512_setzero_si512()))

#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
                                      (__v4sf)(__m128)(B), (int)(imm)))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                       (__v16sf)_mm512_setzero_ps()))

#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
                                       (__v4si)(__m128i)(B), (int)(imm)))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                       (__v16si)_mm512_setzero_si512()))

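/* getmant: extract the normalized mantissa of each element. B selects the
 * normalization interval and C the sign control (the _MM_MANT_NORM_* and
 * _MM_MANT_SIGN_* constants defined earlier in this header); the builtin
 * packs them as ((C << 2) | B). */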
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_undefined_pd(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_getmant_pd(A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)-1, \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

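/* getexp: extract the unbiased exponent of each element as a floating-point
 * value; for finite nonzero x this is floor(log2(|x|)). */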
#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_getexp_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_undefined_pd (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_getexp_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) _mm512_undefined_ps (),
                (__mmask16) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) __W,
                (__mmask16) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) __U,
                _MM_FROUND_CUR_DIRECTION);
}

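/* Gathers: each element i is loaded from (char const *)addr + index[i]*scale.
 * scale must be the compile-time constant 1, 2, 4 or 8, and the individual
 * loads have no alignment requirement. In the masked forms, elements whose
 * mask bit is clear are copied from v1_old instead. Illustrative sketch
 * (base is an assumed float array):
 *   __m512i idx = _mm512_set1_epi32(0);
 *   __m512  v   = _mm512_i32gather_ps(idx, base, 4); // 16 copies of base[0]
 */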
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

/* Note: the index casts below originally went through (__m512), the float
 * vector type; they are corrected here to (__m512i) to match the sibling
 * gather macros. */
#define _mm512_i32gather_ps(index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

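/* Scatters: store elements to addr + index*scale, subject to the mask in
 * the masked forms; scale must again be 1, 2, 4 or 8. When two indices
 * coincide, the stores happen in element order, so the highest-numbered
 * element wins. */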
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

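/* Masked scalar FMA: only bit 0 of the mask is consulted. Element 0 of the
 * result is the fused operation (or the fallback value when the mask bit is
 * clear); the upper elements pass through from the first vector operand. */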
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))

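/* Permutes. The permutex_* macros take a compile-time immediate that
 * shuffles 64-bit elements within each 256-bit lane; the permutexvar_*
 * functions take a per-element index vector and select across the full
 * register. Note the argument order of permutexvar: the index vector __X
 * comes first, the data vector __Y second. */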
#define _mm512_permutex_pd(X, C) \
  ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permutex_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_permutex_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_permutex_epi64(X, C) \
  ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))

#define _mm512_mask_permutex_epi64(W, U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_permutex_epi64(U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)_mm512_setzero_si512()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
                               __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
}

#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
                               __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                             (__v16si)__W);
}

#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32

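/* 16-bit mask-register operations. kortestc returns 1 when the OR of the
 * two masks is all ones; kortestz returns 1 when it is all zeros. A small
 * illustrative use:
 *   __mmask16 m = _cvtu32_mask16(0xFF00);
 *   if (_kortestz_mask16_u8(m, m)) ...  // taken only if m == 0
 */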
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kandn (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestc (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestz (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
{
  return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
{
  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
  *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxnor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
}

#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor

#define _kshiftli_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))

#define _kshiftri_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask16_u32(__mmask16 __A) {
  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_cvtu32_mask16(unsigned int __A) {
  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_load_mask16(__mmask16 *__A) {
  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
}

static __inline__ void __DEFAULT_FN_ATTRS
_store_mask16(__mmask16 *__A, __mmask16 __B) {
  *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
}

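/* Non-temporal (streaming) loads and stores: these bypass the cache
 * hierarchy where possible, and the pointer must be 64-byte aligned. */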
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_si512 (void * __P, __m512i __A)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_stream_load_si512 (void const *__P)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_pd (void *__P, __m512d __A)
{
  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_ps (void *__P, __m512 __A)
{
  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
}

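/* compress packs the elements selected by __U contiguously into the low
 * part of the result; the remaining elements come from __W or are zeroed. */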
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                  (__v8df) __W,
                  (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                  (__v8df) _mm512_setzero_pd (),
                  (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                  (__v8di) __W,
                  (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                  (__v8di) _mm512_setzero_si512 (),
                  (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                  (__v16sf) __W,
                  (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                  (__v16sf) _mm512_setzero_ps (),
                  (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                  (__v16si) __W,
                  (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                  (__v16si) _mm512_setzero_si512 (),
                  (__mmask16) __U);
}

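/* Scalar compares: P is a _CMP_* predicate. Only the low elements are
 * compared; the result occupies bit 0 of the returned mask. */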
#define _mm_cmp_round_ss_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_ss_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)-1, \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)(M), \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_cmp_round_sd_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sd_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)-1, \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)(M), \
                                       _MM_FROUND_CUR_DIRECTION))

/* Bit Test */

static __inline __mmask16 __DEFAULT_FN_ATTRS512
_mm512_test_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
                                   _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}

static __inline __mmask8 __DEFAULT_FN_ATTRS512
_mm512_test_epi64_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
                                   _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
                                  _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                       _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
                                  _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
                                       _mm512_setzero_si512());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_movehdup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_moveldup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}


static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
                                     _mm_setzero_ps());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
                                     _mm_setzero_pd());
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
{
  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
{
  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
{
  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
                                                (__v4sf)_mm_setzero_ps(),
                                                0, 4, 4, 4);

  return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_load_ss (__mmask8 __U, const float* __A)
{
  return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
                                                (__v4sf) _mm_setzero_ps(),
                                                __U & 1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
{
  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
                                                 (__v2df)_mm_setzero_pd(),
                                                 0, 2);

  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_load_sd (__mmask8 __U, const double* __A)
{
  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
                                                  (__v2df) _mm_setzero_pd(),
                                                  __U & 1);
}

#define _mm512_shuffle_epi32(A, I) \
  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))

#define _mm512_mask_shuffle_epi32(W, U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_shuffle_epi32(U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)_mm512_setzero_si512()))
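
/* Usage sketch (illustrative): the immediate chooses, independently within
 * each 128-bit lane, which of the lane's four dwords feeds each result
 * position; the _MM_PERM_* enumerators name the selection. _MM_PERM_AAAA
 * (0x00) broadcasts dword 0 of every lane:
 *
 *   __m512i v = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
 *                                 8, 9, 10, 11, 12, 13, 14, 15);
 *   __m512i b = _mm512_shuffle_epi32(v, _MM_PERM_AAAA);
 *   // b = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 }
 */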

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
                                                    (__v8df) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
                                                    (__v8df) _mm512_setzero_pd (),
                                                    (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
                                                    (__v8di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
                                                    (__v8di) _mm512_setzero_si512 (),
                                                    (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
                                                        (__v8df) __W,
                                                        (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
                                                        (__v8df) _mm512_setzero_pd(),
                                                        (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
                                                        (__v8di) __W,
                                                        (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
                                                        (__v8di) _mm512_setzero_si512(),
                                                        (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                                                       (__v16sf) __W,
                                                       (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                                                       (__v16sf) _mm512_setzero_ps(),
                                                       (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
                                                        (__v16si) __W,
                                                        (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
                                                        (__v16si) _mm512_setzero_si512(),
                                                        (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
                                                   (__v16sf) __W,
                                                   (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
                                                   (__v16sf) _mm512_setzero_ps(),
                                                   (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                                                    (__v16si) __W,
                                                    (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                                                    (__v16si) _mm512_setzero_si512(),
                                                    (__mmask16) __U);
}
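
/* Usage sketch (illustrative): expand is the inverse of compress. An
 * expanding load reads popcount(mask) consecutive elements from memory and
 * distributes them, in order, into the lanes whose mask bit is set:
 *
 *   int buf[2] = { 100, 200 };
 *   __m512i e = _mm512_maskz_expandloadu_epi32((__mmask16)0x0011, buf);
 *   // Lane 0 receives 100, lane 4 receives 200, all other lanes are zero;
 *   // only two elements are read from memory.
 */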

#define _mm512_cvt_roundps_pd(A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtps_pd (__m256 __A)
{
  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtpslo_pd (__m512 __A)
{
  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
{
  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) __W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) _mm512_setzero_pd ());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) __W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) _mm512_setzero_ps ());
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
                                          (__mmask16) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
                                          (__mmask16) __U);
}
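
/* Usage sketch (illustrative): a compressing store writes only the selected
 * elements, contiguously, starting at the destination pointer; combined with
 * a compare mask this is a one-step stream-compaction filter. _CMP_GE_OQ is
 * defined with the other comparison predicates in the AVX headers.
 *
 *   __m512 v = _mm512_setr_ps(-1, 2, -3, 4, -5, 6, -7, 8,
 *                             -9, 10, -11, 12, -13, 14, -15, 16);
 *   float out[16];
 *   __mmask16 keep = _mm512_cmp_ps_mask(v, _mm512_setzero_ps(), _CMP_GE_OQ);
 *   _mm512_mask_compressstoreu_ps(out, keep, v);
 *   // out[0..7] = { 2, 4, 6, 8, 10, 12, 14, 16 }; out[8..15] not written.
 */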

#define _mm_cvt_roundsd_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_undefined_ps(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)__W,
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)_mm_setzero_ps(),
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvtss_i32 _mm_cvtss_si32
#define _mm_cvtsd_i32 _mm_cvtsd_si32
#define _mm_cvti32_sd _mm_cvtsi32_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
#endif

#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))

#define _mm_cvt_roundsi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))
#endif

#define _mm_cvt_roundsi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))

#define _mm_cvt_roundi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))
#endif

#define _mm_cvt_roundss_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_undefined_pd(), \
                                               (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)__W,
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
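
/* Usage sketch (illustrative): the _mm_cvt_round* forms take an explicit
 * rounding mode instead of using the current MXCSR setting; a static mode
 * must be combined with _MM_FROUND_NO_EXC, which is defined alongside the
 * other _MM_FROUND_* macros elsewhere in the intrinsics headers.
 *
 *   __m128d d = _mm_set_sd(2.7);
 *   __m128 f = _mm_cvt_roundsd_ss(_mm_setzero_ps(), d,
 *                                 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 *   // f[0] is 2.7 rounded toward zero in the double-to-float conversion.
 */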

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)_mm_setzero_pd(),
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu32_sd (__m128d __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
                                       (unsigned long long)(B), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

#define _mm_cvt_roundu32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
                                      (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu32_ss (__m128 __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
                                      (unsigned long long)(B), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_selectd_512(__M,
                                              (__v16si) _mm512_set1_epi32(__A),
                                              (__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
  return (__m512i) __builtin_ia32_selectq_512(__M,
                                              (__v8di) _mm512_set1_epi64(__A),
                                              (__v8di) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
    char __e4, char __e3, char __e2, char __e1, char __e0) {
  return __extension__ (__m512i)(__v64qi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
}
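
/* Usage sketch (illustrative): the unsigned-conversion forms interpret the
 * integer operand as unsigned, which matters for values above INT_MAX:
 *
 *   __m128 f = _mm_cvtu32_ss(_mm_setzero_ps(), 4000000000u);
 *   // f[0] is approximately 4.0e9f; the signed _mm_cvtsi32_ss would have
 *   // interpreted the same bits as -294967296.
 */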

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
    short __e27, short __e26, short __e25, short __e24, short __e23,
    short __e22, short __e21, short __e20, short __e19, short __e18,
    short __e17, short __e16, short __e15, short __e14, short __e13,
    short __e12, short __e11, short __e10, short __e9, short __e8,
    short __e7, short __e6, short __e5, short __e4, short __e3,
    short __e2, short __e1, short __e0) {
  return __extension__ (__m512i)(__v32hi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H,
                  int __I, int __J, int __K, int __L,
                  int __M, int __N, int __O, int __P)
{
  return __extension__ (__m512i)(__v16si)
    { __P, __O, __N, __M, __L, __K, __J, __I,
      __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
                          e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
                   (e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi64 (long long __A, long long __B, long long __C,
                  long long __D, long long __E, long long __F,
                  long long __G, long long __H)
{
  return __extension__ (__m512i) (__v8di)
    { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_set_pd (double __A, double __B, double __C, double __D,
               double __E, double __F, double __G, double __H)
{
  return __extension__ (__m512d)
    { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H,
               float __I, float __J, float __K, float __L,
               float __M, float __N, float __O, float __P)
{
  return __extension__ (__m512)
    { __P, __O, __N, __M, __L, __K, __J, __I,
      __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
                (e4),(e3),(e2),(e1),(e0))
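
/* Usage sketch (illustrative): the _mm512_set_* intrinsics take arguments
 * from the highest element down to element 0, so the last argument lands in
 * lane 0; the _mm512_setr_* forms take them in lane (memory) order:
 *
 *   __m512i a = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
 *                                7, 6, 5, 4, 3, 2, 1, 0);
 *   __m512i b = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
 *                                 8, 9, 10, 11, 12, 13, 14, 15);
 *   // a and b are identical: lane i holds the value i in both.
 */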

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_abs_ps(__m512 __A)
{
  /* |x| is computed by clearing the sign bit with an integer AND mask. */
  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K,
                                       _mm512_set1_epi32(0x7FFFFFFF),
                                       (__m512i)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_abs_pd(__m512d __A)
{
  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
                                   (__v8di)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
{
  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K,
                                        _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
                                        (__v8di)__A);
}

/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars
 * as outputs. This class of vector operation forms the basis of many
 * scientific computations. In vector-reduction arithmetic, the evaluation
 * order is independent of the order of the input elements.
 *
 * For floating-point intrinsics:
 * 1. When using fadd/fmul intrinsics, the order of operations within the
 *    vector is unspecified (associative math).
 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
 *    produce unspecified results.
 *
 * A bisection method is used: at each step the vector from the previous step
 * is split in half and the operation is applied to the two halves, taking
 * log2(n) steps for a vector of n elements.
 */

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
  return __builtin_reduce_add((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
  return __builtin_reduce_mul((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
  return __builtin_reduce_and((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
  return __builtin_reduce_or((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_add((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
  return __builtin_reduce_mul((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
  return __builtin_reduce_and((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_or((__v8di)__W);
}
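
/* Usage sketch (illustrative): the masked reductions first replace inactive
 * lanes with the operation's identity element (0 for add/or, 1 for mul,
 * all-ones for and) and then reduce all eight lanes:
 *
 *   long long s =
 *       _mm512_mask_reduce_add_epi64((__mmask8)0x0F, _mm512_set1_epi64(5));
 *   // s == 20: only the four active lanes contribute.
 */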

// -0.0 is used to ignore the start value since it is the neutral value of
// floating-point addition. For more information, please refer to
// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_maskz_mov_pd(__M, __W);
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
  return __builtin_reduce_add((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_epi32(__m512i __W) {
  return __builtin_reduce_mul((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_and_epi32(__m512i __W) {
  return __builtin_reduce_and((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_or_epi32(__m512i __W) {
  return __builtin_reduce_or((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  return __builtin_reduce_add((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
  return __builtin_reduce_mul((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
  return __builtin_reduce_and((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  return __builtin_reduce_or((__v16si)__W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ps(__m512 __W) {
  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ps(__m512 __W) {
  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_maskz_mov_ps(__M, __W);
  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi64(__m512i __V) {
  return __builtin_reduce_max((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu64(__m512i __V) {
  return __builtin_reduce_max((__v8du)__V);
}
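
/* Usage sketch (illustrative): because -0.0 and 1.0 are the identity values
 * of fadd and fmul, a masked reduction over an empty mask simply returns the
 * identity:
 *
 *   double p = _mm512_mask_reduce_mul_pd((__mmask8)0, _mm512_set1_pd(3.0));
 *   // p == 1.0: every lane was replaced by the identity before reducing.
 */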

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi64(__m512i __V) {
  return __builtin_reduce_min((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu64(__m512i __V) {
  return __builtin_reduce_min((__v8du)__V);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL),
                              __M, __V);
  return __builtin_reduce_max((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi64(__M, __V);
  return __builtin_reduce_max((__v8du)__V);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
  return __builtin_reduce_min((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
  return __builtin_reduce_min((__v8du)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi32(__m512i __V) {
  return __builtin_reduce_max((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu32(__m512i __V) {
  return __builtin_reduce_max((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi32(__m512i __V) {
  return __builtin_reduce_min((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu32(__m512i __V) {
  return __builtin_reduce_min((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
  return __builtin_reduce_max((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi32(__M, __V);
  return __builtin_reduce_max((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
  return __builtin_reduce_min((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
  return __builtin_reduce_min((__v16su)__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
  return __builtin_ia32_reduce_fmax_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
  return __builtin_ia32_reduce_fmin_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
  return __builtin_ia32_reduce_fmax_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
  return __builtin_ia32_reduce_fmin_pd512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
  return __builtin_ia32_reduce_fmax_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
  return __builtin_ia32_reduce_fmin_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
  return __builtin_ia32_reduce_fmax_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
  return __builtin_ia32_reduce_fmin_ps512(__V);
}

/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __A
///    A vector of [16 x i32]. The least significant 32 bits are moved to the
///    destination.
/// \returns A 32-bit signed integer containing the moved value.
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_cvtsi512_si32(__m512i __A) {
  __v16si __b = (__v16si)__A;
  return __b[0];
}

/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in the lower half of \a vindex scaled by \a scale, and
/// stores them in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
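
/* Usage sketch (illustrative, assuming table holds valid data): gathers
 * eight doubles through the low eight 32-bit indices of the index vector;
 * with sizeof(double) == 8 as the scale, the indices are element offsets:
 *
 *   double table[256];
 *   __m512i idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
 *                                   0, 0, 0, 0, 0, 0, 0, 0);
 *   __m512d g = _mm512_i32logather_pd(idx, table, 8);
 *   // g holds table[0], table[2], ..., table[14].
 */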

/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale into dst using writemask
/// \a mask (elements are copied from \a src when the corresponding mask bit is
/// not set).
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
  _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
                           (base_addr), (scale))

/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))

/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
/// are copied from \a src when the corresponding mask bit is not set).
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
  _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
                              (base_addr), (scale))

/// Stores 8 packed double-precision (64-bit) floating-point elements from
/// \a v1 to memory locations starting at location \a base_addr at packed
/// 32-bit integer indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
  _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
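
/* Usage sketch (illustrative): the scatter counterpart writes each source
 * element to base_addr + SignExtend(index) * scale:
 *
 *   double out[8];
 *   __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
 *                                   0, 0, 0, 0, 0, 0, 0, 0);
 *   _mm512_i32loscatter_pd(out, idx, _mm512_set1_pd(1.5), 8);
 *   // out[0..7] are all 1.5.
 */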

/// Stores 8 packed double-precision (64-bit) floating-point elements from
/// \a v1 to memory locations starting at location \a base_addr at packed
/// 32-bit integer indices stored in \a vindex scaled by \a scale. Only those
/// elements whose corresponding mask bit is set in writemask \a mask are
/// written to memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
  _mm512_mask_i32scatter_pd((base_addr), (mask), \
                            _mm512_castsi512_si256(vindex), (v1), (scale))

/// Stores 8 packed 64-bit integer elements from \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
  _mm512_i32scatter_epi64((base_addr), \
                          _mm512_castsi512_si256(vindex), (v1), (scale))

/// Stores 8 packed 64-bit integer elements from \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in \a vindex scaled by \a scale, using writemask \a mask (elements whose
/// corresponding mask bit is not set are not written to memory).
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
  _mm512_mask_i32scatter_epi64((base_addr), (mask), \
                               _mm512_castsi512_si256(vindex), (v1), (scale))

#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS

#endif /* __AVX512FINTRIN_H */