1 /*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error \ 11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifdef __SSE2__ 15 16 #ifndef __AVX512VLFP16INTRIN_H 17 #define __AVX512VLFP16INTRIN_H 18 19 /* Define the default attributes for the functions in this file. */ 20 #define __DEFAULT_FN_ATTRS256 \ 21 __attribute__((__always_inline__, __nodebug__, \ 22 __target__("avx512fp16, avx512vl"), \ 23 __min_vector_width__(256))) 24 #define __DEFAULT_FN_ATTRS128 \ 25 __attribute__((__always_inline__, __nodebug__, \ 26 __target__("avx512fp16, avx512vl"), \ 27 __min_vector_width__(128))) 28 29 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { 30 return __a[0]; 31 } 32 33 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { 34 return __a[0]; 35 } 36 37 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { 38 return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; 39 } 40 41 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { 42 return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; 43 } 44 45 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { 46 return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, 47 __h, __h, __h, __h, __h, __h, __h, __h}; 48 } 49 50 static __inline __m128h __DEFAULT_FN_ATTRS128 51 _mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 52 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { 53 return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; 54 } 55 56 static __inline __m256h __DEFAULT_FN_ATTRS256 57 _mm256_set1_pch(_Float16 _Complex h) { 58 return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); 59 } 60 61 static __inline __m128h __DEFAULT_FN_ATTRS128 62 _mm_set1_pch(_Float16 _Complex h) { 63 return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); 64 } 65 66 static __inline __m256h __DEFAULT_FN_ATTRS256 67 _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 68 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, 69 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, 70 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { 71 return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11, 72 __h10, __h9, __h8, __h7, __h6, __h5, 73 __h4, __h3, __h2, __h1}; 74 } 75 76 #define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \ 77 _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 78 79 #define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ 80 h14, h15, h16) \ 81 _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ 82 (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 83 84 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, 85 __m256h __B) { 86 return (__m256h)((__v16hf)__A + (__v16hf)__B); 87 } 88 89 static __inline__ __m256h __DEFAULT_FN_ATTRS256 90 _mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 91 return (__m256h)__builtin_ia32_selectph_256( 92 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); 93 } 94 95 static __inline__ __m256h __DEFAULT_FN_ATTRS256 96 _mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { 97 return (__m256h)__builtin_ia32_selectph_256( 98 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 99 } 100 101 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, 102 __m128h __B) { 103 return (__m128h)((__v8hf)__A + (__v8hf)__B); 104 } 105 106 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, 107 __mmask8 __U, 108 __m128h __A, 109 __m128h __B) { 110 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 111 (__v8hf)__W); 112 } 113 114 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, 115 __m128h __A, 116 __m128h __B) { 117 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 118 (__v8hf)_mm_setzero_ph()); 119 } 120 121 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, 122 __m256h __B) { 123 return (__m256h)((__v16hf)__A - (__v16hf)__B); 124 } 125 126 static __inline__ __m256h __DEFAULT_FN_ATTRS256 127 _mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 128 return (__m256h)__builtin_ia32_selectph_256( 129 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); 130 } 131 132 static __inline__ __m256h __DEFAULT_FN_ATTRS256 133 _mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { 134 return (__m256h)__builtin_ia32_selectph_256( 135 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 136 } 137 138 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, 139 __m128h __B) { 140 return (__m128h)((__v8hf)__A - (__v8hf)__B); 141 } 142 143 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, 144 __mmask8 __U, 145 __m128h __A, 146 __m128h __B) { 147 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 148 (__v8hf)__W); 149 } 150 151 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, 152 __m128h __A, 153 __m128h __B) { 154 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 155 (__v8hf)_mm_setzero_ph()); 156 } 157 158 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, 159 __m256h __B) { 160 return (__m256h)((__v16hf)__A * (__v16hf)__B); 161 } 162 163 static __inline__ __m256h __DEFAULT_FN_ATTRS256 164 _mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 165 return (__m256h)__builtin_ia32_selectph_256( 166 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); 167 } 168 169 static __inline__ __m256h __DEFAULT_FN_ATTRS256 170 _mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { 171 return (__m256h)__builtin_ia32_selectph_256( 172 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 173 } 174 175 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, 176 __m128h __B) { 177 return (__m128h)((__v8hf)__A * (__v8hf)__B); 178 } 179 180 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, 181 __mmask8 __U, 182 __m128h __A, 183 __m128h __B) { 184 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 185 (__v8hf)__W); 186 } 187 188 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, 189 __m128h __A, 190 __m128h __B) { 191 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 192 (__v8hf)_mm_setzero_ph()); 193 } 194 195 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, 196 __m256h __B) { 197 return (__m256h)((__v16hf)__A / (__v16hf)__B); 198 } 199 200 static __inline__ __m256h __DEFAULT_FN_ATTRS256 201 _mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 202 return (__m256h)__builtin_ia32_selectph_256( 203 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); 204 } 205 206 static __inline__ __m256h __DEFAULT_FN_ATTRS256 207 _mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { 208 return (__m256h)__builtin_ia32_selectph_256( 209 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 210 } 211 212 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, 213 __m128h __B) { 214 return (__m128h)((__v8hf)__A / (__v8hf)__B); 215 } 216 217 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, 218 __mmask8 __U, 219 __m128h __A, 220 __m128h __B) { 221 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 222 (__v8hf)__W); 223 } 224 225 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, 226 __m128h __A, 227 __m128h __B) { 228 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 229 (__v8hf)_mm_setzero_ph()); 230 } 231 232 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, 233 __m256h __B) { 234 return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); 235 } 236 237 static __inline__ __m256h __DEFAULT_FN_ATTRS256 238 _mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 239 return (__m256h)__builtin_ia32_selectph_256( 240 (__mmask16)__U, 241 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 242 (__v16hf)__W); 243 } 244 245 static __inline__ __m256h __DEFAULT_FN_ATTRS256 246 _mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { 247 return (__m256h)__builtin_ia32_selectph_256( 248 (__mmask16)__U, 249 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 250 (__v16hf)_mm256_setzero_ph()); 251 } 252 253 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, 254 __m128h __B) { 255 return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); 256 } 257 258 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, 259 __mmask8 __U, 260 __m128h __A, 261 __m128h __B) { 262 return (__m128h)__builtin_ia32_selectph_128( 263 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 264 (__v8hf)__W); 265 } 266 267 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, 268 __m128h __A, 269 __m128h __B) { 270 return (__m128h)__builtin_ia32_selectph_128( 271 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 272 (__v8hf)_mm_setzero_ph()); 273 } 274 275 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, 276 __m256h __B) { 277 return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); 278 } 279 280 static __inline__ __m256h __DEFAULT_FN_ATTRS256 281 _mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 282 return (__m256h)__builtin_ia32_selectph_256( 283 (__mmask16)__U, 284 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 285 (__v16hf)__W); 286 } 287 288 static __inline__ __m256h __DEFAULT_FN_ATTRS256 289 _mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { 290 return (__m256h)__builtin_ia32_selectph_256( 291 (__mmask16)__U, 292 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 293 (__v16hf)_mm256_setzero_ph()); 294 } 295 296 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, 297 __m128h __B) { 298 return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); 299 } 300 301 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, 302 __mmask8 __U, 303 __m128h __A, 304 __m128h __B) { 305 return (__m128h)__builtin_ia32_selectph_128( 306 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 307 (__v8hf)__W); 308 } 309 310 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, 311 __m128h __A, 312 __m128h __B) { 313 return (__m128h)__builtin_ia32_selectph_128( 314 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 315 (__v8hf)_mm_setzero_ph()); 316 } 317 318 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { 319 return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); 320 } 321 322 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { 323 return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); 324 } 325 326 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { 327 return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); 328 } 329 330 static __inline__ __m256h __DEFAULT_FN_ATTRS256 331 _mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) { 332 return (__m256h)__builtin_ia32_selectps_256( 333 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); 334 } 335 336 static __inline__ __m256h __DEFAULT_FN_ATTRS256 337 _mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { 338 return (__m256h)__builtin_ia32_selectps_256( 339 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); 340 } 341 342 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { 343 return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); 344 } 345 346 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, 347 __mmask8 __U, 348 __m128h __A) { 349 return (__m128h)__builtin_ia32_selectps_128( 350 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); 351 } 352 353 static __inline__ __m128h __DEFAULT_FN_ATTRS128 354 _mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { 355 return (__m128h)__builtin_ia32_selectps_128( 356 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); 357 } 358 359 #define _mm256_cmp_ph_mask(a, b, p) \ 360 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 361 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) 362 363 #define _mm256_mask_cmp_ph_mask(m, a, b, p) \ 364 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 365 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) 366 367 #define _mm_cmp_ph_mask(a, b, p) \ 368 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 369 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) 370 371 #define _mm_mask_cmp_ph_mask(m, a, b, p) \ 372 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 373 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) 374 375 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { 376 return (__m256h)__builtin_ia32_rcpph256_mask( 377 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 378 } 379 380 static __inline__ __m256h __DEFAULT_FN_ATTRS256 381 _mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 382 return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, 383 (__mmask16)__U); 384 } 385 386 static __inline__ __m256h __DEFAULT_FN_ATTRS256 387 _mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { 388 return (__m256h)__builtin_ia32_rcpph256_mask( 389 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 390 } 391 392 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { 393 return (__m128h)__builtin_ia32_rcpph128_mask( 394 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 395 } 396 397 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, 398 __mmask8 __U, 399 __m128h __A) { 400 return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, 401 (__mmask8)__U); 402 } 403 404 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, 405 __m128h __A) { 406 return (__m128h)__builtin_ia32_rcpph128_mask( 407 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 408 } 409 410 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { 411 return (__m256h)__builtin_ia32_rsqrtph256_mask( 412 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 413 } 414 415 static __inline__ __m256h __DEFAULT_FN_ATTRS256 416 _mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 417 return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, 418 (__mmask16)__U); 419 } 420 421 static __inline__ __m256h __DEFAULT_FN_ATTRS256 422 _mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { 423 return (__m256h)__builtin_ia32_rsqrtph256_mask( 424 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 425 } 426 427 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { 428 return (__m128h)__builtin_ia32_rsqrtph128_mask( 429 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 430 } 431 432 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, 433 __mmask8 __U, 434 __m128h __A) { 435 return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, 436 (__mmask8)__U); 437 } 438 439 static __inline__ __m128h __DEFAULT_FN_ATTRS128 440 _mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { 441 return (__m128h)__builtin_ia32_rsqrtph128_mask( 442 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 443 } 444 445 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { 446 return (__m128h)__builtin_ia32_getexpph128_mask( 447 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 448 } 449 450 static __inline__ __m128h __DEFAULT_FN_ATTRS128 451 _mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { 452 return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, 453 (__mmask8)__U); 454 } 455 456 static __inline__ __m128h __DEFAULT_FN_ATTRS128 457 _mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { 458 return (__m128h)__builtin_ia32_getexpph128_mask( 459 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 460 } 461 462 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { 463 return (__m256h)__builtin_ia32_getexpph256_mask( 464 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 465 } 466 467 static __inline__ __m256h __DEFAULT_FN_ATTRS256 468 _mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 469 return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, 470 (__mmask16)__U); 471 } 472 473 static __inline__ __m256h __DEFAULT_FN_ATTRS256 474 _mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { 475 return (__m256h)__builtin_ia32_getexpph256_mask( 476 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 477 } 478 479 #define _mm_getmant_ph(A, B, C) \ 480 ((__m128h)__builtin_ia32_getmantph128_mask( \ 481 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 482 (__mmask8)-1)) 483 484 #define _mm_mask_getmant_ph(W, U, A, B, C) \ 485 ((__m128h)__builtin_ia32_getmantph128_mask( \ 486 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ 487 (__mmask8)(U))) 488 489 #define _mm_maskz_getmant_ph(U, A, B, C) \ 490 ((__m128h)__builtin_ia32_getmantph128_mask( \ 491 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 492 (__mmask8)(U))) 493 494 #define _mm256_getmant_ph(A, B, C) \ 495 ((__m256h)__builtin_ia32_getmantph256_mask( \ 496 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 497 (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) 498 499 #define _mm256_mask_getmant_ph(W, U, A, B, C) \ 500 ((__m256h)__builtin_ia32_getmantph256_mask( \ 501 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ 502 (__mmask16)(U))) 503 504 #define _mm256_maskz_getmant_ph(U, A, B, C) \ 505 ((__m256h)__builtin_ia32_getmantph256_mask( \ 506 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 507 (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) 508 509 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, 510 __m128h __B) { 511 return (__m128h)__builtin_ia32_scalefph128_mask( 512 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 513 } 514 515 static __inline__ __m128h __DEFAULT_FN_ATTRS128 516 _mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 517 return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, 518 (__v8hf)__W, (__mmask8)__U); 519 } 520 521 static __inline__ __m128h __DEFAULT_FN_ATTRS128 522 _mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { 523 return (__m128h)__builtin_ia32_scalefph128_mask( 524 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 525 } 526 527 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, 528 __m256h __B) { 529 return (__m256h)__builtin_ia32_scalefph256_mask( 530 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 531 } 532 533 static __inline__ __m256h __DEFAULT_FN_ATTRS256 534 _mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 535 return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, 536 (__v16hf)__W, (__mmask16)__U); 537 } 538 539 static __inline__ __m256h __DEFAULT_FN_ATTRS256 540 _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { 541 return (__m256h)__builtin_ia32_scalefph256_mask( 542 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 543 } 544 545 #define _mm_roundscale_ph(A, imm) \ 546 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 547 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 548 (__mmask8)-1)) 549 550 #define _mm_mask_roundscale_ph(W, U, A, imm) \ 551 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 552 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 553 554 #define _mm_maskz_roundscale_ph(U, A, imm) \ 555 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 556 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 557 (__mmask8)(U))) 558 559 #define _mm256_roundscale_ph(A, imm) \ 560 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 561 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 562 (__mmask16)-1)) 563 564 #define _mm256_mask_roundscale_ph(W, U, A, imm) \ 565 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 566 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ 567 (__mmask16)(U))) 568 569 #define _mm256_maskz_roundscale_ph(U, A, imm) \ 570 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 571 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 572 (__mmask16)(U))) 573 574 #define _mm_reduce_ph(A, imm) \ 575 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 576 (__v8hf)_mm_setzero_ph(), \ 577 (__mmask8)-1)) 578 579 #define _mm_mask_reduce_ph(W, U, A, imm) \ 580 ((__m128h)__builtin_ia32_reduceph128_mask( \ 581 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 582 583 #define _mm_maskz_reduce_ph(U, A, imm) \ 584 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 585 (__v8hf)_mm_setzero_ph(), \ 586 (__mmask8)(U))) 587 588 #define _mm256_reduce_ph(A, imm) \ 589 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 590 (__v16hf)_mm256_setzero_ph(), \ 591 (__mmask16)-1)) 592 593 #define _mm256_mask_reduce_ph(W, U, A, imm) \ 594 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 595 (__v16hf)(__m256h)(W), \ 596 (__mmask16)(U))) 597 598 #define _mm256_maskz_reduce_ph(U, A, imm) \ 599 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 600 (__v16hf)_mm256_setzero_ph(), \ 601 (__mmask16)(U))) 602 603 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { 604 return __builtin_ia32_sqrtph((__v8hf)__a); 605 } 606 607 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, 608 __mmask8 __U, 609 __m128h __A) { 610 return (__m128h)__builtin_ia32_selectph_128( 611 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); 612 } 613 614 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, 615 __m128h __A) { 616 return (__m128h)__builtin_ia32_selectph_128( 617 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); 618 } 619 620 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { 621 return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); 622 } 623 624 static __inline__ __m256h __DEFAULT_FN_ATTRS256 625 _mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 626 return (__m256h)__builtin_ia32_selectph_256( 627 (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); 628 } 629 630 static __inline__ __m256h __DEFAULT_FN_ATTRS256 631 _mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { 632 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 633 (__v16hf)_mm256_sqrt_ph(__A), 634 (__v16hf)_mm256_setzero_ph()); 635 } 636 637 #define _mm_mask_fpclass_ph_mask(U, A, imm) \ 638 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 639 (int)(imm), (__mmask8)(U))) 640 641 #define _mm_fpclass_ph_mask(A, imm) \ 642 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 643 (int)(imm), (__mmask8)-1)) 644 645 #define _mm256_mask_fpclass_ph_mask(U, A, imm) \ 646 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 647 (int)(imm), (__mmask16)(U))) 648 649 #define _mm256_fpclass_ph_mask(A, imm) \ 650 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 651 (int)(imm), (__mmask16)-1)) 652 653 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { 654 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 655 (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 656 } 657 658 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, 659 __mmask8 __U, 660 __m128d __A) { 661 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, 662 (__mmask8)__U); 663 } 664 665 static __inline__ __m128h __DEFAULT_FN_ATTRS128 666 _mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { 667 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 668 (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 669 } 670 671 static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { 672 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 673 (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 674 } 675 676 static __inline__ __m128h __DEFAULT_FN_ATTRS256 677 _mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { 678 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, 679 (__mmask8)__U); 680 } 681 682 static __inline__ __m128h __DEFAULT_FN_ATTRS256 683 _mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { 684 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 685 (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 686 } 687 688 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { 689 return (__m128d)__builtin_ia32_vcvtph2pd128_mask( 690 (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); 691 } 692 693 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, 694 __mmask8 __U, 695 __m128h __A) { 696 return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, 697 (__mmask8)__U); 698 } 699 700 static __inline__ __m128d __DEFAULT_FN_ATTRS128 701 _mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 702 return (__m128d)__builtin_ia32_vcvtph2pd128_mask( 703 (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); 704 } 705 706 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { 707 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 708 (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); 709 } 710 711 static __inline__ __m256d __DEFAULT_FN_ATTRS256 712 _mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { 713 return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, 714 (__mmask8)__U); 715 } 716 717 static __inline__ __m256d __DEFAULT_FN_ATTRS256 718 _mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 719 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 720 (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); 721 } 722 723 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { 724 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 725 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 726 } 727 728 static __inline__ __m128i __DEFAULT_FN_ATTRS128 729 _mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 730 return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, 731 (__mmask8)__U); 732 } 733 734 static __inline__ __m128i __DEFAULT_FN_ATTRS128 735 _mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { 736 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 737 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 738 } 739 740 static __inline__ __m256i __DEFAULT_FN_ATTRS256 741 _mm256_cvtph_epi16(__m256h __A) { 742 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 743 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 744 } 745 746 static __inline__ __m256i __DEFAULT_FN_ATTRS256 747 _mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 748 return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, 749 (__mmask16)__U); 750 } 751 752 static __inline__ __m256i __DEFAULT_FN_ATTRS256 753 _mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { 754 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 755 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 756 } 757 758 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { 759 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 760 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 761 } 762 763 static __inline__ __m128i __DEFAULT_FN_ATTRS128 764 _mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 765 return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, 766 (__mmask8)__U); 767 } 768 769 static __inline__ __m128i __DEFAULT_FN_ATTRS128 770 _mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { 771 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 772 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 773 } 774 775 static __inline__ __m256i __DEFAULT_FN_ATTRS256 776 _mm256_cvttph_epi16(__m256h __A) { 777 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 778 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 779 } 780 781 static __inline__ __m256i __DEFAULT_FN_ATTRS256 782 _mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 783 return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, 784 (__mmask16)__U); 785 } 786 787 static __inline__ __m256i __DEFAULT_FN_ATTRS256 788 _mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { 789 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 790 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 791 } 792 793 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { 794 return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); 795 } 796 797 static __inline__ __m128h __DEFAULT_FN_ATTRS128 798 _mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { 799 return (__m128h)__builtin_ia32_selectph_128( 800 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); 801 } 802 803 static __inline__ __m128h __DEFAULT_FN_ATTRS128 804 _mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { 805 return (__m128h)__builtin_ia32_selectph_128( 806 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); 807 } 808 809 static __inline__ __m256h __DEFAULT_FN_ATTRS256 810 _mm256_cvtepi16_ph(__m256i __A) { 811 return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); 812 } 813 814 static __inline__ __m256h __DEFAULT_FN_ATTRS256 815 _mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { 816 return (__m256h)__builtin_ia32_selectph_256( 817 (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); 818 } 819 820 static __inline__ __m256h __DEFAULT_FN_ATTRS256 821 _mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { 822 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 823 (__v16hf)_mm256_cvtepi16_ph(__A), 824 (__v16hf)_mm256_setzero_ph()); 825 } 826 827 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { 828 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 829 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); 830 } 831 832 static __inline__ __m128i __DEFAULT_FN_ATTRS128 833 _mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { 834 return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, 835 (__mmask8)__U); 836 } 837 838 static __inline__ __m128i __DEFAULT_FN_ATTRS128 839 _mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { 840 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 841 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); 842 } 843 844 static __inline__ __m256i __DEFAULT_FN_ATTRS256 845 _mm256_cvtph_epu16(__m256h __A) { 846 return (__m256i)__builtin_ia32_vcvtph2uw256_mask( 847 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); 848 } 849 850 static __inline__ __m256i __DEFAULT_FN_ATTRS256 851 _mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { 852 return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, 853 (__mmask16)__U); 854 } 855 856 static __inline__ __m256i __DEFAULT_FN_ATTRS256 857 _mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { 858 return (__m256i)__builtin_ia32_vcvtph2uw256_mask( 859 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); 860 } 861 862 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { 863 return (__m128i)__builtin_ia32_vcvttph2uw128_mask( 864 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); 865 } 866 867 static __inline__ __m128i __DEFAULT_FN_ATTRS128 868 _mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { 869 return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, 870 (__mmask8)__U); 871 } 872 873 static __inline__ __m128i __DEFAULT_FN_ATTRS128 874 _mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { 875 return (__m128i)__builtin_ia32_vcvttph2uw128_mask( 876 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); 877 } 878 879 static __inline__ __m256i __DEFAULT_FN_ATTRS256 880 _mm256_cvttph_epu16(__m256h __A) { 881 return (__m256i)__builtin_ia32_vcvttph2uw256_mask( 882 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); 883 } 884 885 static __inline__ __m256i __DEFAULT_FN_ATTRS256 886 _mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { 887 return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, 888 (__mmask16)__U); 889 } 890 891 static __inline__ __m256i __DEFAULT_FN_ATTRS256 892 _mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { 893 return (__m256i)__builtin_ia32_vcvttph2uw256_mask( 894 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); 895 } 896 897 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { 898 return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); 899 } 900 901 static __inline__ __m128h __DEFAULT_FN_ATTRS128 902 _mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { 903 return (__m128h)__builtin_ia32_selectph_128( 904 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); 905 } 906 907 static __inline__ __m128h __DEFAULT_FN_ATTRS128 908 _mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { 909 return (__m128h)__builtin_ia32_selectph_128( 910 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); 911 } 912 913 static __inline__ __m256h __DEFAULT_FN_ATTRS256 914 _mm256_cvtepu16_ph(__m256i __A) { 915 return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); 916 } 917 918 static __inline__ __m256h __DEFAULT_FN_ATTRS256 919 _mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { 920 return (__m256h)__builtin_ia32_selectph_256( 921 (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); 922 } 923 924 static __inline__ __m256h __DEFAULT_FN_ATTRS256 925 _mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { 926 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 927 (__v16hf)_mm256_cvtepu16_ph(__A), 928 (__v16hf)_mm256_setzero_ph()); 929 } 930 931 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { 932 return (__m128i)__builtin_ia32_vcvtph2dq128_mask( 933 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); 934 } 935 936 static __inline__ __m128i __DEFAULT_FN_ATTRS128 937 _mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { 938 return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, 939 (__mmask8)__U); 940 } 941 942 static __inline__ __m128i __DEFAULT_FN_ATTRS128 943 _mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { 944 return (__m128i)__builtin_ia32_vcvtph2dq128_mask( 945 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); 946 } 947 948 static __inline__ __m256i __DEFAULT_FN_ATTRS256 949 _mm256_cvtph_epi32(__m128h __A) { 950 return (__m256i)__builtin_ia32_vcvtph2dq256_mask( 951 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); 952 } 953 954 static __inline__ __m256i __DEFAULT_FN_ATTRS256 955 _mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { 956 return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, 957 (__mmask8)__U); 958 } 959 960 static __inline__ __m256i __DEFAULT_FN_ATTRS256 961 _mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { 962 return (__m256i)__builtin_ia32_vcvtph2dq256_mask( 963 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); 964 } 965 966 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { 967 return (__m128i)__builtin_ia32_vcvtph2udq128_mask( 968 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 969 } 970 971 static __inline__ __m128i __DEFAULT_FN_ATTRS128 972 _mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { 973 return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, 974 (__mmask8)__U); 975 } 976 977 static __inline__ __m128i __DEFAULT_FN_ATTRS128 978 _mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { 979 return (__m128i)__builtin_ia32_vcvtph2udq128_mask( 980 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); 981 } 982 983 static __inline__ __m256i __DEFAULT_FN_ATTRS256 984 _mm256_cvtph_epu32(__m128h __A) { 985 return (__m256i)__builtin_ia32_vcvtph2udq256_mask( 986 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); 987 } 988 989 static __inline__ __m256i __DEFAULT_FN_ATTRS256 990 _mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { 991 return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, 992 (__mmask8)__U); 993 } 994 995 static __inline__ __m256i __DEFAULT_FN_ATTRS256 996 _mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { 997 return (__m256i)__builtin_ia32_vcvtph2udq256_mask( 998 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); 999 } 1000 1001 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { 1002 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( 1003 (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1004 } 1005 1006 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1007 _mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1008 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, 1009 (__mmask8)__U); 1010 } 1011 1012 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1013 _mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { 1014 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( 1015 (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1016 } 1017 1018 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1019 _mm256_cvtepi32_ph(__m256i __A) { 1020 return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); 1021 } 1022 1023 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1024 _mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1025 return (__m128h)__builtin_ia32_selectph_128( 1026 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); 1027 } 1028 1029 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1030 _mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { 1031 return (__m128h)__builtin_ia32_selectph_128( 1032 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); 1033 } 1034 1035 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { 1036 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( 1037 (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1038 } 1039 1040 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1041 _mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1042 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, 1043 (__mmask8)__U); 1044 } 1045 1046 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1047 _mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { 1048 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( 1049 (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1050 } 1051 1052 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1053 _mm256_cvtepu32_ph(__m256i __A) { 1054 return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); 1055 } 1056 1057 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1058 _mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1059 return (__m128h)__builtin_ia32_selectph_128( 1060 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); 1061 } 1062 1063 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1064 _mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { 1065 return (__m128h)__builtin_ia32_selectph_128( 1066 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); 1067 } 1068 1069 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { 1070 return (__m128i)__builtin_ia32_vcvttph2dq128_mask( 1071 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); 1072 } 1073 1074 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1075 _mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { 1076 return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, 1077 (__mmask8)__U); 1078 } 1079 1080 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1081 _mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { 1082 return (__m128i)__builtin_ia32_vcvttph2dq128_mask( 1083 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); 1084 } 1085 1086 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1087 _mm256_cvttph_epi32(__m128h __A) { 1088 return (__m256i)__builtin_ia32_vcvttph2dq256_mask( 1089 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); 1090 } 1091 1092 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1093 _mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { 1094 return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, 1095 (__mmask8)__U); 1096 } 1097 1098 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1099 _mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { 1100 return (__m256i)__builtin_ia32_vcvttph2dq256_mask( 1101 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); 1102 } 1103 1104 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { 1105 return (__m128i)__builtin_ia32_vcvttph2udq128_mask( 1106 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 1107 } 1108 1109 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1110 _mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { 1111 return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, 1112 (__mmask8)__U); 1113 } 1114 1115 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1116 _mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { 1117 return (__m128i)__builtin_ia32_vcvttph2udq128_mask( 1118 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); 1119 } 1120 1121 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1122 _mm256_cvttph_epu32(__m128h __A) { 1123 return (__m256i)__builtin_ia32_vcvttph2udq256_mask( 1124 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); 1125 } 1126 1127 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1128 _mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { 1129 return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, 1130 (__mmask8)__U); 1131 } 1132 1133 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1134 _mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { 1135 return (__m256i)__builtin_ia32_vcvttph2udq256_mask( 1136 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); 1137 } 1138 1139 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { 1140 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( 1141 (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1142 } 1143 1144 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1145 _mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1146 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, 1147 (__mmask8)__U); 1148 } 1149 1150 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1151 _mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { 1152 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( 1153 (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1154 } 1155 1156 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1157 _mm256_cvtepi64_ph(__m256i __A) { 1158 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( 1159 (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1160 } 1161 1162 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1163 _mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1164 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, 1165 (__mmask8)__U); 1166 } 1167 1168 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1169 _mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { 1170 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( 1171 (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1172 } 1173 1174 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { 1175 return (__m128i)__builtin_ia32_vcvtph2qq128_mask( 1176 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); 1177 } 1178 1179 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1180 _mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { 1181 return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, 1182 (__mmask8)__U); 1183 } 1184 1185 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1186 _mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 1187 return (__m128i)__builtin_ia32_vcvtph2qq128_mask( 1188 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); 1189 } 1190 1191 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1192 _mm256_cvtph_epi64(__m128h __A) { 1193 return (__m256i)__builtin_ia32_vcvtph2qq256_mask( 1194 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); 1195 } 1196 1197 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1198 _mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { 1199 return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, 1200 (__mmask8)__U); 1201 } 1202 1203 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1204 _mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 1205 return (__m256i)__builtin_ia32_vcvtph2qq256_mask( 1206 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); 1207 } 1208 1209 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { 1210 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( 1211 (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1212 } 1213 1214 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1215 _mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { 1216 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, 1217 (__mmask8)__U); 1218 } 1219 1220 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1221 _mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { 1222 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( 1223 (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1224 } 1225 1226 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1227 _mm256_cvtepu64_ph(__m256i __A) { 1228 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( 1229 (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1230 } 1231 1232 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1233 _mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { 1234 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, 1235 (__mmask8)__U); 1236 } 1237 1238 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1239 _mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { 1240 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( 1241 (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1242 } 1243 1244 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { 1245 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( 1246 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); 1247 } 1248 1249 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1250 _mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { 1251 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, 1252 (__mmask8)__U); 1253 } 1254 1255 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1256 _mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 1257 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( 1258 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); 1259 } 1260 1261 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1262 _mm256_cvtph_epu64(__m128h __A) { 1263 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( 1264 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); 1265 } 1266 1267 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1268 _mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { 1269 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, 1270 (__mmask8)__U); 1271 } 1272 1273 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1274 _mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 1275 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( 1276 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); 1277 } 1278 1279 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { 1280 return (__m128i)__builtin_ia32_vcvttph2qq128_mask( 1281 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); 1282 } 1283 1284 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1285 _mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { 1286 return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, 1287 (__mmask8)__U); 1288 } 1289 1290 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1291 _mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 1292 return (__m128i)__builtin_ia32_vcvttph2qq128_mask( 1293 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); 1294 } 1295 1296 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1297 _mm256_cvttph_epi64(__m128h __A) { 1298 return (__m256i)__builtin_ia32_vcvttph2qq256_mask( 1299 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); 1300 } 1301 1302 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1303 _mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { 1304 return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, 1305 (__mmask8)__U); 1306 } 1307 1308 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1309 _mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 1310 return (__m256i)__builtin_ia32_vcvttph2qq256_mask( 1311 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); 1312 } 1313 1314 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { 1315 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( 1316 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); 1317 } 1318 1319 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1320 _mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { 1321 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, 1322 (__mmask8)__U); 1323 } 1324 1325 static __inline__ __m128i __DEFAULT_FN_ATTRS128 1326 _mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 1327 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( 1328 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); 1329 } 1330 1331 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1332 _mm256_cvttph_epu64(__m128h __A) { 1333 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( 1334 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); 1335 } 1336 1337 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1338 _mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { 1339 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, 1340 (__mmask8)__U); 1341 } 1342 1343 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1344 _mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 1345 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( 1346 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); 1347 } 1348 1349 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { 1350 return (__m128)__builtin_ia32_vcvtph2psx128_mask( 1351 (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); 1352 } 1353 1354 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, 1355 __mmask8 __U, 1356 __m128h __A) { 1357 return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, 1358 (__mmask8)__U); 1359 } 1360 1361 static __inline__ __m128 __DEFAULT_FN_ATTRS128 1362 _mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { 1363 return (__m128)__builtin_ia32_vcvtph2psx128_mask( 1364 (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); 1365 } 1366 1367 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { 1368 return (__m256)__builtin_ia32_vcvtph2psx256_mask( 1369 (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); 1370 } 1371 1372 static __inline__ __m256 __DEFAULT_FN_ATTRS256 1373 _mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) { 1374 return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, 1375 (__mmask8)__U); 1376 } 1377 1378 static __inline__ __m256 __DEFAULT_FN_ATTRS256 1379 _mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { 1380 return (__m256)__builtin_ia32_vcvtph2psx256_mask( 1381 (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); 1382 } 1383 1384 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { 1385 return (__m128h)__builtin_ia32_vcvtps2phx128_mask( 1386 (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1387 } 1388 1389 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, 1390 __mmask8 __U, 1391 __m128 __A) { 1392 return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, 1393 (__mmask8)__U); 1394 } 1395 1396 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1397 _mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { 1398 return (__m128h)__builtin_ia32_vcvtps2phx128_mask( 1399 (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1400 } 1401 1402 static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { 1403 return (__m128h)__builtin_ia32_vcvtps2phx256_mask( 1404 (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 1405 } 1406 1407 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1408 _mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { 1409 return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, 1410 (__mmask8)__U); 1411 } 1412 1413 static __inline__ __m128h __DEFAULT_FN_ATTRS256 1414 _mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { 1415 return (__m128h)__builtin_ia32_vcvtps2phx256_mask( 1416 (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1417 } 1418 1419 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, 1420 __m128h __B, 1421 __m128h __C) { 1422 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, 1423 (__v8hf)__C); 1424 } 1425 1426 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, 1427 __mmask8 __U, 1428 __m128h __B, 1429 __m128h __C) { 1430 return (__m128h)__builtin_ia32_selectph_128( 1431 (__mmask8)__U, 1432 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1433 (__v8hf)__A); 1434 } 1435 1436 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1437 _mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1438 return (__m128h)__builtin_ia32_selectph_128( 1439 (__mmask8)__U, 1440 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1441 (__v8hf)__C); 1442 } 1443 1444 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1445 _mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1446 return (__m128h)__builtin_ia32_selectph_128( 1447 (__mmask8)__U, 1448 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1449 (__v8hf)_mm_setzero_ph()); 1450 } 1451 1452 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, 1453 __m128h __B, 1454 __m128h __C) { 1455 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, 1456 -(__v8hf)__C); 1457 } 1458 1459 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, 1460 __mmask8 __U, 1461 __m128h __B, 1462 __m128h __C) { 1463 return (__m128h)__builtin_ia32_selectph_128( 1464 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1465 (__v8hf)__A); 1466 } 1467 1468 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1469 _mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1470 return (__m128h)__builtin_ia32_selectph_128( 1471 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1472 (__v8hf)_mm_setzero_ph()); 1473 } 1474 1475 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1476 _mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1477 return (__m128h)__builtin_ia32_selectph_128( 1478 (__mmask8)__U, 1479 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1480 (__v8hf)__C); 1481 } 1482 1483 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1484 _mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1485 return (__m128h)__builtin_ia32_selectph_128( 1486 (__mmask8)__U, 1487 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1488 (__v8hf)_mm_setzero_ph()); 1489 } 1490 1491 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1492 _mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1493 return (__m128h)__builtin_ia32_selectph_128( 1494 (__mmask8)__U, 1495 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1496 (__v8hf)_mm_setzero_ph()); 1497 } 1498 1499 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, 1500 __m256h __B, 1501 __m256h __C) { 1502 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, 1503 (__v16hf)__C); 1504 } 1505 1506 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1507 _mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1508 return (__m256h)__builtin_ia32_selectph_256( 1509 (__mmask16)__U, 1510 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1511 (__v16hf)__A); 1512 } 1513 1514 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1515 _mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1516 return (__m256h)__builtin_ia32_selectph_256( 1517 (__mmask16)__U, 1518 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1519 (__v16hf)__C); 1520 } 1521 1522 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1523 _mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1524 return (__m256h)__builtin_ia32_selectph_256( 1525 (__mmask16)__U, 1526 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1527 (__v16hf)_mm256_setzero_ph()); 1528 } 1529 1530 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, 1531 __m256h __B, 1532 __m256h __C) { 1533 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, 1534 -(__v16hf)__C); 1535 } 1536 1537 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1538 _mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1539 return (__m256h)__builtin_ia32_selectph_256( 1540 (__mmask16)__U, 1541 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1542 (__v16hf)__A); 1543 } 1544 1545 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1546 _mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1547 return (__m256h)__builtin_ia32_selectph_256( 1548 (__mmask16)__U, 1549 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1550 (__v16hf)_mm256_setzero_ph()); 1551 } 1552 1553 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1554 _mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1555 return (__m256h)__builtin_ia32_selectph_256( 1556 (__mmask16)__U, 1557 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1558 (__v16hf)__C); 1559 } 1560 1561 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1562 _mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1563 return (__m256h)__builtin_ia32_selectph_256( 1564 (__mmask16)__U, 1565 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1566 (__v16hf)_mm256_setzero_ph()); 1567 } 1568 1569 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1570 _mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1571 return (__m256h)__builtin_ia32_selectph_256( 1572 (__mmask16)__U, 1573 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1574 (__v16hf)_mm256_setzero_ph()); 1575 } 1576 1577 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, 1578 __m128h __B, 1579 __m128h __C) { 1580 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, 1581 (__v8hf)__C); 1582 } 1583 1584 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1585 _mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1586 return (__m128h)__builtin_ia32_selectph_128( 1587 (__mmask8)__U, 1588 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1589 (__v8hf)__A); 1590 } 1591 1592 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1593 _mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1594 return (__m128h)__builtin_ia32_selectph_128( 1595 (__mmask8)__U, 1596 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1597 (__v8hf)__C); 1598 } 1599 1600 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1601 _mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1602 return (__m128h)__builtin_ia32_selectph_128( 1603 (__mmask8)__U, 1604 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), 1605 (__v8hf)_mm_setzero_ph()); 1606 } 1607 1608 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, 1609 __m128h __B, 1610 __m128h __C) { 1611 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, 1612 -(__v8hf)__C); 1613 } 1614 1615 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1616 _mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1617 return (__m128h)__builtin_ia32_selectph_128( 1618 (__mmask8)__U, 1619 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1620 (__v8hf)__A); 1621 } 1622 1623 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1624 _mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1625 return (__m128h)__builtin_ia32_selectph_128( 1626 (__mmask8)__U, 1627 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1628 (__v8hf)_mm_setzero_ph()); 1629 } 1630 1631 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1632 _mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { 1633 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1634 (__v16hf)__C); 1635 } 1636 1637 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1638 _mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1639 return (__m256h)__builtin_ia32_selectph_256( 1640 (__mmask16)__U, 1641 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1642 (__v16hf)__A); 1643 } 1644 1645 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1646 _mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1647 return (__m256h)__builtin_ia32_selectph_256( 1648 (__mmask16)__U, 1649 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1650 (__v16hf)__C); 1651 } 1652 1653 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1654 _mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1655 return (__m256h)__builtin_ia32_selectph_256( 1656 (__mmask16)__U, 1657 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1658 (__v16hf)_mm256_setzero_ph()); 1659 } 1660 1661 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1662 _mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { 1663 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1664 -(__v16hf)__C); 1665 } 1666 1667 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1668 _mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1669 return (__m256h)__builtin_ia32_selectph_256( 1670 (__mmask16)__U, 1671 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1672 (__v16hf)__A); 1673 } 1674 1675 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1676 _mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1677 return (__m256h)__builtin_ia32_selectph_256( 1678 (__mmask16)__U, 1679 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1680 (__v16hf)_mm256_setzero_ph()); 1681 } 1682 1683 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1684 _mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1685 return (__m128h)__builtin_ia32_selectph_128( 1686 (__mmask8)__U, 1687 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1688 (__v8hf)__C); 1689 } 1690 1691 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1692 _mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1693 return (__m256h)__builtin_ia32_selectph_256( 1694 (__mmask16)__U, 1695 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1696 (__v16hf)__C); 1697 } 1698 1699 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1700 _mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1701 return (__m128h)__builtin_ia32_selectph_128( 1702 (__mmask8)__U, 1703 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1704 (__v8hf)__C); 1705 } 1706 1707 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1708 _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1709 return (__m256h)__builtin_ia32_selectph_256( 1710 (__mmask16)__U, 1711 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1712 (__v16hf)__C); 1713 } 1714 1715 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, 1716 __m128h __B, 1717 __m128h __C) { 1718 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1719 (__v8hf)__C); 1720 } 1721 1722 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1723 _mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1724 return (__m128h)__builtin_ia32_selectph_128( 1725 (__mmask8)__U, 1726 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), 1727 (__v8hf)__A); 1728 } 1729 1730 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, 1731 __m256h __B, 1732 __m256h __C) { 1733 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1734 (__v16hf)__C); 1735 } 1736 1737 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1738 _mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1739 return (__m256h)__builtin_ia32_selectph_256( 1740 (__mmask16)__U, 1741 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), 1742 (__v16hf)__A); 1743 } 1744 1745 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, 1746 __m128h __B, 1747 __m128h __C) { 1748 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1749 -(__v8hf)__C); 1750 } 1751 1752 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1753 _mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1754 return (__m128h)__builtin_ia32_selectph_128( 1755 (__mmask8)__U, 1756 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1757 (__v8hf)__A); 1758 } 1759 1760 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1761 _mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1762 return (__m128h)__builtin_ia32_selectph_128( 1763 (__mmask8)__U, 1764 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1765 (__v8hf)__C); 1766 } 1767 1768 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, 1769 __m256h __B, 1770 __m256h __C) { 1771 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1772 -(__v16hf)__C); 1773 } 1774 1775 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1776 _mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1777 return (__m256h)__builtin_ia32_selectph_256( 1778 (__mmask16)__U, 1779 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1780 (__v16hf)__A); 1781 } 1782 1783 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1784 _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1785 return (__m256h)__builtin_ia32_selectph_256( 1786 (__mmask16)__U, 1787 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1788 (__v16hf)__C); 1789 } 1790 1791 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, 1792 __m128h __B) { 1793 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1794 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1795 } 1796 1797 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1798 _mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1799 return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1800 (__v4sf)__W, (__mmask8)__U); 1801 } 1802 1803 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1804 _mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1805 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1806 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1807 } 1808 1809 static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A, 1810 __m256h __B) { 1811 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1812 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1813 } 1814 1815 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1816 _mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1817 return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1818 (__v8sf)__W, (__mmask8)__U); 1819 } 1820 1821 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1822 _mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1823 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1824 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1825 } 1826 1827 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, 1828 __m128h __B, 1829 __m128h __C) { 1830 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1831 (__v4sf)__C, (__mmask8)-1); 1832 } 1833 1834 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1835 _mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1836 return (__m128h)__builtin_ia32_selectps_128( 1837 __U, 1838 __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, 1839 (__v4sf)__C, (__mmask8)__U), 1840 (__v4sf)__A); 1841 } 1842 1843 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1844 _mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1845 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1846 (__v4sf)__C, (__mmask8)__U); 1847 } 1848 1849 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1850 _mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1851 return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( 1852 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); 1853 } 1854 1855 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, 1856 __m256h __B, 1857 __m256h __C) { 1858 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1859 (__v8sf)__C, (__mmask8)-1); 1860 } 1861 1862 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1863 _mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1864 return (__m256h)__builtin_ia32_selectps_256( 1865 __U, 1866 __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1867 (__mmask8)__U), 1868 (__v8sf)__A); 1869 } 1870 1871 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1872 _mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { 1873 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1874 (__v8sf)__C, (__mmask8)__U); 1875 } 1876 1877 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1878 _mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1879 return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( 1880 (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); 1881 } 1882 1883 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, 1884 __m128h __B) { 1885 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1886 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1887 } 1888 1889 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, 1890 __mmask8 __U, 1891 __m128h __A, 1892 __m128h __B) { 1893 return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1894 (__v4sf)__W, (__mmask8)__U); 1895 } 1896 1897 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1898 _mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1899 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1900 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1901 } 1902 1903 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, 1904 __m256h __B) { 1905 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1906 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1907 } 1908 1909 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1910 _mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1911 return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1912 (__v8sf)__W, (__mmask8)__U); 1913 } 1914 1915 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1916 _mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1917 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1918 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1919 } 1920 1921 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, 1922 __m128h __B, 1923 __m128h __C) { 1924 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1925 (__v4sf)__C, (__mmask8)-1); 1926 } 1927 1928 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1929 _mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1930 return (__m128h)__builtin_ia32_selectps_128( 1931 __U, 1932 __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, 1933 (__mmask8)__U), 1934 (__v4sf)__A); 1935 } 1936 1937 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1938 _mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1939 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1940 (__v4sf)__C, (__mmask8)__U); 1941 } 1942 1943 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1944 _mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1945 return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, 1946 (__v4sf)__C, (__mmask8)__U); 1947 } 1948 1949 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, 1950 __m256h __B, 1951 __m256h __C) { 1952 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1953 (__v8sf)__C, (__mmask8)-1); 1954 } 1955 1956 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1957 _mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1958 return (__m256h)__builtin_ia32_selectps_256( 1959 __U, 1960 __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1961 (__mmask8)__U), 1962 (__v8sf)__A); 1963 } 1964 1965 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1966 _mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { 1967 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1968 (__v8sf)__C, (__mmask8)__U); 1969 } 1970 1971 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1972 _mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1973 return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, 1974 (__v8sf)__C, (__mmask8)__U); 1975 } 1976 1977 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, 1978 __m128h __A, 1979 __m128h __W) { 1980 return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, 1981 (__v8hf)__A); 1982 } 1983 1984 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1985 _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { 1986 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, 1987 (__v16hf)__A); 1988 } 1989 1990 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1991 _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { 1992 return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, 1993 (__v8hi)__B); 1994 } 1995 1996 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1997 _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { 1998 return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, 1999 (__v16hi)__B); 2000 } 2001 2002 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2003 _mm_permutexvar_ph(__m128i __A, __m128h __B) { 2004 return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); 2005 } 2006 2007 static __inline__ __m256h __DEFAULT_FN_ATTRS256 2008 _mm256_permutexvar_ph(__m256i __A, __m256h __B) { 2009 return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); 2010 } 2011 2012 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2013 _mm256_reduce_add_ph(__m256h __W) { 2014 return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); 2015 } 2016 2017 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2018 _mm256_reduce_mul_ph(__m256h __W) { 2019 return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); 2020 } 2021 2022 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2023 _mm256_reduce_max_ph(__m256h __V) { 2024 return __builtin_ia32_reduce_fmax_ph256(__V); 2025 } 2026 2027 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2028 _mm256_reduce_min_ph(__m256h __V) { 2029 return __builtin_ia32_reduce_fmin_ph256(__V); 2030 } 2031 2032 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2033 _mm_reduce_add_ph(__m128h __W) { 2034 return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); 2035 } 2036 2037 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2038 _mm_reduce_mul_ph(__m128h __W) { 2039 return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); 2040 } 2041 2042 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2043 _mm_reduce_max_ph(__m128h __V) { 2044 return __builtin_ia32_reduce_fmax_ph128(__V); 2045 } 2046 2047 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2048 _mm_reduce_min_ph(__m128h __V) { 2049 return __builtin_ia32_reduce_fmin_ph128(__V); 2050 } 2051 2052 // intrinsics below are alias for f*mul_*ch 2053 #define _mm_mul_pch(A, B) _mm_fmul_pch(A, B) 2054 #define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B) 2055 #define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B) 2056 #define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B) 2057 #define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B) 2058 #define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B) 2059 2060 #define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B) 2061 #define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B) 2062 #define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B) 2063 #define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B) 2064 #define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B) 2065 #define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B) 2066 2067 #undef __DEFAULT_FN_ATTRS128 2068 #undef __DEFAULT_FN_ATTRS256 2069 2070 #endif 2071 #endif 2072