1 /*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error \ 11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVX512VLFP16INTRIN_H 15 #define __AVX512VLFP16INTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS256 \ 19 __attribute__((__always_inline__, __nodebug__, \ 20 __target__("avx512fp16, avx512vl"), \ 21 __min_vector_width__(256))) 22 #define __DEFAULT_FN_ATTRS128 \ 23 __attribute__((__always_inline__, __nodebug__, \ 24 __target__("avx512fp16, avx512vl"), \ 25 __min_vector_width__(128))) 26 27 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { 28 return __a[0]; 29 } 30 31 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { 32 return __a[0]; 33 } 34 35 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { 36 return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; 37 } 38 39 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { 40 return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; 41 } 42 43 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { 44 return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, 45 __h, __h, __h, __h, __h, __h, __h, __h}; 46 } 47 48 static __inline __m128h __DEFAULT_FN_ATTRS128 49 _mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 50 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { 51 return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; 52 } 53 54 static __inline __m256h 
__DEFAULT_FN_ATTRS256 55 _mm256_set1_pch(_Float16 _Complex h) { 56 return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); 57 } 58 59 static __inline __m128h __DEFAULT_FN_ATTRS128 60 _mm_set1_pch(_Float16 _Complex h) { 61 return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); 62 } 63 64 static __inline __m256h __DEFAULT_FN_ATTRS256 65 _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 66 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, 67 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, 68 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { 69 return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11, 70 __h10, __h9, __h8, __h7, __h6, __h5, 71 __h4, __h3, __h2, __h1}; 72 } 73 74 #define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \ 75 _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 76 77 #define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ 78 h14, h15, h16) \ 79 _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ 80 (h7), (h6), (h5), (h4), (h3), (h2), (h1)) 81 82 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, 83 __m256h __B) { 84 return (__m256h)((__v16hf)__A + (__v16hf)__B); 85 } 86 87 static __inline__ __m256h __DEFAULT_FN_ATTRS256 88 _mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 89 return (__m256h)__builtin_ia32_selectph_256( 90 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); 91 } 92 93 static __inline__ __m256h __DEFAULT_FN_ATTRS256 94 _mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { 95 return (__m256h)__builtin_ia32_selectph_256( 96 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 97 } 98 99 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, 100 __m128h __B) { 101 return (__m128h)((__v8hf)__A + (__v8hf)__B); 102 } 103 104 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, 105 
__mmask8 __U, 106 __m128h __A, 107 __m128h __B) { 108 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 109 (__v8hf)__W); 110 } 111 112 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, 113 __m128h __A, 114 __m128h __B) { 115 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), 116 (__v8hf)_mm_setzero_ph()); 117 } 118 119 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, 120 __m256h __B) { 121 return (__m256h)((__v16hf)__A - (__v16hf)__B); 122 } 123 124 static __inline__ __m256h __DEFAULT_FN_ATTRS256 125 _mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 126 return (__m256h)__builtin_ia32_selectph_256( 127 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); 128 } 129 130 static __inline__ __m256h __DEFAULT_FN_ATTRS256 131 _mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { 132 return (__m256h)__builtin_ia32_selectph_256( 133 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 134 } 135 136 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, 137 __m128h __B) { 138 return (__m128h)((__v8hf)__A - (__v8hf)__B); 139 } 140 141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, 142 __mmask8 __U, 143 __m128h __A, 144 __m128h __B) { 145 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 146 (__v8hf)__W); 147 } 148 149 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, 150 __m128h __A, 151 __m128h __B) { 152 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), 153 (__v8hf)_mm_setzero_ph()); 154 } 155 156 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, 157 __m256h __B) { 158 return (__m256h)((__v16hf)__A * (__v16hf)__B); 159 } 160 161 static __inline__ __m256h __DEFAULT_FN_ATTRS256 162 _mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 163 
return (__m256h)__builtin_ia32_selectph_256( 164 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); 165 } 166 167 static __inline__ __m256h __DEFAULT_FN_ATTRS256 168 _mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { 169 return (__m256h)__builtin_ia32_selectph_256( 170 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 171 } 172 173 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, 174 __m128h __B) { 175 return (__m128h)((__v8hf)__A * (__v8hf)__B); 176 } 177 178 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, 179 __mmask8 __U, 180 __m128h __A, 181 __m128h __B) { 182 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 183 (__v8hf)__W); 184 } 185 186 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, 187 __m128h __A, 188 __m128h __B) { 189 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), 190 (__v8hf)_mm_setzero_ph()); 191 } 192 193 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, 194 __m256h __B) { 195 return (__m256h)((__v16hf)__A / (__v16hf)__B); 196 } 197 198 static __inline__ __m256h __DEFAULT_FN_ATTRS256 199 _mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 200 return (__m256h)__builtin_ia32_selectph_256( 201 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); 202 } 203 204 static __inline__ __m256h __DEFAULT_FN_ATTRS256 205 _mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { 206 return (__m256h)__builtin_ia32_selectph_256( 207 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); 208 } 209 210 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, 211 __m128h __B) { 212 return (__m128h)((__v8hf)__A / (__v8hf)__B); 213 } 214 215 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, 216 __mmask8 __U, 217 __m128h __A, 218 __m128h __B) { 219 return 
(__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 220 (__v8hf)__W); 221 } 222 223 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, 224 __m128h __A, 225 __m128h __B) { 226 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), 227 (__v8hf)_mm_setzero_ph()); 228 } 229 230 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, 231 __m256h __B) { 232 return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); 233 } 234 235 static __inline__ __m256h __DEFAULT_FN_ATTRS256 236 _mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 237 return (__m256h)__builtin_ia32_selectph_256( 238 (__mmask16)__U, 239 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 240 (__v16hf)__W); 241 } 242 243 static __inline__ __m256h __DEFAULT_FN_ATTRS256 244 _mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { 245 return (__m256h)__builtin_ia32_selectph_256( 246 (__mmask16)__U, 247 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), 248 (__v16hf)_mm256_setzero_ph()); 249 } 250 251 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, 252 __m128h __B) { 253 return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); 254 } 255 256 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, 257 __mmask8 __U, 258 __m128h __A, 259 __m128h __B) { 260 return (__m128h)__builtin_ia32_selectph_128( 261 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 262 (__v8hf)__W); 263 } 264 265 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, 266 __m128h __A, 267 __m128h __B) { 268 return (__m128h)__builtin_ia32_selectph_128( 269 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), 270 (__v8hf)_mm_setzero_ph()); 271 } 272 273 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, 274 __m256h __B) { 275 return 
(__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); 276 } 277 278 static __inline__ __m256h __DEFAULT_FN_ATTRS256 279 _mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 280 return (__m256h)__builtin_ia32_selectph_256( 281 (__mmask16)__U, 282 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 283 (__v16hf)__W); 284 } 285 286 static __inline__ __m256h __DEFAULT_FN_ATTRS256 287 _mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { 288 return (__m256h)__builtin_ia32_selectph_256( 289 (__mmask16)__U, 290 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), 291 (__v16hf)_mm256_setzero_ph()); 292 } 293 294 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, 295 __m128h __B) { 296 return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); 297 } 298 299 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, 300 __mmask8 __U, 301 __m128h __A, 302 __m128h __B) { 303 return (__m128h)__builtin_ia32_selectph_128( 304 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 305 (__v8hf)__W); 306 } 307 308 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, 309 __m128h __A, 310 __m128h __B) { 311 return (__m128h)__builtin_ia32_selectph_128( 312 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), 313 (__v8hf)_mm_setzero_ph()); 314 } 315 316 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { 317 return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); 318 } 319 320 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { 321 return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); 322 } 323 324 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { 325 return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); 326 } 327 328 static __inline__ __m256h __DEFAULT_FN_ATTRS256 329 
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) { 330 return (__m256h)__builtin_ia32_selectps_256( 331 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); 332 } 333 334 static __inline__ __m256h __DEFAULT_FN_ATTRS256 335 _mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { 336 return (__m256h)__builtin_ia32_selectps_256( 337 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); 338 } 339 340 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { 341 return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); 342 } 343 344 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, 345 __mmask8 __U, 346 __m128h __A) { 347 return (__m128h)__builtin_ia32_selectps_128( 348 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); 349 } 350 351 static __inline__ __m128h __DEFAULT_FN_ATTRS128 352 _mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { 353 return (__m128h)__builtin_ia32_selectps_128( 354 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); 355 } 356 357 #define _mm256_cmp_ph_mask(a, b, p) \ 358 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 359 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) 360 361 #define _mm256_mask_cmp_ph_mask(m, a, b, p) \ 362 ((__mmask16)__builtin_ia32_cmpph256_mask( \ 363 (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) 364 365 #define _mm_cmp_ph_mask(a, b, p) \ 366 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 367 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) 368 369 #define _mm_mask_cmp_ph_mask(m, a, b, p) \ 370 ((__mmask8)__builtin_ia32_cmpph128_mask( \ 371 (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) 372 373 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { 374 return (__m256h)__builtin_ia32_rcpph256_mask( 375 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 376 } 377 378 static __inline__ __m256h 
__DEFAULT_FN_ATTRS256 379 _mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 380 return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, 381 (__mmask16)__U); 382 } 383 384 static __inline__ __m256h __DEFAULT_FN_ATTRS256 385 _mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { 386 return (__m256h)__builtin_ia32_rcpph256_mask( 387 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 388 } 389 390 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { 391 return (__m128h)__builtin_ia32_rcpph128_mask( 392 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 393 } 394 395 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, 396 __mmask8 __U, 397 __m128h __A) { 398 return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, 399 (__mmask8)__U); 400 } 401 402 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, 403 __m128h __A) { 404 return (__m128h)__builtin_ia32_rcpph128_mask( 405 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 406 } 407 408 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { 409 return (__m256h)__builtin_ia32_rsqrtph256_mask( 410 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); 411 } 412 413 static __inline__ __m256h __DEFAULT_FN_ATTRS256 414 _mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 415 return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, 416 (__mmask16)__U); 417 } 418 419 static __inline__ __m256h __DEFAULT_FN_ATTRS256 420 _mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { 421 return (__m256h)__builtin_ia32_rsqrtph256_mask( 422 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 423 } 424 425 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { 426 return (__m128h)__builtin_ia32_rsqrtph128_mask( 427 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 428 } 429 430 static __inline__ __m128h __DEFAULT_FN_ATTRS128 
_mm_mask_rsqrt_ph(__m128h __W, 431 __mmask8 __U, 432 __m128h __A) { 433 return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, 434 (__mmask8)__U); 435 } 436 437 static __inline__ __m128h __DEFAULT_FN_ATTRS128 438 _mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { 439 return (__m128h)__builtin_ia32_rsqrtph128_mask( 440 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 441 } 442 443 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { 444 return (__m128h)__builtin_ia32_getexpph128_mask( 445 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 446 } 447 448 static __inline__ __m128h __DEFAULT_FN_ATTRS128 449 _mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { 450 return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, 451 (__mmask8)__U); 452 } 453 454 static __inline__ __m128h __DEFAULT_FN_ATTRS128 455 _mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { 456 return (__m128h)__builtin_ia32_getexpph128_mask( 457 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 458 } 459 460 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { 461 return (__m256h)__builtin_ia32_getexpph256_mask( 462 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 463 } 464 465 static __inline__ __m256h __DEFAULT_FN_ATTRS256 466 _mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { 467 return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, 468 (__mmask16)__U); 469 } 470 471 static __inline__ __m256h __DEFAULT_FN_ATTRS256 472 _mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { 473 return (__m256h)__builtin_ia32_getexpph256_mask( 474 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 475 } 476 477 #define _mm_getmant_ph(A, B, C) \ 478 ((__m128h)__builtin_ia32_getmantph128_mask( \ 479 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 480 (__mmask8)-1)) 481 482 #define _mm_mask_getmant_ph(W, U, A, B, C) \ 483 
((__m128h)__builtin_ia32_getmantph128_mask( \ 484 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ 485 (__mmask8)(U))) 486 487 #define _mm_maskz_getmant_ph(U, A, B, C) \ 488 ((__m128h)__builtin_ia32_getmantph128_mask( \ 489 (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ 490 (__mmask8)(U))) 491 492 #define _mm256_getmant_ph(A, B, C) \ 493 ((__m256h)__builtin_ia32_getmantph256_mask( \ 494 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 495 (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) 496 497 #define _mm256_mask_getmant_ph(W, U, A, B, C) \ 498 ((__m256h)__builtin_ia32_getmantph256_mask( \ 499 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ 500 (__mmask16)(U))) 501 502 #define _mm256_maskz_getmant_ph(U, A, B, C) \ 503 ((__m256h)__builtin_ia32_getmantph256_mask( \ 504 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ 505 (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) 506 507 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, 508 __m128h __B) { 509 return (__m128h)__builtin_ia32_scalefph128_mask( 510 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 511 } 512 513 static __inline__ __m128h __DEFAULT_FN_ATTRS128 514 _mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 515 return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, 516 (__v8hf)__W, (__mmask8)__U); 517 } 518 519 static __inline__ __m128h __DEFAULT_FN_ATTRS128 520 _mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { 521 return (__m128h)__builtin_ia32_scalefph128_mask( 522 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 523 } 524 525 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, 526 __m256h __B) { 527 return (__m256h)__builtin_ia32_scalefph256_mask( 528 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); 529 } 530 531 static __inline__ __m256h __DEFAULT_FN_ATTRS256 532 
_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { 533 return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, 534 (__v16hf)__W, (__mmask16)__U); 535 } 536 537 static __inline__ __m256h __DEFAULT_FN_ATTRS256 538 _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { 539 return (__m256h)__builtin_ia32_scalefph256_mask( 540 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); 541 } 542 543 #define _mm_roundscale_ph(A, imm) \ 544 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 545 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 546 (__mmask8)-1)) 547 548 #define _mm_mask_roundscale_ph(W, U, A, imm) \ 549 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 550 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 551 552 #define _mm_maskz_roundscale_ph(U, A, imm) \ 553 ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ 554 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 555 (__mmask8)(U))) 556 557 #define _mm256_roundscale_ph(A, imm) \ 558 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 559 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 560 (__mmask16)-1)) 561 562 #define _mm256_mask_roundscale_ph(W, U, A, imm) \ 563 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 564 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ 565 (__mmask16)(U))) 566 567 #define _mm256_maskz_roundscale_ph(U, A, imm) \ 568 ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ 569 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ 570 (__mmask16)(U))) 571 572 #define _mm_reduce_ph(A, imm) \ 573 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 574 (__v8hf)_mm_setzero_ph(), \ 575 (__mmask8)-1)) 576 577 #define _mm_mask_reduce_ph(W, U, A, imm) \ 578 ((__m128h)__builtin_ia32_reduceph128_mask( \ 579 (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) 580 581 #define _mm_maskz_reduce_ph(U, A, imm) 
\ 582 ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ 583 (__v8hf)_mm_setzero_ph(), \ 584 (__mmask8)(U))) 585 586 #define _mm256_reduce_ph(A, imm) \ 587 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 588 (__v16hf)_mm256_setzero_ph(), \ 589 (__mmask16)-1)) 590 591 #define _mm256_mask_reduce_ph(W, U, A, imm) \ 592 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 593 (__v16hf)(__m256h)(W), \ 594 (__mmask16)(U))) 595 596 #define _mm256_maskz_reduce_ph(U, A, imm) \ 597 ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ 598 (__v16hf)_mm256_setzero_ph(), \ 599 (__mmask16)(U))) 600 601 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { 602 return __builtin_ia32_sqrtph((__v8hf)__a); 603 } 604 605 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, 606 __mmask8 __U, 607 __m128h __A) { 608 return (__m128h)__builtin_ia32_selectph_128( 609 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); 610 } 611 612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, 613 __m128h __A) { 614 return (__m128h)__builtin_ia32_selectph_128( 615 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); 616 } 617 618 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { 619 return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); 620 } 621 622 static __inline__ __m256h __DEFAULT_FN_ATTRS256 623 _mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { 624 return (__m256h)__builtin_ia32_selectph_256( 625 (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); 626 } 627 628 static __inline__ __m256h __DEFAULT_FN_ATTRS256 629 _mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { 630 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 631 (__v16hf)_mm256_sqrt_ph(__A), 632 (__v16hf)_mm256_setzero_ph()); 633 } 634 635 #define _mm_mask_fpclass_ph_mask(U, A, imm) 
\ 636 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 637 (int)(imm), (__mmask8)(U))) 638 639 #define _mm_fpclass_ph_mask(A, imm) \ 640 ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ 641 (int)(imm), (__mmask8)-1)) 642 643 #define _mm256_mask_fpclass_ph_mask(U, A, imm) \ 644 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 645 (int)(imm), (__mmask16)(U))) 646 647 #define _mm256_fpclass_ph_mask(A, imm) \ 648 ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ 649 (int)(imm), (__mmask16)-1)) 650 651 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { 652 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 653 (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 654 } 655 656 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, 657 __mmask8 __U, 658 __m128d __A) { 659 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, 660 (__mmask8)__U); 661 } 662 663 static __inline__ __m128h __DEFAULT_FN_ATTRS128 664 _mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { 665 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( 666 (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 667 } 668 669 static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { 670 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 671 (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); 672 } 673 674 static __inline__ __m128h __DEFAULT_FN_ATTRS256 675 _mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { 676 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, 677 (__mmask8)__U); 678 } 679 680 static __inline__ __m128h __DEFAULT_FN_ATTRS256 681 _mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { 682 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( 683 (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 684 } 685 686 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { 687 return 
(__m128d)__builtin_ia32_vcvtph2pd128_mask( 688 (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); 689 } 690 691 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, 692 __mmask8 __U, 693 __m128h __A) { 694 return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, 695 (__mmask8)__U); 696 } 697 698 static __inline__ __m128d __DEFAULT_FN_ATTRS128 699 _mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 700 return (__m128d)__builtin_ia32_vcvtph2pd128_mask( 701 (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); 702 } 703 704 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { 705 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 706 (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); 707 } 708 709 static __inline__ __m256d __DEFAULT_FN_ATTRS256 710 _mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { 711 return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, 712 (__mmask8)__U); 713 } 714 715 static __inline__ __m256d __DEFAULT_FN_ATTRS256 716 _mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 717 return (__m256d)__builtin_ia32_vcvtph2pd256_mask( 718 (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); 719 } 720 721 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { 722 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 723 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 724 } 725 726 static __inline__ __m128i __DEFAULT_FN_ATTRS128 727 _mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 728 return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, 729 (__mmask8)__U); 730 } 731 732 static __inline__ __m128i __DEFAULT_FN_ATTRS128 733 _mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { 734 return (__m128i)__builtin_ia32_vcvtph2w128_mask( 735 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 736 } 737 738 static __inline__ __m256i __DEFAULT_FN_ATTRS256 739 _mm256_cvtph_epi16(__m256h __A) { 
740 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 741 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 742 } 743 744 static __inline__ __m256i __DEFAULT_FN_ATTRS256 745 _mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 746 return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, 747 (__mmask16)__U); 748 } 749 750 static __inline__ __m256i __DEFAULT_FN_ATTRS256 751 _mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { 752 return (__m256i)__builtin_ia32_vcvtph2w256_mask( 753 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 754 } 755 756 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { 757 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 758 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); 759 } 760 761 static __inline__ __m128i __DEFAULT_FN_ATTRS128 762 _mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { 763 return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, 764 (__mmask8)__U); 765 } 766 767 static __inline__ __m128i __DEFAULT_FN_ATTRS128 768 _mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { 769 return (__m128i)__builtin_ia32_vcvttph2w128_mask( 770 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); 771 } 772 773 static __inline__ __m256i __DEFAULT_FN_ATTRS256 774 _mm256_cvttph_epi16(__m256h __A) { 775 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 776 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); 777 } 778 779 static __inline__ __m256i __DEFAULT_FN_ATTRS256 780 _mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { 781 return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, 782 (__mmask16)__U); 783 } 784 785 static __inline__ __m256i __DEFAULT_FN_ATTRS256 786 _mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { 787 return (__m256i)__builtin_ia32_vcvttph2w256_mask( 788 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); 789 } 790 791 static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { 792 return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); 793 } 794 795 static __inline__ __m128h __DEFAULT_FN_ATTRS128 796 _mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { 797 return (__m128h)__builtin_ia32_selectph_128( 798 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); 799 } 800 801 static __inline__ __m128h __DEFAULT_FN_ATTRS128 802 _mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { 803 return (__m128h)__builtin_ia32_selectph_128( 804 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); 805 } 806 807 static __inline__ __m256h __DEFAULT_FN_ATTRS256 808 _mm256_cvtepi16_ph(__m256i __A) { 809 return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); 810 } 811 812 static __inline__ __m256h __DEFAULT_FN_ATTRS256 813 _mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { 814 return (__m256h)__builtin_ia32_selectph_256( 815 (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); 816 } 817 818 static __inline__ __m256h __DEFAULT_FN_ATTRS256 819 _mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { 820 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, 821 (__v16hf)_mm256_cvtepi16_ph(__A), 822 (__v16hf)_mm256_setzero_ph()); 823 } 824 825 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { 826 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 827 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); 828 } 829 830 static __inline__ __m128i __DEFAULT_FN_ATTRS128 831 _mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { 832 return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, 833 (__mmask8)__U); 834 } 835 836 static __inline__ __m128i __DEFAULT_FN_ATTRS128 837 _mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { 838 return (__m128i)__builtin_ia32_vcvtph2uw128_mask( 839 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); 840 } 841 842 static 
/*
 * Conversions between packed _Float16 ("ph") vectors and integer / float
 * vectors, plus the 128-bit half-precision FMA family.  Every conversion
 * below follows the standard AVX-512 masking pattern:
 *   - plain form:   all lanes written; the pass-through argument is
 *     _mm*_undefined_*() and the mask is (__mmask*)-1 (all ones);
 *   - _mask_ form:  lanes whose bit in __U is 0 keep the value from __W;
 *   - _maskz_ form: lanes whose bit in __U is 0 are zeroed
 *     (_mm*_setzero_*() pass-through).
 * "vcvtt*" builtins are the truncating variants, per Intel naming.
 */

/* ph -> unsigned 16-bit (256-bit).  NOTE(review): the leading `static` of
 * this first definition lives on the previous page of the file. */
__inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}

/* ph -> unsigned 16-bit with truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                    (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}

/* unsigned 16-bit -> ph; lossless element-wise widening is expressed as a
 * plain vector conversion, with masking layered on via selectph. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepu16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}

/* ph -> signed 32-bit.  The builtin takes 8 half lanes and yields 4 (128-bit)
 * or 8 (256-bit) ints, so only the low elements of the source are used. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}

/* ph -> unsigned 32-bit. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}

/* signed 32-bit -> ph; a 256-bit source narrows into a __m128h result. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
}

/* unsigned 32-bit -> ph. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
}

/* ph -> signed 32-bit with truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}

/* ph -> unsigned 32-bit with truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}

/* signed 64-bit -> ph. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/* ph -> signed 64-bit. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

/* unsigned 64-bit -> ph. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/* ph -> unsigned 64-bit. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}

/* ph -> signed 64-bit with truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

/* ph -> unsigned 64-bit with truncation. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}

/* ph -> single precision (ps). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
}

/* ps -> ph. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
                                                                   __mmask8 __U,
                                                                   __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/* 128-bit half-precision fused multiply-add family.  All variants feed the
 * one vfmaddph builtin; fmsub negates __C, fnmadd negates __A, fnmsub
 * negates both.  _mask_ keeps __A in masked-off lanes, _mask3_ keeps __C,
 * _maskz_ zeroes them. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
                                                                  __mmask8 __U,
                                                                  __m128h __B,
                                                                  __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
                                                                  __mmask8 __U,
                                                                  __m128h __B,
                                                                  __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

/* 256-bit half-precision FMA (unmasked). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             (__v16hf)__C);
}
/* 256-bit half-precision fused multiply-add family, masked variants.
 * All forms feed the one vfmaddph256 builtin; fmsub negates __C, fnmadd
 * negates __A, fnmsub negates both.  _mask_ keeps __A in masked-off lanes,
 * _mask3_ keeps __C, _maskz_ zeroes them. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

/* 128-bit fused multiply with alternating add/subtract (vfmaddsubph). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
                                                                __m128h __B,
                                                                __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, 1607 __m128h __B, 1608 __m128h __C) { 1609 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, 1610 -(__v8hf)__C); 1611 } 1612 1613 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1614 _mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1615 return (__m128h)__builtin_ia32_selectph_128( 1616 (__mmask8)__U, 1617 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1618 (__v8hf)__A); 1619 } 1620 1621 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1622 _mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1623 return (__m128h)__builtin_ia32_selectph_128( 1624 (__mmask8)__U, 1625 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1626 (__v8hf)_mm_setzero_ph()); 1627 } 1628 1629 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1630 _mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { 1631 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1632 (__v16hf)__C); 1633 } 1634 1635 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1636 _mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1637 return (__m256h)__builtin_ia32_selectph_256( 1638 (__mmask16)__U, 1639 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1640 (__v16hf)__A); 1641 } 1642 1643 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1644 _mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1645 return (__m256h)__builtin_ia32_selectph_256( 1646 (__mmask16)__U, 1647 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), 1648 (__v16hf)__C); 1649 } 1650 1651 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1652 _mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1653 return (__m256h)__builtin_ia32_selectph_256( 1654 (__mmask16)__U, 1655 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 
(__v16hf)__C), 1656 (__v16hf)_mm256_setzero_ph()); 1657 } 1658 1659 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1660 _mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { 1661 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, 1662 -(__v16hf)__C); 1663 } 1664 1665 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1666 _mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1667 return (__m256h)__builtin_ia32_selectph_256( 1668 (__mmask16)__U, 1669 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1670 (__v16hf)__A); 1671 } 1672 1673 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1674 _mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { 1675 return (__m256h)__builtin_ia32_selectph_256( 1676 (__mmask16)__U, 1677 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1678 (__v16hf)_mm256_setzero_ph()); 1679 } 1680 1681 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1682 _mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1683 return (__m128h)__builtin_ia32_selectph_128( 1684 (__mmask8)__U, 1685 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1686 (__v8hf)__C); 1687 } 1688 1689 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1690 _mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1691 return (__m256h)__builtin_ia32_selectph_256( 1692 (__mmask16)__U, 1693 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1694 (__v16hf)__C); 1695 } 1696 1697 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1698 _mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1699 return (__m128h)__builtin_ia32_selectph_128( 1700 (__mmask8)__U, 1701 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), 1702 (__v8hf)__C); 1703 } 1704 1705 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1706 _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, 
__mmask16 __U) { 1707 return (__m256h)__builtin_ia32_selectph_256( 1708 (__mmask16)__U, 1709 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), 1710 (__v16hf)__C); 1711 } 1712 1713 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, 1714 __m128h __B, 1715 __m128h __C) { 1716 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1717 (__v8hf)__C); 1718 } 1719 1720 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1721 _mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1722 return (__m128h)__builtin_ia32_selectph_128( 1723 (__mmask8)__U, 1724 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), 1725 (__v8hf)__A); 1726 } 1727 1728 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, 1729 __m256h __B, 1730 __m256h __C) { 1731 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1732 (__v16hf)__C); 1733 } 1734 1735 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1736 _mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1737 return (__m256h)__builtin_ia32_selectph_256( 1738 (__mmask16)__U, 1739 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), 1740 (__v16hf)__A); 1741 } 1742 1743 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, 1744 __m128h __B, 1745 __m128h __C) { 1746 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, 1747 -(__v8hf)__C); 1748 } 1749 1750 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1751 _mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1752 return (__m128h)__builtin_ia32_selectph_128( 1753 (__mmask8)__U, 1754 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1755 (__v8hf)__A); 1756 } 1757 1758 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1759 _mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1760 return (__m128h)__builtin_ia32_selectph_128( 1761 (__mmask8)__U, 
1762 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), 1763 (__v8hf)__C); 1764 } 1765 1766 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, 1767 __m256h __B, 1768 __m256h __C) { 1769 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, 1770 -(__v16hf)__C); 1771 } 1772 1773 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1774 _mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { 1775 return (__m256h)__builtin_ia32_selectph_256( 1776 (__mmask16)__U, 1777 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1778 (__v16hf)__A); 1779 } 1780 1781 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1782 _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 1783 return (__m256h)__builtin_ia32_selectph_256( 1784 (__mmask16)__U, 1785 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), 1786 (__v16hf)__C); 1787 } 1788 1789 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, 1790 __m128h __B) { 1791 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1792 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1793 } 1794 1795 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1796 _mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1797 return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1798 (__v4sf)__W, (__mmask8)__U); 1799 } 1800 1801 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1802 _mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1803 return (__m128h)__builtin_ia32_vfcmulcph128_mask( 1804 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1805 } 1806 1807 static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A, 1808 __m256h __B) { 1809 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1810 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1811 } 1812 1813 static __inline__ 
__m256h __DEFAULT_FN_ATTRS256 1814 _mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1815 return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1816 (__v8sf)__W, (__mmask8)__U); 1817 } 1818 1819 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1820 _mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1821 return (__m256h)__builtin_ia32_vfcmulcph256_mask( 1822 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1823 } 1824 1825 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, 1826 __m128h __B, 1827 __m128h __C) { 1828 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1829 (__v4sf)__C, (__mmask8)-1); 1830 } 1831 1832 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1833 _mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1834 return (__m128h)__builtin_ia32_selectps_128( 1835 __U, 1836 __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, 1837 (__v4sf)__C, (__mmask8)__U), 1838 (__v4sf)__A); 1839 } 1840 1841 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1842 _mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1843 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1844 (__v4sf)__C, (__mmask8)__U); 1845 } 1846 1847 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1848 _mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1849 return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( 1850 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); 1851 } 1852 1853 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, 1854 __m256h __B, 1855 __m256h __C) { 1856 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1857 (__v8sf)__C, (__mmask8)-1); 1858 } 1859 1860 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1861 _mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1862 
return (__m256h)__builtin_ia32_selectps_256( 1863 __U, 1864 __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1865 (__mmask8)__U), 1866 (__v8sf)__A); 1867 } 1868 1869 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1870 _mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { 1871 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1872 (__v8sf)__C, (__mmask8)__U); 1873 } 1874 1875 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1876 _mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1877 return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( 1878 (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); 1879 } 1880 1881 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, 1882 __m128h __B) { 1883 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1884 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); 1885 } 1886 1887 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, 1888 __mmask8 __U, 1889 __m128h __A, 1890 __m128h __B) { 1891 return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, 1892 (__v4sf)__W, (__mmask8)__U); 1893 } 1894 1895 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1896 _mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { 1897 return (__m128h)__builtin_ia32_vfmulcph128_mask( 1898 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); 1899 } 1900 1901 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, 1902 __m256h __B) { 1903 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1904 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); 1905 } 1906 1907 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1908 _mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { 1909 return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, 1910 (__v8sf)__W, (__mmask8)__U); 1911 } 1912 1913 static 
__inline__ __m256h __DEFAULT_FN_ATTRS256 1914 _mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { 1915 return (__m256h)__builtin_ia32_vfmulcph256_mask( 1916 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); 1917 } 1918 1919 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, 1920 __m128h __B, 1921 __m128h __C) { 1922 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1923 (__v4sf)__C, (__mmask8)-1); 1924 } 1925 1926 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1927 _mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 1928 return (__m128h)__builtin_ia32_selectps_128( 1929 __U, 1930 __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, 1931 (__mmask8)__U), 1932 (__v4sf)__A); 1933 } 1934 1935 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1936 _mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 1937 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, 1938 (__v4sf)__C, (__mmask8)__U); 1939 } 1940 1941 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1942 _mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 1943 return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, 1944 (__v4sf)__C, (__mmask8)__U); 1945 } 1946 1947 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, 1948 __m256h __B, 1949 __m256h __C) { 1950 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1951 (__v8sf)__C, (__mmask8)-1); 1952 } 1953 1954 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1955 _mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { 1956 return (__m256h)__builtin_ia32_selectps_256( 1957 __U, 1958 __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, 1959 (__mmask8)__U), 1960 (__v8sf)__A); 1961 } 1962 1963 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1964 _mm256_mask3_fmadd_pch(__m256h __A, 
__m256h __B, __m256h __C, __mmask8 __U) { 1965 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, 1966 (__v8sf)__C, (__mmask8)__U); 1967 } 1968 1969 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1970 _mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { 1971 return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, 1972 (__v8sf)__C, (__mmask8)__U); 1973 } 1974 1975 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, 1976 __m128h __A, 1977 __m128h __W) { 1978 return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, 1979 (__v8hf)__A); 1980 } 1981 1982 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1983 _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { 1984 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, 1985 (__v16hf)__A); 1986 } 1987 1988 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1989 _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { 1990 return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, 1991 (__v8hi)__B); 1992 } 1993 1994 static __inline__ __m256h __DEFAULT_FN_ATTRS256 1995 _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { 1996 return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, 1997 (__v16hi)__B); 1998 } 1999 2000 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2001 _mm_permutexvar_ph(__m128i __A, __m128h __B) { 2002 return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); 2003 } 2004 2005 static __inline__ __m256h __DEFAULT_FN_ATTRS256 2006 _mm256_permutexvar_ph(__m256i __A, __m256h __B) { 2007 return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); 2008 } 2009 2010 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2011 _mm256_reduce_add_ph(__m256h __W) { 2012 return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); 2013 } 2014 2015 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2016 
_mm256_reduce_mul_ph(__m256h __W) { 2017 return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); 2018 } 2019 2020 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2021 _mm256_reduce_max_ph(__m256h __V) { 2022 return __builtin_ia32_reduce_fmax_ph256(__V); 2023 } 2024 2025 static __inline__ _Float16 __DEFAULT_FN_ATTRS256 2026 _mm256_reduce_min_ph(__m256h __V) { 2027 return __builtin_ia32_reduce_fmin_ph256(__V); 2028 } 2029 2030 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2031 _mm_reduce_add_ph(__m128h __W) { 2032 return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); 2033 } 2034 2035 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2036 _mm_reduce_mul_ph(__m128h __W) { 2037 return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); 2038 } 2039 2040 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2041 _mm_reduce_max_ph(__m128h __V) { 2042 return __builtin_ia32_reduce_fmax_ph128(__V); 2043 } 2044 2045 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 2046 _mm_reduce_min_ph(__m128h __V) { 2047 return __builtin_ia32_reduce_fmin_ph128(__V); 2048 } 2049 2050 // intrinsics below are alias for f*mul_*ch 2051 #define _mm_mul_pch(A, B) _mm_fmul_pch(A, B) 2052 #define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B) 2053 #define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B) 2054 #define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B) 2055 #define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B) 2056 #define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B) 2057 2058 #define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B) 2059 #define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B) 2060 #define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B) 2061 #define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B) 2062 #define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B) 2063 #define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B) 2064 2065 #undef __DEFAULT_FN_ATTRS128 2066 #undef __DEFAULT_FN_ATTRS256 2067 
2068 #endif 2069