1 /*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifdef __SSE2__ 14 15 #ifndef __AVX512VLBF16INTRIN_H 16 #define __AVX512VLBF16INTRIN_H 17 18 #define __DEFAULT_FN_ATTRS128 \ 19 __attribute__((__always_inline__, __nodebug__, \ 20 __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) 21 #define __DEFAULT_FN_ATTRS256 \ 22 __attribute__((__always_inline__, __nodebug__, \ 23 __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) 24 25 /// Convert Two Packed Single Data to One Packed BF16 Data. 26 /// 27 /// \headerfile <x86intrin.h> 28 /// 29 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 30 /// 31 /// \param __A 32 /// A 128-bit vector of [4 x float]. 33 /// \param __B 34 /// A 128-bit vector of [4 x float]. 35 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 36 /// conversion of __B, and higher 64 bits come from conversion of __A. 37 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 38 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { 39 return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, 40 (__v4sf) __B); 41 } 42 43 /// Convert Two Packed Single Data to One Packed BF16 Data. 44 /// 45 /// \headerfile <x86intrin.h> 46 /// 47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 48 /// 49 /// \param __A 50 /// A 128-bit vector of [4 x float]. 51 /// \param __B 52 /// A 128-bit vector of [4 x float]. 53 /// \param __W 54 /// A 128-bit vector of [8 x bfloat]. 55 /// \param __U 56 /// A 8-bit mask value specifying what is chosen for each element. 57 /// A 1 means conversion of __A or __B. A 0 means element from __W. 58 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 59 /// conversion of __B, and higher 64 bits come from conversion of __A. 60 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 61 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { 62 return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, 63 (__v8bf)_mm_cvtne2ps_pbh(__A, __B), 64 (__v8bf)__W); 65 } 66 67 /// Convert Two Packed Single Data to One Packed BF16 Data. 68 /// 69 /// \headerfile <x86intrin.h> 70 /// 71 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 72 /// 73 /// \param __A 74 /// A 128-bit vector of [4 x float]. 75 /// \param __B 76 /// A 128-bit vector of [4 x float]. 77 /// \param __U 78 /// A 8-bit mask value specifying what is chosen for each element. 79 /// A 1 means conversion of __A or __B. A 0 means element is zero. 80 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 81 /// conversion of __B, and higher 64 bits come from conversion of __A. 82 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 83 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { 84 return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, 85 (__v8bf)_mm_cvtne2ps_pbh(__A, __B), 86 (__v8bf)_mm_setzero_si128()); 87 } 88 89 /// Convert Two Packed Single Data to One Packed BF16 Data. 90 /// 91 /// \headerfile <x86intrin.h> 92 /// 93 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 94 /// 95 /// \param __A 96 /// A 256-bit vector of [8 x float]. 97 /// \param __B 98 /// A 256-bit vector of [8 x float]. 99 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 100 /// conversion of __B, and higher 128 bits come from conversion of __A. 101 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 102 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { 103 return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, 104 (__v8sf) __B); 105 } 106 107 /// Convert Two Packed Single Data to One Packed BF16 Data. 108 /// 109 /// \headerfile <x86intrin.h> 110 /// 111 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 112 /// 113 /// \param __A 114 /// A 256-bit vector of [8 x float]. 115 /// \param __B 116 /// A 256-bit vector of [8 x float]. 117 /// \param __W 118 /// A 256-bit vector of [16 x bfloat]. 119 /// \param __U 120 /// A 16-bit mask value specifying what is chosen for each element. 121 /// A 1 means conversion of __A or __B. A 0 means element from __W. 122 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 123 /// conversion of __B, and higher 128 bits come from conversion of __A. 124 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 125 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { 126 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, 127 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), 128 (__v16bf)__W); 129 } 130 131 /// Convert Two Packed Single Data to One Packed BF16 Data. 132 /// 133 /// \headerfile <x86intrin.h> 134 /// 135 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 136 /// 137 /// \param __A 138 /// A 256-bit vector of [8 x float]. 139 /// \param __B 140 /// A 256-bit vector of [8 x float]. 141 /// \param __U 142 /// A 16-bit mask value specifying what is chosen for each element. 143 /// A 1 means conversion of __A or __B. A 0 means element is zero. 144 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from 145 /// conversion of __B, and higher 128 bits come from conversion of __A. 146 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 147 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { 148 return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, 149 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), 150 (__v16bf)_mm256_setzero_si256()); 151 } 152 153 /// Convert Packed Single Data to Packed BF16 Data. 154 /// 155 /// \headerfile <x86intrin.h> 156 /// 157 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 158 /// 159 /// \param __A 160 /// A 128-bit vector of [4 x float]. 161 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 162 /// conversion of __A, and higher 64 bits are 0. 163 #define _mm_cvtneps_pbh(A) \ 164 ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A))) 165 166 /// Convert Packed Single Data to Packed BF16 Data. 167 /// 168 /// \headerfile <x86intrin.h> 169 /// 170 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 171 /// 172 /// \param __A 173 /// A 128-bit vector of [4 x float]. 174 /// \param __W 175 /// A 128-bit vector of [8 x bfloat]. 176 /// \param __U 177 /// A 4-bit mask value specifying what is chosen for each element. 178 /// A 1 means conversion of __A. A 0 means element from __W. 179 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 180 /// conversion of __A, and higher 64 bits are 0. 181 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 182 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { 183 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, 184 (__v8bf)__W, 185 (__mmask8)__U); 186 } 187 188 /// Convert Packed Single Data to Packed BF16 Data. 189 /// 190 /// \headerfile <x86intrin.h> 191 /// 192 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 193 /// 194 /// \param __A 195 /// A 128-bit vector of [4 x float]. 196 /// \param __U 197 /// A 4-bit mask value specifying what is chosen for each element. 198 /// A 1 means conversion of __A. A 0 means element is zero. 199 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from 200 /// conversion of __A, and higher 64 bits are 0. 201 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 202 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { 203 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, 204 (__v8bf)_mm_setzero_si128(), 205 (__mmask8)__U); 206 } 207 208 /// Convert Packed Single Data to Packed BF16 Data. 209 /// 210 /// \headerfile <x86intrin.h> 211 /// 212 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 213 /// 214 /// \param __A 215 /// A 256-bit vector of [8 x float]. 216 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 217 #define _mm256_cvtneps_pbh(A) \ 218 ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A))) 219 220 /// Convert Packed Single Data to Packed BF16 Data. 221 /// 222 /// \headerfile <x86intrin.h> 223 /// 224 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 225 /// 226 /// \param __A 227 /// A 256-bit vector of [8 x float]. 228 /// \param __W 229 /// A 256-bit vector of [8 x bfloat]. 230 /// \param __U 231 /// A 8-bit mask value specifying what is chosen for each element. 232 /// A 1 means conversion of __A. A 0 means element from __W. 233 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 234 static __inline__ __m128bh __DEFAULT_FN_ATTRS256 235 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { 236 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, 237 (__v8bf)__W, 238 (__mmask8)__U); 239 } 240 241 /// Convert Packed Single Data to Packed BF16 Data. 242 /// 243 /// \headerfile <x86intrin.h> 244 /// 245 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 246 /// 247 /// \param __A 248 /// A 256-bit vector of [8 x float]. 249 /// \param __U 250 /// A 8-bit mask value specifying what is chosen for each element. 251 /// A 1 means conversion of __A. A 0 means element is zero. 252 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. 253 static __inline__ __m128bh __DEFAULT_FN_ATTRS256 254 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { 255 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, 256 (__v8bf)_mm_setzero_si128(), 257 (__mmask8)__U); 258 } 259 260 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 261 /// 262 /// \headerfile <x86intrin.h> 263 /// 264 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 265 /// 266 /// \param __A 267 /// A 128-bit vector of [8 x bfloat]. 268 /// \param __B 269 /// A 128-bit vector of [8 x bfloat]. 270 /// \param __D 271 /// A 128-bit vector of [4 x float]. 272 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of 273 /// __A, __B and __D 274 static __inline__ __m128 __DEFAULT_FN_ATTRS128 275 _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { 276 return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, 277 (__v8bf)__A, 278 (__v8bf)__B); 279 } 280 281 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 282 /// 283 /// \headerfile <x86intrin.h> 284 /// 285 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 286 /// 287 /// \param __A 288 /// A 128-bit vector of [8 x bfloat]. 289 /// \param __B 290 /// A 128-bit vector of [8 x bfloat]. 291 /// \param __D 292 /// A 128-bit vector of [4 x float]. 293 /// \param __U 294 /// A 8-bit mask value specifying what is chosen for each element. 295 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 296 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of 297 /// __A, __B and __D 298 static __inline__ __m128 __DEFAULT_FN_ATTRS128 299 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { 300 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, 301 (__v4sf)_mm_dpbf16_ps(__D, __A, __B), 302 (__v4sf)__D); 303 } 304 305 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 306 /// 307 /// \headerfile <x86intrin.h> 308 /// 309 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 310 /// 311 /// \param __A 312 /// A 128-bit vector of [8 x bfloat]. 313 /// \param __B 314 /// A 128-bit vector of [8 x bfloat]. 315 /// \param __D 316 /// A 128-bit vector of [4 x float]. 317 /// \param __U 318 /// A 8-bit mask value specifying what is chosen for each element. 319 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 320 /// \returns A 128-bit vector of [4 x float] comes from Dot Product of 321 /// __A, __B and __D 322 static __inline__ __m128 __DEFAULT_FN_ATTRS128 323 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { 324 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, 325 (__v4sf)_mm_dpbf16_ps(__D, __A, __B), 326 (__v4sf)_mm_setzero_si128()); 327 } 328 329 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 330 /// 331 /// \headerfile <x86intrin.h> 332 /// 333 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 334 /// 335 /// \param __A 336 /// A 256-bit vector of [16 x bfloat]. 337 /// \param __B 338 /// A 256-bit vector of [16 x bfloat]. 339 /// \param __D 340 /// A 256-bit vector of [8 x float]. 341 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of 342 /// __A, __B and __D 343 static __inline__ __m256 __DEFAULT_FN_ATTRS256 344 _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { 345 return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, 346 (__v16bf)__A, 347 (__v16bf)__B); 348 } 349 350 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 351 /// 352 /// \headerfile <x86intrin.h> 353 /// 354 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 355 /// 356 /// \param __A 357 /// A 256-bit vector of [16 x bfloat]. 358 /// \param __B 359 /// A 256-bit vector of [16 x bfloat]. 360 /// \param __D 361 /// A 256-bit vector of [8 x float]. 362 /// \param __U 363 /// A 16-bit mask value specifying what is chosen for each element. 364 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 365 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of 366 /// __A, __B and __D 367 static __inline__ __m256 __DEFAULT_FN_ATTRS256 368 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { 369 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 370 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), 371 (__v8sf)__D); 372 } 373 374 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 375 /// 376 /// \headerfile <x86intrin.h> 377 /// 378 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 379 /// 380 /// \param __A 381 /// A 256-bit vector of [16 x bfloat]. 382 /// \param __B 383 /// A 256-bit vector of [16 x bfloat]. 384 /// \param __D 385 /// A 256-bit vector of [8 x float]. 386 /// \param __U 387 /// A 8-bit mask value specifying what is chosen for each element. 388 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 389 /// \returns A 256-bit vector of [8 x float] comes from Dot Product of 390 /// __A, __B and __D 391 static __inline__ __m256 __DEFAULT_FN_ATTRS256 392 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { 393 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 394 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), 395 (__v8sf)_mm256_setzero_si256()); 396 } 397 398 /// Convert One Single float Data to One BF16 Data. 399 /// 400 /// \headerfile <x86intrin.h> 401 /// 402 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 403 /// 404 /// \param __A 405 /// A float data. 406 /// \returns A bf16 data whose sign field and exponent field keep unchanged, 407 /// and fraction field is truncated to 7 bits. 408 static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { 409 __v4sf __V = {__A, 0, 0, 0}; 410 __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask( 411 (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1); 412 return (__bf16)__R[0]; 413 } 414 415 /// Convert Packed BF16 Data to Packed float Data. 416 /// 417 /// \headerfile <x86intrin.h> 418 /// 419 /// \param __A 420 /// A 128-bit vector of [4 x bfloat]. 421 /// \returns A 128-bit vector of [4 x float] come from conversion of __A 422 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { 423 return _mm_castsi128_ps( 424 (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); 425 } 426 427 /// Convert Packed BF16 Data to Packed float Data. 428 /// 429 /// \headerfile <x86intrin.h> 430 /// 431 /// \param __A 432 /// A 128-bit vector of [8 x bfloat]. 433 /// \returns A 256-bit vector of [8 x float] come from conversion of __A 434 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { 435 return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( 436 (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); 437 } 438 439 /// Convert Packed BF16 Data to Packed float Data using zeroing mask. 440 /// 441 /// \headerfile <x86intrin.h> 442 /// 443 /// \param __U 444 /// A 4-bit mask. Elements are zeroed out when the corresponding mask 445 /// bit is not set. 446 /// \param __A 447 /// A 128-bit vector of [4 x bfloat]. 448 /// \returns A 128-bit vector of [4 x float] come from conversion of __A 449 static __inline__ __m128 __DEFAULT_FN_ATTRS128 450 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { 451 return _mm_castsi128_ps((__m128i)_mm_slli_epi32( 452 (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); 453 } 454 455 /// Convert Packed BF16 Data to Packed float Data using zeroing mask. 456 /// 457 /// \headerfile <x86intrin.h> 458 /// 459 /// \param __U 460 /// A 8-bit mask. Elements are zeroed out when the corresponding mask 461 /// bit is not set. 462 /// \param __A 463 /// A 128-bit vector of [8 x bfloat]. 464 /// \returns A 256-bit vector of [8 x float] come from conversion of __A 465 static __inline__ __m256 __DEFAULT_FN_ATTRS256 466 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { 467 return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( 468 (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); 469 } 470 471 /// Convert Packed BF16 Data to Packed float Data using merging mask. 472 /// 473 /// \headerfile <x86intrin.h> 474 /// 475 /// \param __S 476 /// A 128-bit vector of [4 x float]. Elements are copied from __S when 477 /// the corresponding mask bit is not set. 478 /// \param __U 479 /// A 4-bit mask. Elements are zeroed out when the corresponding mask 480 /// bit is not set. 481 /// \param __A 482 /// A 128-bit vector of [4 x bfloat]. 483 /// \returns A 128-bit vector of [4 x float] come from conversion of __A 484 static __inline__ __m128 __DEFAULT_FN_ATTRS128 485 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { 486 return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( 487 (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), 488 16)); 489 } 490 491 /// Convert Packed BF16 Data to Packed float Data using merging mask. 492 /// 493 /// \headerfile <x86intrin.h> 494 /// 495 /// \param __S 496 /// A 256-bit vector of [8 x float]. Elements are copied from __S when 497 /// the corresponding mask bit is not set. 498 /// \param __U 499 /// A 8-bit mask. Elements are zeroed out when the corresponding mask 500 /// bit is not set. 501 /// \param __A 502 /// A 128-bit vector of [8 x bfloat]. 503 /// \returns A 256-bit vector of [8 x float] come from conversion of __A 504 static __inline__ __m256 __DEFAULT_FN_ATTRS256 505 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { 506 return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( 507 (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 508 16)); 509 } 510 511 #undef __DEFAULT_FN_ATTRS128 512 #undef __DEFAULT_FN_ATTRS256 513 514 #endif 515 #endif 516