/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   differences in the data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that there are significant differences between
   the X86_64 MXCSR and the PowerISA FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> instead of accessing the
   MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files. */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types. */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define a two-value permute mask. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.
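   As a purely illustrative sketch (hypothetical variable names, not
   part of the original header), the difference from _mm_set_sd above
   is which lanes receive F:

     __m128d __lo_only = _mm_set_sd(3.0);   // {3.0, 0.0}
     __m128d __both    = _mm_set1_pd(3.0);  // {3.0, 3.0}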
*/ 80 extern __inline __m128d 81 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 82 _mm_set1_pd(double __F) { 83 return __extension__(__m128d){__F, __F}; 84 } 85 86 extern __inline __m128d 87 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 88 _mm_set_pd1(double __F) { 89 return _mm_set1_pd(__F); 90 } 91 92 /* Create a vector with the lower value X and upper value W. */ 93 extern __inline __m128d 94 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 95 _mm_set_pd(double __W, double __X) { 96 return __extension__(__m128d){__X, __W}; 97 } 98 99 /* Create a vector with the lower value W and upper value X. */ 100 extern __inline __m128d 101 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 102 _mm_setr_pd(double __W, double __X) { 103 return __extension__(__m128d){__W, __X}; 104 } 105 106 /* Create an undefined vector. */ 107 extern __inline __m128d 108 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 109 _mm_undefined_pd(void) { 110 __m128d __Y = __Y; 111 return __Y; 112 } 113 114 /* Create a vector of zeros. */ 115 extern __inline __m128d 116 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 117 _mm_setzero_pd(void) { 118 return (__m128d)vec_splats(0); 119 } 120 121 /* Sets the low DPFP value of A from the low value of B. */ 122 extern __inline __m128d 123 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 124 _mm_move_sd(__m128d __A, __m128d __B) { 125 __v2df __result = (__v2df)__A; 126 __result[0] = ((__v2df)__B)[0]; 127 return (__m128d)__result; 128 } 129 130 /* Load two DPFP values from P. The address must be 16-byte aligned. */ 131 extern __inline __m128d 132 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 133 _mm_load_pd(double const *__P) { 134 return ((__m128d)vec_ld(0, (__v16qu *)__P)); 135 } 136 137 /* Load two DPFP values from P. The address need not be 16-byte aligned. */ 138 extern __inline __m128d 139 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 140 _mm_loadu_pd(double const *__P) { 141 return (vec_vsx_ld(0, __P)); 142 } 143 144 /* Create a vector with all two elements equal to *P. */ 145 extern __inline __m128d 146 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 147 _mm_load1_pd(double const *__P) { 148 return (vec_splats(*__P)); 149 } 150 151 /* Create a vector with element 0 as *P and the rest zero. */ 152 extern __inline __m128d 153 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 154 _mm_load_sd(double const *__P) { 155 return _mm_set_sd(*__P); 156 } 157 158 extern __inline __m128d 159 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 160 _mm_load_pd1(double const *__P) { 161 return _mm_load1_pd(__P); 162 } 163 164 /* Load two DPFP values in reverse order. The address must be aligned. */ 165 extern __inline __m128d 166 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 167 _mm_loadr_pd(double const *__P) { 168 __v2df __tmp = _mm_load_pd(__P); 169 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2); 170 } 171 172 /* Store two DPFP values. The address must be 16-byte aligned. */ 173 extern __inline void 174 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 175 _mm_store_pd(double *__P, __m128d __A) { 176 vec_st((__v16qu)__A, 0, (__v16qu *)__P); 177 } 178 179 /* Store two DPFP values. The address need not be 16-byte aligned. 
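   A small usage sketch contrasting the aligned and unaligned store
   forms (variable names are hypothetical, for illustration only):

     double aligned_buf[2] __attribute__((aligned(16)));
     double plain_buf[2];
     __m128d __v = _mm_set_pd(2.0, 1.0);  // {1.0, 2.0}
     _mm_store_pd(aligned_buf, __v);      // requires 16-byte alignment
     _mm_storeu_pd(plain_buf, __v);       // no alignment requirement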
*/ 180 extern __inline void 181 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 182 _mm_storeu_pd(double *__P, __m128d __A) { 183 *(__m128d_u *)__P = __A; 184 } 185 186 /* Stores the lower DPFP value. */ 187 extern __inline void 188 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 189 _mm_store_sd(double *__P, __m128d __A) { 190 *__P = ((__v2df)__A)[0]; 191 } 192 193 extern __inline double 194 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 195 _mm_cvtsd_f64(__m128d __A) { 196 return ((__v2df)__A)[0]; 197 } 198 199 extern __inline void 200 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 201 _mm_storel_pd(double *__P, __m128d __A) { 202 _mm_store_sd(__P, __A); 203 } 204 205 /* Stores the upper DPFP value. */ 206 extern __inline void 207 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 208 _mm_storeh_pd(double *__P, __m128d __A) { 209 *__P = ((__v2df)__A)[1]; 210 } 211 /* Store the lower DPFP value across two words. 212 The address must be 16-byte aligned. */ 213 extern __inline void 214 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 215 _mm_store1_pd(double *__P, __m128d __A) { 216 _mm_store_pd(__P, vec_splat(__A, 0)); 217 } 218 219 extern __inline void 220 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 221 _mm_store_pd1(double *__P, __m128d __A) { 222 _mm_store1_pd(__P, __A); 223 } 224 225 /* Store two DPFP values in reverse order. The address must be aligned. */ 226 extern __inline void 227 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 228 _mm_storer_pd(double *__P, __m128d __A) { 229 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2)); 230 } 231 232 /* Intel intrinsic. */ 233 extern __inline long long 234 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 235 _mm_cvtsi128_si64(__m128i __A) { 236 return ((__v2di)__A)[0]; 237 } 238 239 /* Microsoft intrinsic. */ 240 extern __inline long long 241 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 242 _mm_cvtsi128_si64x(__m128i __A) { 243 return ((__v2di)__A)[0]; 244 } 245 246 extern __inline __m128d 247 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 248 _mm_add_pd(__m128d __A, __m128d __B) { 249 return (__m128d)((__v2df)__A + (__v2df)__B); 250 } 251 252 /* Add the lower double-precision (64-bit) floating-point element in 253 a and b, store the result in the lower element of dst, and copy 254 the upper element from a to the upper element of dst. 
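   For example (an illustrative sketch, not part of the original
   header): with a = {1.0, 2.0} and b = {10.0, 20.0},

     _mm_add_sd(a, b)  yields {11.0, 2.0}   // only the lower lanes are added
     _mm_add_pd(a, b)  yields {11.0, 22.0}  // both lanes are added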
*/ 255 extern __inline __m128d 256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 257 _mm_add_sd(__m128d __A, __m128d __B) { 258 __A[0] = __A[0] + __B[0]; 259 return (__A); 260 } 261 262 extern __inline __m128d 263 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 264 _mm_sub_pd(__m128d __A, __m128d __B) { 265 return (__m128d)((__v2df)__A - (__v2df)__B); 266 } 267 268 extern __inline __m128d 269 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 270 _mm_sub_sd(__m128d __A, __m128d __B) { 271 __A[0] = __A[0] - __B[0]; 272 return (__A); 273 } 274 275 extern __inline __m128d 276 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 277 _mm_mul_pd(__m128d __A, __m128d __B) { 278 return (__m128d)((__v2df)__A * (__v2df)__B); 279 } 280 281 extern __inline __m128d 282 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 283 _mm_mul_sd(__m128d __A, __m128d __B) { 284 __A[0] = __A[0] * __B[0]; 285 return (__A); 286 } 287 288 extern __inline __m128d 289 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 290 _mm_div_pd(__m128d __A, __m128d __B) { 291 return (__m128d)((__v2df)__A / (__v2df)__B); 292 } 293 294 extern __inline __m128d 295 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 296 _mm_div_sd(__m128d __A, __m128d __B) { 297 __A[0] = __A[0] / __B[0]; 298 return (__A); 299 } 300 301 extern __inline __m128d 302 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 303 _mm_sqrt_pd(__m128d __A) { 304 return (vec_sqrt(__A)); 305 } 306 307 /* Return pair {sqrt (B[0]), A[1]}. */ 308 extern __inline __m128d 309 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 310 _mm_sqrt_sd(__m128d __A, __m128d __B) { 311 __v2df __c; 312 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0])); 313 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 314 } 315 316 extern __inline __m128d 317 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 318 _mm_min_pd(__m128d __A, __m128d __B) { 319 return (vec_min(__A, __B)); 320 } 321 322 extern __inline __m128d 323 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 324 _mm_min_sd(__m128d __A, __m128d __B) { 325 __v2df __a, __b, __c; 326 __a = vec_splats(__A[0]); 327 __b = vec_splats(__B[0]); 328 __c = vec_min(__a, __b); 329 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 330 } 331 332 extern __inline __m128d 333 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 334 _mm_max_pd(__m128d __A, __m128d __B) { 335 return (vec_max(__A, __B)); 336 } 337 338 extern __inline __m128d 339 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 340 _mm_max_sd(__m128d __A, __m128d __B) { 341 __v2df __a, __b, __c; 342 __a = vec_splats(__A[0]); 343 __b = vec_splats(__B[0]); 344 __c = vec_max(__a, __b); 345 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 346 } 347 348 extern __inline __m128d 349 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 350 _mm_cmpeq_pd(__m128d __A, __m128d __B) { 351 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B)); 352 } 353 354 extern __inline __m128d 355 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 356 _mm_cmplt_pd(__m128d __A, __m128d __B) { 357 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B)); 358 } 359 360 extern __inline __m128d 361 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 362 _mm_cmple_pd(__m128d __A, __m128d __B) { 363 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B)); 364 } 365 366 extern __inline __m128d 367 
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN. */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN). */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks a NAN element. */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation. */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A. */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal. */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than. */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal. */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than. */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.
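   Note that the arguments are given from the highest element down to
   element 0.  A purely illustrative sketch (hypothetical values, not
   part of the original header):

     __m128i v = _mm_set_epi64x(0x1111, 0x2222);
     // ((__v2di)v)[0] == 0x2222 (low 64 bits), ((__v2di)v)[1] == 0x1111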
*/ 651 extern __inline __m128i 652 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 653 _mm_set_epi64x(long long __q1, long long __q0) { 654 return __extension__(__m128i)(__v2di){__q0, __q1}; 655 } 656 657 extern __inline __m128i 658 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 659 _mm_set_epi64(__m64 __q1, __m64 __q0) { 660 return _mm_set_epi64x((long long)__q1, (long long)__q0); 661 } 662 663 extern __inline __m128i 664 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 665 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { 666 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; 667 } 668 669 extern __inline __m128i 670 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 671 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3, 672 short __q2, short __q1, short __q0) { 673 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, 674 __q4, __q5, __q6, __q7}; 675 } 676 677 extern __inline __m128i 678 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 679 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11, 680 char __q10, char __q09, char __q08, char __q07, char __q06, 681 char __q05, char __q04, char __q03, char __q02, char __q01, 682 char __q00) { 683 return __extension__(__m128i)(__v16qi){ 684 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 685 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; 686 } 687 688 /* Set all of the elements of the vector to A. */ 689 extern __inline __m128i 690 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 691 _mm_set1_epi64x(long long __A) { 692 return _mm_set_epi64x(__A, __A); 693 } 694 695 extern __inline __m128i 696 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 697 _mm_set1_epi64(__m64 __A) { 698 return _mm_set_epi64(__A, __A); 699 } 700 701 extern __inline __m128i 702 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 703 _mm_set1_epi32(int __A) { 704 return _mm_set_epi32(__A, __A, __A, __A); 705 } 706 707 extern __inline __m128i 708 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 709 _mm_set1_epi16(short __A) { 710 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); 711 } 712 713 extern __inline __m128i 714 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 715 _mm_set1_epi8(char __A) { 716 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, 717 __A, __A, __A, __A, __A); 718 } 719 720 /* Create a vector of Qi, where i is the element number. 721 The parameter order is reversed from the _mm_set_epi* functions. 
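   For example (an illustrative sketch, not part of the original
   header), the following two expressions build the same vector, with
   10 in element 0 and 40 in element 3:

     _mm_setr_epi32(10, 20, 30, 40);
     _mm_set_epi32(40, 30, 20, 10);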
*/
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}

/* Load 128 bits of integer data.  The address need not be aligned. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

/* Load 64 bits into element 0 and set element 1 to zero. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.
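   A common use of the zero vector is widening unsigned elements with
   the unpack intrinsics defined later in this header.  Illustrative
   sketch only (src is a hypothetical byte pointer):

     __m128i bytes = _mm_loadu_si128((__m128i_u const *)src);
     __m128i zero  = _mm_setzero_si128();
     __m128i lo16  = _mm_unpacklo_epi8(bytes, zero);  // 8 low bytes -> shorts
     __m128i hi16  = _mm_unpackhi_epi8(bytes, zero);  // 8 high bytes -> shorts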
*/ 815 extern __inline __m128i 816 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 817 _mm_setzero_si128(void) { 818 return __extension__(__m128i)(__v4si){0, 0, 0, 0}; 819 } 820 821 #ifdef _ARCH_PWR8 822 extern __inline __m128d 823 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 824 _mm_cvtepi32_pd(__m128i __A) { 825 __v2di __val; 826 /* For LE need to generate Vector Unpack Low Signed Word. 827 Which is generated from unpackh. */ 828 __val = (__v2di)vec_unpackh((__v4si)__A); 829 830 return (__m128d)vec_ctf(__val, 0); 831 } 832 #endif 833 834 extern __inline __m128 835 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 836 _mm_cvtepi32_ps(__m128i __A) { 837 return ((__m128)vec_ctf((__v4si)__A, 0)); 838 } 839 840 extern __inline __m128i 841 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 842 _mm_cvtpd_epi32(__m128d __A) { 843 __v2df __rounded = vec_rint(__A); 844 __v4si __result, __temp; 845 const __v4si __vzero = {0, 0, 0, 0}; 846 847 /* VSX Vector truncate Double-Precision to integer and Convert to 848 Signed Integer Word format with Saturate. */ 849 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :); 850 851 #ifdef _ARCH_PWR8 852 #ifdef __LITTLE_ENDIAN__ 853 __temp = vec_mergeo(__temp, __temp); 854 #else 855 __temp = vec_mergee(__temp, __temp); 856 #endif 857 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 858 (__vector long long)__vzero); 859 #else 860 { 861 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 862 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 863 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 864 } 865 #endif 866 return (__m128i)__result; 867 } 868 869 extern __inline __m64 870 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 871 _mm_cvtpd_pi32(__m128d __A) { 872 __m128i __result = _mm_cvtpd_epi32(__A); 873 874 return (__m64)__result[0]; 875 } 876 877 extern __inline __m128 878 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 879 _mm_cvtpd_ps(__m128d __A) { 880 __v4sf __result; 881 __v4si __temp; 882 const __v4si __vzero = {0, 0, 0, 0}; 883 884 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 885 886 #ifdef _ARCH_PWR8 887 #ifdef __LITTLE_ENDIAN__ 888 __temp = vec_mergeo(__temp, __temp); 889 #else 890 __temp = vec_mergee(__temp, __temp); 891 #endif 892 __result = (__v4sf)vec_vpkudum((__vector long long)__temp, 893 (__vector long long)__vzero); 894 #else 895 { 896 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 897 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 898 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 899 } 900 #endif 901 return ((__m128)__result); 902 } 903 904 extern __inline __m128i 905 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 906 _mm_cvttpd_epi32(__m128d __A) { 907 __v4si __result; 908 __v4si __temp; 909 const __v4si __vzero = {0, 0, 0, 0}; 910 911 /* VSX Vector truncate Double-Precision to integer and Convert to 912 Signed Integer Word format with Saturate. 
*/ 913 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 914 915 #ifdef _ARCH_PWR8 916 #ifdef __LITTLE_ENDIAN__ 917 __temp = vec_mergeo(__temp, __temp); 918 #else 919 __temp = vec_mergee(__temp, __temp); 920 #endif 921 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 922 (__vector long long)__vzero); 923 #else 924 { 925 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 926 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 927 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 928 } 929 #endif 930 931 return ((__m128i)__result); 932 } 933 934 extern __inline __m64 935 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 936 _mm_cvttpd_pi32(__m128d __A) { 937 __m128i __result = _mm_cvttpd_epi32(__A); 938 939 return (__m64)__result[0]; 940 } 941 942 extern __inline int 943 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 944 _mm_cvtsi128_si32(__m128i __A) { 945 return ((__v4si)__A)[0]; 946 } 947 948 #ifdef _ARCH_PWR8 949 extern __inline __m128d 950 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 951 _mm_cvtpi32_pd(__m64 __A) { 952 __v4si __temp; 953 __v2di __tmp2; 954 __v2df __result; 955 956 __temp = (__v4si)vec_splats(__A); 957 __tmp2 = (__v2di)vec_unpackl(__temp); 958 __result = vec_ctf((__vector signed long long)__tmp2, 0); 959 return (__m128d)__result; 960 } 961 #endif 962 963 extern __inline __m128i 964 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 965 _mm_cvtps_epi32(__m128 __A) { 966 __v4sf __rounded; 967 __v4si __result; 968 969 __rounded = vec_rint((__v4sf)__A); 970 __result = vec_cts(__rounded, 0); 971 return (__m128i)__result; 972 } 973 974 extern __inline __m128i 975 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 976 _mm_cvttps_epi32(__m128 __A) { 977 __v4si __result; 978 979 __result = vec_cts((__v4sf)__A, 0); 980 return (__m128i)__result; 981 } 982 983 extern __inline __m128d 984 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 985 _mm_cvtps_pd(__m128 __A) { 986 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 987 #ifdef vec_doubleh 988 return (__m128d)vec_doubleh((__v4sf)__A); 989 #else 990 /* Otherwise the compiler is not current and so need to generate the 991 equivalent code. */ 992 __v4sf __a = (__v4sf)__A; 993 __v4sf __temp; 994 __v2df __result; 995 #ifdef __LITTLE_ENDIAN__ 996 /* The input float values are in elements {[0], [1]} but the convert 997 instruction needs them in elements {[1], [3]}, So we use two 998 shift left double vector word immediates to get the elements 999 lined up. */ 1000 __temp = __builtin_vsx_xxsldwi(__a, __a, 3); 1001 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2); 1002 #else 1003 /* The input float values are in elements {[0], [1]} but the convert 1004 instruction needs them in elements {[0], [2]}, So we use two 1005 shift left double vector word immediates to get the elements 1006 lined up. */ 1007 __temp = vec_vmrghw(__a, __a); 1008 #endif 1009 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :); 1010 return (__m128d)__result; 1011 #endif 1012 } 1013 1014 extern __inline int 1015 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1016 _mm_cvtsd_si32(__m128d __A) { 1017 __v2df __rounded = vec_rint((__v2df)__A); 1018 int __result = ((__v2df)__rounded)[0]; 1019 1020 return __result; 1021 } 1022 /* Intel intrinsic. 
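   Like _mm_cvtsd_si32 above, this conversion rounds according to the
   current rounding mode (via vec_rint), while the _mm_cvttsd_* forms
   below truncate toward zero.  Illustrative values only, assuming the
   default round-to-nearest-even mode:

     _mm_cvtsd_si64(_mm_set_sd(2.5));   // 2  (rounds to even)
     _mm_cvttsd_si64(_mm_set_sd(2.5));  // 2  (truncates)
     _mm_cvtsd_si64(_mm_set_sd(1.5));   // 2  (rounds to even)
     _mm_cvttsd_si64(_mm_set_sd(1.5));  // 1  (truncates)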
*/ 1023 extern __inline long long 1024 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1025 _mm_cvtsd_si64(__m128d __A) { 1026 __v2df __rounded = vec_rint((__v2df)__A); 1027 long long __result = ((__v2df)__rounded)[0]; 1028 1029 return __result; 1030 } 1031 1032 /* Microsoft intrinsic. */ 1033 extern __inline long long 1034 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1035 _mm_cvtsd_si64x(__m128d __A) { 1036 return _mm_cvtsd_si64((__v2df)__A); 1037 } 1038 1039 extern __inline int 1040 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1041 _mm_cvttsd_si32(__m128d __A) { 1042 int __result = ((__v2df)__A)[0]; 1043 1044 return __result; 1045 } 1046 1047 /* Intel intrinsic. */ 1048 extern __inline long long 1049 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1050 _mm_cvttsd_si64(__m128d __A) { 1051 long long __result = ((__v2df)__A)[0]; 1052 1053 return __result; 1054 } 1055 1056 /* Microsoft intrinsic. */ 1057 extern __inline long long 1058 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1059 _mm_cvttsd_si64x(__m128d __A) { 1060 return _mm_cvttsd_si64(__A); 1061 } 1062 1063 extern __inline __m128 1064 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1065 _mm_cvtsd_ss(__m128 __A, __m128d __B) { 1066 __v4sf __result = (__v4sf)__A; 1067 1068 #ifdef __LITTLE_ENDIAN__ 1069 __v4sf __temp_s; 1070 /* Copy double element[0] to element [1] for conversion. */ 1071 __v2df __temp_b = vec_splat((__v2df)__B, 0); 1072 1073 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1074 __result = __builtin_vsx_xxsldwi(__result, __result, 3); 1075 /* Convert double to single float scalar in a vector. */ 1076 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :); 1077 /* Shift the resulting scalar into vector element [0]. */ 1078 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1); 1079 #else 1080 __result[0] = ((__v2df)__B)[0]; 1081 #endif 1082 return (__m128)__result; 1083 } 1084 1085 extern __inline __m128d 1086 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1087 _mm_cvtsi32_sd(__m128d __A, int __B) { 1088 __v2df __result = (__v2df)__A; 1089 double __db = __B; 1090 __result[0] = __db; 1091 return (__m128d)__result; 1092 } 1093 1094 /* Intel intrinsic. */ 1095 extern __inline __m128d 1096 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1097 _mm_cvtsi64_sd(__m128d __A, long long __B) { 1098 __v2df __result = (__v2df)__A; 1099 double __db = __B; 1100 __result[0] = __db; 1101 return (__m128d)__result; 1102 } 1103 1104 /* Microsoft intrinsic. */ 1105 extern __inline __m128d 1106 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1107 _mm_cvtsi64x_sd(__m128d __A, long long __B) { 1108 return _mm_cvtsi64_sd(__A, __B); 1109 } 1110 1111 extern __inline __m128d 1112 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1113 _mm_cvtss_sd(__m128d __A, __m128 __B) { 1114 #ifdef __LITTLE_ENDIAN__ 1115 /* Use splat to move element [0] into position for the convert. */ 1116 __v4sf __temp = vec_splat((__v4sf)__B, 0); 1117 __v2df __res; 1118 /* Convert single float scalar to double in a vector. 
*/ 1119 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :); 1120 return (__m128d)vec_mergel(__res, (__v2df)__A); 1121 #else 1122 __v2df __res = (__v2df)__A; 1123 __res[0] = ((__v4sf)__B)[0]; 1124 return (__m128d)__res; 1125 #endif 1126 } 1127 1128 extern __inline __m128d 1129 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1130 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { 1131 __vector double __result; 1132 const int __litmsk = __mask & 0x3; 1133 1134 if (__litmsk == 0) 1135 __result = vec_mergeh(__A, __B); 1136 #if __GNUC__ < 6 1137 else if (__litmsk == 1) 1138 __result = vec_xxpermdi(__B, __A, 2); 1139 else if (__litmsk == 2) 1140 __result = vec_xxpermdi(__B, __A, 1); 1141 #else 1142 else if (__litmsk == 1) 1143 __result = vec_xxpermdi(__A, __B, 2); 1144 else if (__litmsk == 2) 1145 __result = vec_xxpermdi(__A, __B, 1); 1146 #endif 1147 else 1148 __result = vec_mergel(__A, __B); 1149 1150 return __result; 1151 } 1152 1153 extern __inline __m128d 1154 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1155 _mm_unpackhi_pd(__m128d __A, __m128d __B) { 1156 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B); 1157 } 1158 1159 extern __inline __m128d 1160 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1161 _mm_unpacklo_pd(__m128d __A, __m128d __B) { 1162 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B); 1163 } 1164 1165 extern __inline __m128d 1166 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1167 _mm_loadh_pd(__m128d __A, double const *__B) { 1168 __v2df __result = (__v2df)__A; 1169 __result[1] = *__B; 1170 return (__m128d)__result; 1171 } 1172 1173 extern __inline __m128d 1174 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1175 _mm_loadl_pd(__m128d __A, double const *__B) { 1176 __v2df __result = (__v2df)__A; 1177 __result[0] = *__B; 1178 return (__m128d)__result; 1179 } 1180 1181 #ifdef _ARCH_PWR8 1182 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1183 1184 /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
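   Bit 0 of the result comes from the sign of element 0 and bit 1 from
   the sign of element 1.  Illustrative sketch only (not part of the
   original header):

     _mm_movemask_pd(_mm_set_pd(4.0, -3.0));   // 0x1  (only element 0 negative)
     _mm_movemask_pd(_mm_set_pd(-4.0, -3.0));  // 0x3  (both elements negative)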
*/ 1185 extern __inline int 1186 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1187 _mm_movemask_pd(__m128d __A) { 1188 #ifdef _ARCH_PWR10 1189 return vec_extractm((__v2du)__A); 1190 #else 1191 __vector unsigned long long __result; 1192 static const __vector unsigned int __perm_mask = { 1193 #ifdef __LITTLE_ENDIAN__ 1194 0x80800040, 0x80808080, 0x80808080, 0x80808080 1195 #else 1196 0x80808080, 0x80808080, 0x80808080, 0x80804000 1197 #endif 1198 }; 1199 1200 __result = ((__vector unsigned long long)vec_vbpermq( 1201 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); 1202 1203 #ifdef __LITTLE_ENDIAN__ 1204 return __result[1]; 1205 #else 1206 return __result[0]; 1207 #endif 1208 #endif /* !_ARCH_PWR10 */ 1209 } 1210 #endif /* _ARCH_PWR8 */ 1211 1212 extern __inline __m128i 1213 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1214 _mm_packs_epi16(__m128i __A, __m128i __B) { 1215 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B); 1216 } 1217 1218 extern __inline __m128i 1219 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1220 _mm_packs_epi32(__m128i __A, __m128i __B) { 1221 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B); 1222 } 1223 1224 extern __inline __m128i 1225 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1226 _mm_packus_epi16(__m128i __A, __m128i __B) { 1227 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B); 1228 } 1229 1230 extern __inline __m128i 1231 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1232 _mm_unpackhi_epi8(__m128i __A, __m128i __B) { 1233 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B); 1234 } 1235 1236 extern __inline __m128i 1237 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1238 _mm_unpackhi_epi16(__m128i __A, __m128i __B) { 1239 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B); 1240 } 1241 1242 extern __inline __m128i 1243 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1244 _mm_unpackhi_epi32(__m128i __A, __m128i __B) { 1245 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B); 1246 } 1247 1248 extern __inline __m128i 1249 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1250 _mm_unpackhi_epi64(__m128i __A, __m128i __B) { 1251 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B); 1252 } 1253 1254 extern __inline __m128i 1255 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1256 _mm_unpacklo_epi8(__m128i __A, __m128i __B) { 1257 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B); 1258 } 1259 1260 extern __inline __m128i 1261 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1262 _mm_unpacklo_epi16(__m128i __A, __m128i __B) { 1263 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B); 1264 } 1265 1266 extern __inline __m128i 1267 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1268 _mm_unpacklo_epi32(__m128i __A, __m128i __B) { 1269 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B); 1270 } 1271 1272 extern __inline __m128i 1273 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1274 _mm_unpacklo_epi64(__m128i __A, __m128i __B) { 1275 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B); 1276 } 1277 1278 extern __inline __m128i 1279 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1280 _mm_add_epi8(__m128i __A, __m128i __B) { 1281 return (__m128i)((__v16qu)__A + (__v16qu)__B); 1282 } 1283 1284 extern __inline __m128i 1285 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1286 _mm_add_epi16(__m128i __A, __m128i __B) { 1287 return (__m128i)((__v8hu)__A + (__v8hu)__B); 1288 } 1289 1290 extern __inline __m128i 1291 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1292 _mm_add_epi32(__m128i __A, __m128i __B) { 1293 return (__m128i)((__v4su)__A + (__v4su)__B); 1294 } 1295 1296 extern __inline __m128i 1297 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1298 _mm_add_epi64(__m128i __A, __m128i __B) { 1299 return (__m128i)((__v2du)__A + (__v2du)__B); 1300 } 1301 1302 extern __inline __m128i 1303 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1304 _mm_adds_epi8(__m128i __A, __m128i __B) { 1305 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B); 1306 } 1307 1308 extern __inline __m128i 1309 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1310 _mm_adds_epi16(__m128i __A, __m128i __B) { 1311 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B); 1312 } 1313 1314 extern __inline __m128i 1315 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1316 _mm_adds_epu8(__m128i __A, __m128i __B) { 1317 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B); 1318 } 1319 1320 extern __inline __m128i 1321 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1322 _mm_adds_epu16(__m128i __A, __m128i __B) { 1323 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B); 1324 } 1325 1326 extern __inline __m128i 1327 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1328 _mm_sub_epi8(__m128i __A, __m128i __B) { 1329 return (__m128i)((__v16qu)__A - (__v16qu)__B); 1330 } 1331 1332 extern __inline __m128i 1333 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1334 _mm_sub_epi16(__m128i __A, __m128i __B) { 1335 return (__m128i)((__v8hu)__A - (__v8hu)__B); 1336 } 1337 1338 extern __inline __m128i 1339 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1340 _mm_sub_epi32(__m128i __A, __m128i __B) { 1341 return (__m128i)((__v4su)__A - (__v4su)__B); 1342 } 1343 1344 extern __inline __m128i 1345 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1346 _mm_sub_epi64(__m128i __A, __m128i __B) { 1347 return (__m128i)((__v2du)__A - (__v2du)__B); 1348 } 1349 1350 extern __inline __m128i 1351 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1352 _mm_subs_epi8(__m128i __A, __m128i __B) { 1353 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B); 1354 } 1355 1356 extern __inline __m128i 1357 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1358 _mm_subs_epi16(__m128i __A, __m128i __B) { 1359 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B); 1360 } 1361 1362 extern __inline __m128i 1363 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1364 _mm_subs_epu8(__m128i __A, __m128i __B) { 1365 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B); 1366 } 1367 1368 extern __inline __m128i 1369 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1370 _mm_subs_epu16(__m128i __A, __m128i __B) { 1371 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B); 1372 } 1373 1374 extern __inline __m128i 1375 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1376 _mm_madd_epi16(__m128i __A, __m128i __B) { 1377 __vector signed int __zero = {0, 0, 0, 0}; 1378 1379 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero); 1380 } 1381 1382 extern __inline __m128i 1383 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) 1384 _mm_mulhi_epi16(__m128i __A, __m128i __B) { 1385 __vector signed int __w0, __w1; 1386 1387 __vector unsigned char __xform1 = { 1388 #ifdef __LITTLE_ENDIAN__ 1389 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 1390 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1391 #else 1392 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, 1393 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1394 #endif 1395 }; 1396 1397 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B); 1398 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B); 1399 return (__m128i)vec_perm(__w0, __w1, __xform1); 1400 } 1401 1402 extern __inline __m128i 1403 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1404 _mm_mullo_epi16(__m128i __A, __m128i __B) { 1405 return (__m128i)((__v8hi)__A * (__v8hi)__B); 1406 } 1407 1408 extern __inline __m64 1409 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1410 _mm_mul_su32(__m64 __A, __m64 __B) { 1411 unsigned int __a = __A; 1412 unsigned int __b = __B; 1413 1414 return ((__m64)__a * (__m64)__b); 1415 } 1416 1417 #ifdef _ARCH_PWR8 1418 extern __inline __m128i 1419 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1420 _mm_mul_epu32(__m128i __A, __m128i __B) { 1421 #if __GNUC__ < 8 1422 __v2du __result; 1423 1424 #ifdef __LITTLE_ENDIAN__ 1425 /* VMX Vector Multiply Odd Unsigned Word. */ 1426 __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1427 #else 1428 /* VMX Vector Multiply Even Unsigned Word. */ 1429 __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1430 #endif 1431 return (__m128i)__result; 1432 #else 1433 return (__m128i)vec_mule((__v4su)__A, (__v4su)__B); 1434 #endif 1435 } 1436 #endif 1437 1438 extern __inline __m128i 1439 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1440 _mm_slli_epi16(__m128i __A, int __B) { 1441 __v8hu __lshift; 1442 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; 1443 1444 if (__B >= 0 && __B < 16) { 1445 if (__builtin_constant_p(__B)) 1446 __lshift = (__v8hu)vec_splat_s16(__B); 1447 else 1448 __lshift = vec_splats((unsigned short)__B); 1449 1450 __result = vec_sl((__v8hi)__A, __lshift); 1451 } 1452 1453 return (__m128i)__result; 1454 } 1455 1456 extern __inline __m128i 1457 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1458 _mm_slli_epi32(__m128i __A, int __B) { 1459 __v4su __lshift; 1460 __v4si __result = {0, 0, 0, 0}; 1461 1462 if (__B >= 0 && __B < 32) { 1463 if (__builtin_constant_p(__B) && __B < 16) 1464 __lshift = (__v4su)vec_splat_s32(__B); 1465 else 1466 __lshift = vec_splats((unsigned int)__B); 1467 1468 __result = vec_sl((__v4si)__A, __lshift); 1469 } 1470 1471 return (__m128i)__result; 1472 } 1473 1474 #ifdef _ARCH_PWR8 1475 extern __inline __m128i 1476 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1477 _mm_slli_epi64(__m128i __A, int __B) { 1478 __v2du __lshift; 1479 __v2di __result = {0, 0}; 1480 1481 if (__B >= 0 && __B < 64) { 1482 if (__builtin_constant_p(__B) && __B < 16) 1483 __lshift = (__v2du)vec_splat_s32(__B); 1484 else 1485 __lshift = (__v2du)vec_splats((unsigned int)__B); 1486 1487 __result = vec_sl((__v2di)__A, __lshift); 1488 } 1489 1490 return (__m128i)__result; 1491 } 1492 #endif 1493 1494 extern __inline __m128i 1495 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1496 _mm_srai_epi16(__m128i __A, int __B) { 1497 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15}; 1498 __v8hi __result; 1499 1500 if (__B < 16) { 1501 if (__builtin_constant_p(__B)) 1502 __rshift = 
(__v8hu)vec_splat_s16(__B); 1503 else 1504 __rshift = vec_splats((unsigned short)__B); 1505 } 1506 __result = vec_sra((__v8hi)__A, __rshift); 1507 1508 return (__m128i)__result; 1509 } 1510 1511 extern __inline __m128i 1512 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1513 _mm_srai_epi32(__m128i __A, int __B) { 1514 __v4su __rshift = {31, 31, 31, 31}; 1515 __v4si __result; 1516 1517 if (__B < 32) { 1518 if (__builtin_constant_p(__B)) { 1519 if (__B < 16) 1520 __rshift = (__v4su)vec_splat_s32(__B); 1521 else 1522 __rshift = (__v4su)vec_splats((unsigned int)__B); 1523 } else 1524 __rshift = vec_splats((unsigned int)__B); 1525 } 1526 __result = vec_sra((__v4si)__A, __rshift); 1527 1528 return (__m128i)__result; 1529 } 1530 1531 extern __inline __m128i 1532 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1533 _mm_bslli_si128(__m128i __A, const int __N) { 1534 __v16qu __result; 1535 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1536 1537 if (__N < 16) 1538 __result = vec_sld((__v16qu)__A, __zeros, __N); 1539 else 1540 __result = __zeros; 1541 1542 return (__m128i)__result; 1543 } 1544 1545 extern __inline __m128i 1546 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1547 _mm_bsrli_si128(__m128i __A, const int __N) { 1548 __v16qu __result; 1549 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1550 1551 if (__N < 16) 1552 #ifdef __LITTLE_ENDIAN__ 1553 if (__builtin_constant_p(__N)) 1554 /* Would like to use Vector Shift Left Double by Octet 1555 Immediate here to use the immediate form and avoid 1556 load of __N * 8 value into a separate VR. */ 1557 __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N)); 1558 else 1559 #endif 1560 { 1561 __v16qu __shift = vec_splats((unsigned char)(__N * 8)); 1562 #ifdef __LITTLE_ENDIAN__ 1563 __result = vec_sro((__v16qu)__A, __shift); 1564 #else 1565 __result = vec_slo((__v16qu)__A, __shift); 1566 #endif 1567 } 1568 else 1569 __result = __zeros; 1570 1571 return (__m128i)__result; 1572 } 1573 1574 extern __inline __m128i 1575 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1576 _mm_srli_si128(__m128i __A, const int __N) { 1577 return _mm_bsrli_si128(__A, __N); 1578 } 1579 1580 extern __inline __m128i 1581 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1582 _mm_slli_si128(__m128i __A, const int _imm5) { 1583 __v16qu __result; 1584 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1585 1586 if (_imm5 < 16) 1587 #ifdef __LITTLE_ENDIAN__ 1588 __result = vec_sld((__v16qu)__A, __zeros, _imm5); 1589 #else 1590 __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5)); 1591 #endif 1592 else 1593 __result = __zeros; 1594 1595 return (__m128i)__result; 1596 } 1597 1598 extern __inline __m128i 1599 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1600 1601 _mm_srli_epi16(__m128i __A, int __B) { 1602 __v8hu __rshift; 1603 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; 1604 1605 if (__B < 16) { 1606 if (__builtin_constant_p(__B)) 1607 __rshift = (__v8hu)vec_splat_s16(__B); 1608 else 1609 __rshift = vec_splats((unsigned short)__B); 1610 1611 __result = vec_sr((__v8hi)__A, __rshift); 1612 } 1613 1614 return (__m128i)__result; 1615 } 1616 1617 extern __inline __m128i 1618 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1619 _mm_srli_epi32(__m128i __A, int __B) { 1620 __v4su __rshift; 1621 __v4si __result = {0, 0, 0, 0}; 1622 1623 if (__B < 32) { 1624 if 
(__builtin_constant_p(__B)) { 1625 if (__B < 16) 1626 __rshift = (__v4su)vec_splat_s32(__B); 1627 else 1628 __rshift = (__v4su)vec_splats((unsigned int)__B); 1629 } else 1630 __rshift = vec_splats((unsigned int)__B); 1631 1632 __result = vec_sr((__v4si)__A, __rshift); 1633 } 1634 1635 return (__m128i)__result; 1636 } 1637 1638 #ifdef _ARCH_PWR8 1639 extern __inline __m128i 1640 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1641 _mm_srli_epi64(__m128i __A, int __B) { 1642 __v2du __rshift; 1643 __v2di __result = {0, 0}; 1644 1645 if (__B < 64) { 1646 if (__builtin_constant_p(__B)) { 1647 if (__B < 16) 1648 __rshift = (__v2du)vec_splat_s32(__B); 1649 else 1650 __rshift = (__v2du)vec_splats((unsigned long long)__B); 1651 } else 1652 __rshift = (__v2du)vec_splats((unsigned int)__B); 1653 1654 __result = vec_sr((__v2di)__A, __rshift); 1655 } 1656 1657 return (__m128i)__result; 1658 } 1659 #endif 1660 1661 extern __inline __m128i 1662 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1663 _mm_sll_epi16(__m128i __A, __m128i __B) { 1664 __v8hu __lshift; 1665 __vector __bool short __shmask; 1666 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; 1667 __v8hu __result; 1668 1669 #ifdef __LITTLE_ENDIAN__ 1670 __lshift = vec_splat((__v8hu)__B, 0); 1671 #else 1672 __lshift = vec_splat((__v8hu)__B, 3); 1673 #endif 1674 __shmask = vec_cmple(__lshift, __shmax); 1675 __result = vec_sl((__v8hu)__A, __lshift); 1676 __result = vec_sel((__v8hu)__shmask, __result, __shmask); 1677 1678 return (__m128i)__result; 1679 } 1680 1681 extern __inline __m128i 1682 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1683 _mm_sll_epi32(__m128i __A, __m128i __B) { 1684 __v4su __lshift; 1685 __vector __bool int __shmask; 1686 const __v4su __shmax = {32, 32, 32, 32}; 1687 __v4su __result; 1688 #ifdef __LITTLE_ENDIAN__ 1689 __lshift = vec_splat((__v4su)__B, 0); 1690 #else 1691 __lshift = vec_splat((__v4su)__B, 1); 1692 #endif 1693 __shmask = vec_cmplt(__lshift, __shmax); 1694 __result = vec_sl((__v4su)__A, __lshift); 1695 __result = vec_sel((__v4su)__shmask, __result, __shmask); 1696 1697 return (__m128i)__result; 1698 } 1699 1700 #ifdef _ARCH_PWR8 1701 extern __inline __m128i 1702 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1703 _mm_sll_epi64(__m128i __A, __m128i __B) { 1704 __v2du __lshift; 1705 __vector __bool long long __shmask; 1706 const __v2du __shmax = {64, 64}; 1707 __v2du __result; 1708 1709 __lshift = vec_splat((__v2du)__B, 0); 1710 __shmask = vec_cmplt(__lshift, __shmax); 1711 __result = vec_sl((__v2du)__A, __lshift); 1712 __result = vec_sel((__v2du)__shmask, __result, __shmask); 1713 1714 return (__m128i)__result; 1715 } 1716 #endif 1717 1718 extern __inline __m128i 1719 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1720 _mm_sra_epi16(__m128i __A, __m128i __B) { 1721 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15}; 1722 __v8hu __rshift; 1723 __v8hi __result; 1724 1725 #ifdef __LITTLE_ENDIAN__ 1726 __rshift = vec_splat((__v8hu)__B, 0); 1727 #else 1728 __rshift = vec_splat((__v8hu)__B, 3); 1729 #endif 1730 __rshift = vec_min(__rshift, __rshmax); 1731 __result = vec_sra((__v8hi)__A, __rshift); 1732 1733 return (__m128i)__result; 1734 } 1735 1736 extern __inline __m128i 1737 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1738 _mm_sra_epi32(__m128i __A, __m128i __B) { 1739 const __v4su __rshmax = {31, 31, 31, 31}; 1740 __v4su __rshift; 1741 __v4si __result; 1742 1743 #ifdef 

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi32(__m128i __A, __m128i __B) {
  const __v4su __rshmax = {31, 31, 31, 31};
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi16(__m128i __A, __m128i __B) {
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__rshift, __shmax);
  __result = vec_sr((__v8hu)__A, __rshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}
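
/* Note: _mm_andnot_pd and _mm_andnot_si128 compute (~A) & B, so the
   operands are swapped when mapping onto vec_andc, which computes its
   first argument AND NOT its second.  This matches the Intel definition,
   in which the first operand is the one that is complemented. */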

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}
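
/* Note: the integer comparisons above return a per-element mask, all ones
   where the predicate holds and all zeros where it does not, so they
   combine naturally with the logical intrinsics for branchless selection.
   The sketch below is illustrative only, not compiled; the __example_*
   helper is hypothetical and shows a signed byte maximum (an operation
   SSE2 itself does not provide) built from the pieces defined above. */
#if 0
static inline __m128i __example_max_epi8(__m128i __a, __m128i __b) {
  /* Hypothetical usage sketch, not part of this header. */
  __m128i __gt = _mm_cmpgt_epi8(__a, __b); /* 0xff where __a > __b */
  return _mm_or_si128(_mm_and_si128(__gt, __a), _mm_andnot_si128(__gt, __b));
}
#endif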

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Return a mask created from the most significant bit of each 8-bit
   element in A. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}
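
/* Note: in _mm_shufflehi_epi16, _mm_shufflelo_epi16 and _mm_shuffle_epi32
   the 8-bit mask packs four 2-bit element selectors, least-significant
   pair first; the selector tables translate each 2-bit field into byte
   indices for vec_perm.  The sketch below is illustrative only, not
   compiled; the __example_* helper is hypothetical and assumes the
   _MM_SHUFFLE helper from <xmmintrin.h>. */
#if 0
static inline __m128i __example_reverse_high_words(__m128i __v) {
  /* Hypothetical usage sketch, not part of this header: reverse the order
     of the four high halfwords; the low four halfwords are unchanged. */
  return _mm_shufflehi_epi16(__v, _MM_SHUFFLE(0, 1, 2, 3));
}
#endif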

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}
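
/* Note: like the SSE2 originals, _mm_avg_epu8 and _mm_avg_epu16 compute a
   rounded average, (a + b + 1) >> 1, which is exactly what vec_avg
   provides.  Similarly, _mm_maskmoveu_si128 above derives its byte mask by
   comparing against 0x7f, since an unsigned byte is greater than 0x7f
   precisely when its most significant bit is set.  The sketch below is
   illustrative only, not compiled; the __example_* helper is hypothetical
   and assumes _mm_set1_epi8 defined earlier in this header. */
#if 0
static inline __m128i __example_avg_rounds_up(void) {
  /* Hypothetical usage sketch: (1 + 2 + 1) >> 1 == 2 in every byte lane. */
  return _mm_avg_epu8(_mm_set1_epi8(1), _mm_set1_epi8(2));
}
#endif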

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers. */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results. */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.  A vector
     shift to correctly position the 32-bit integer results (currently
     at [0] and [2]) to [1] and [3] would then need to be swapped back
     again since the desired results are two 64-bit integers
     ([1]|[0] and [3]|[2]).  Thus, no shift is performed. */
#else
  /* Sum across four integers with two integer results. */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position. */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush. */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering. */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering. */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}
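
/* Note: _mm_cvtsi32_si128 and _mm_cvtsi64_si128 place the scalar in the
   least-significant element and zero the remaining elements, matching the
   Intel definitions.  The sketch below is illustrative only, not compiled;
   the __example_* helper is hypothetical. */
#if 0
static inline __m128i __example_cvtsi32(void) {
  /* Hypothetical usage sketch, not part of this header. */
  return _mm_cvtsi32_si128(42); /* {42, 0, 0, 0} when viewed as four ints */
}
#endif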

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */