/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
   However scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level. There are differences for data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended
   to use portable <fenv.h> instead of accessing MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.
*/
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files.
*/ 43 #include <xmmintrin.h> 44 45 /* SSE2 */ 46 typedef __vector double __v2df; 47 typedef __vector long long __v2di; 48 typedef __vector unsigned long long __v2du; 49 typedef __vector int __v4si; 50 typedef __vector unsigned int __v4su; 51 typedef __vector short __v8hi; 52 typedef __vector unsigned short __v8hu; 53 typedef __vector signed char __v16qi; 54 typedef __vector unsigned char __v16qu; 55 56 /* The Intel API is flexible enough that we must allow aliasing with other 57 vector types, and their scalar components. */ 58 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); 59 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); 60 61 /* Unaligned version of the same types. */ 62 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 63 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); 64 65 /* Define two value permute mask. */ 66 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) 67 68 /* Create a vector with element 0 as F and the rest zero. */ 69 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 70 _mm_set_sd (double __F) 71 { 72 return __extension__ (__m128d){ __F, 0.0 }; 73 } 74 75 /* Create a vector with both elements equal to F. */ 76 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 77 _mm_set1_pd (double __F) 78 { 79 return __extension__ (__m128d){ __F, __F }; 80 } 81 82 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 83 _mm_set_pd1 (double __F) 84 { 85 return _mm_set1_pd (__F); 86 } 87 88 /* Create a vector with the lower value X and upper value W. 
*/ 89 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 90 _mm_set_pd (double __W, double __X) 91 { 92 return __extension__ (__m128d){ __X, __W }; 93 } 94 95 /* Create a vector with the lower value W and upper value X. */ 96 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 97 _mm_setr_pd (double __W, double __X) 98 { 99 return __extension__ (__m128d){ __W, __X }; 100 } 101 102 /* Create an undefined vector. */ 103 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 104 _mm_undefined_pd (void) 105 { 106 __m128d __Y = __Y; 107 return __Y; 108 } 109 110 /* Create a vector of zeros. */ 111 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 112 _mm_setzero_pd (void) 113 { 114 return (__m128d) vec_splats (0); 115 } 116 117 /* Sets the low DPFP value of A from the low value of B. */ 118 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 119 _mm_move_sd (__m128d __A, __m128d __B) 120 { 121 __v2df result = (__v2df) __A; 122 result [0] = ((__v2df) __B)[0]; 123 return (__m128d) result; 124 } 125 126 /* Load two DPFP values from P. The address must be 16-byte aligned. */ 127 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 128 _mm_load_pd (double const *__P) 129 { 130 return ((__m128d)vec_ld(0, (__v16qu*)__P)); 131 } 132 133 /* Load two DPFP values from P. The address need not be 16-byte aligned. */ 134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 135 _mm_loadu_pd (double const *__P) 136 { 137 return (vec_vsx_ld(0, __P)); 138 } 139 140 /* Create a vector with all two elements equal to *P. 
*/ 141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 142 _mm_load1_pd (double const *__P) 143 { 144 return (vec_splats (*__P)); 145 } 146 147 /* Create a vector with element 0 as *P and the rest zero. */ 148 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 149 _mm_load_sd (double const *__P) 150 { 151 return _mm_set_sd (*__P); 152 } 153 154 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 155 _mm_load_pd1 (double const *__P) 156 { 157 return _mm_load1_pd (__P); 158 } 159 160 /* Load two DPFP values in reverse order. The address must be aligned. */ 161 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 162 _mm_loadr_pd (double const *__P) 163 { 164 __v2df __tmp = _mm_load_pd (__P); 165 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); 166 } 167 168 /* Store two DPFP values. The address must be 16-byte aligned. */ 169 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 170 _mm_store_pd (double *__P, __m128d __A) 171 { 172 vec_st((__v16qu)__A, 0, (__v16qu*)__P); 173 } 174 175 /* Store two DPFP values. The address need not be 16-byte aligned. */ 176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 177 _mm_storeu_pd (double *__P, __m128d __A) 178 { 179 *(__m128d_u *)__P = __A; 180 } 181 182 /* Stores the lower DPFP value. 
*/ 183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 184 _mm_store_sd (double *__P, __m128d __A) 185 { 186 *__P = ((__v2df)__A)[0]; 187 } 188 189 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 190 _mm_cvtsd_f64 (__m128d __A) 191 { 192 return ((__v2df)__A)[0]; 193 } 194 195 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 196 _mm_storel_pd (double *__P, __m128d __A) 197 { 198 _mm_store_sd (__P, __A); 199 } 200 201 /* Stores the upper DPFP value. */ 202 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 203 _mm_storeh_pd (double *__P, __m128d __A) 204 { 205 *__P = ((__v2df)__A)[1]; 206 } 207 /* Store the lower DPFP value across two words. 208 The address must be 16-byte aligned. */ 209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 210 _mm_store1_pd (double *__P, __m128d __A) 211 { 212 _mm_store_pd (__P, vec_splat (__A, 0)); 213 } 214 215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 216 _mm_store_pd1 (double *__P, __m128d __A) 217 { 218 _mm_store1_pd (__P, __A); 219 } 220 221 /* Store two DPFP values in reverse order. The address must be aligned. */ 222 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 223 _mm_storer_pd (double *__P, __m128d __A) 224 { 225 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); 226 } 227 228 /* Intel intrinsic. */ 229 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 230 _mm_cvtsi128_si64 (__m128i __A) 231 { 232 return ((__v2di)__A)[0]; 233 } 234 235 /* Microsoft intrinsic. 
*/ 236 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 237 _mm_cvtsi128_si64x (__m128i __A) 238 { 239 return ((__v2di)__A)[0]; 240 } 241 242 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 243 _mm_add_pd (__m128d __A, __m128d __B) 244 { 245 return (__m128d) ((__v2df)__A + (__v2df)__B); 246 } 247 248 /* Add the lower double-precision (64-bit) floating-point element in 249 a and b, store the result in the lower element of dst, and copy 250 the upper element from a to the upper element of dst. */ 251 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 252 _mm_add_sd (__m128d __A, __m128d __B) 253 { 254 __A[0] = __A[0] + __B[0]; 255 return (__A); 256 } 257 258 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 259 _mm_sub_pd (__m128d __A, __m128d __B) 260 { 261 return (__m128d) ((__v2df)__A - (__v2df)__B); 262 } 263 264 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 265 _mm_sub_sd (__m128d __A, __m128d __B) 266 { 267 __A[0] = __A[0] - __B[0]; 268 return (__A); 269 } 270 271 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 272 _mm_mul_pd (__m128d __A, __m128d __B) 273 { 274 return (__m128d) ((__v2df)__A * (__v2df)__B); 275 } 276 277 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 278 _mm_mul_sd (__m128d __A, __m128d __B) 279 { 280 __A[0] = __A[0] * __B[0]; 281 return (__A); 282 } 283 284 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 285 _mm_div_pd (__m128d __A, __m128d __B) 286 { 287 return (__m128d) ((__v2df)__A / (__v2df)__B); 288 } 289 290 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 291 _mm_div_sd (__m128d __A, __m128d __B) 292 { 293 __A[0] = __A[0] / __B[0]; 294 return (__A); 295 } 296 
297 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 298 _mm_sqrt_pd (__m128d __A) 299 { 300 return (vec_sqrt (__A)); 301 } 302 303 /* Return pair {sqrt (B[0]), A[1]}. */ 304 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 305 _mm_sqrt_sd (__m128d __A, __m128d __B) 306 { 307 __v2df c; 308 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); 309 return (__m128d) _mm_setr_pd (c[0], __A[1]); 310 } 311 312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 313 _mm_min_pd (__m128d __A, __m128d __B) 314 { 315 return (vec_min (__A, __B)); 316 } 317 318 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 319 _mm_min_sd (__m128d __A, __m128d __B) 320 { 321 __v2df a, b, c; 322 a = vec_splats (__A[0]); 323 b = vec_splats (__B[0]); 324 c = vec_min (a, b); 325 return (__m128d) _mm_setr_pd (c[0], __A[1]); 326 } 327 328 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 329 _mm_max_pd (__m128d __A, __m128d __B) 330 { 331 return (vec_max (__A, __B)); 332 } 333 334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 335 _mm_max_sd (__m128d __A, __m128d __B) 336 { 337 __v2df a, b, c; 338 a = vec_splats (__A[0]); 339 b = vec_splats (__B[0]); 340 c = vec_max (a, b); 341 return (__m128d) _mm_setr_pd (c[0], __A[1]); 342 } 343 344 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 345 _mm_cmpeq_pd (__m128d __A, __m128d __B) 346 { 347 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); 348 } 349 350 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 351 _mm_cmplt_pd (__m128d __A, __m128d __B) 352 { 353 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 354 } 355 356 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 357 
_mm_cmple_pd (__m128d __A, __m128d __B) 358 { 359 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 360 } 361 362 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 363 _mm_cmpgt_pd (__m128d __A, __m128d __B) 364 { 365 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 366 } 367 368 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 369 _mm_cmpge_pd (__m128d __A, __m128d __B) 370 { 371 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); 372 } 373 374 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 375 _mm_cmpneq_pd (__m128d __A, __m128d __B) 376 { 377 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); 378 return ((__m128d)vec_nor (temp, temp)); 379 } 380 381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 382 _mm_cmpnlt_pd (__m128d __A, __m128d __B) 383 { 384 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); 385 } 386 387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 388 _mm_cmpnle_pd (__m128d __A, __m128d __B) 389 { 390 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); 391 } 392 393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 394 _mm_cmpngt_pd (__m128d __A, __m128d __B) 395 { 396 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); 397 } 398 399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 400 _mm_cmpnge_pd (__m128d __A, __m128d __B) 401 { 402 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); 403 } 404 405 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 406 _mm_cmpord_pd (__m128d __A, __m128d __B) 407 { 408 #if _ARCH_PWR8 409 __v2du c, d; 410 /* Compare against self will return false (0's) if NAN. 
*/ 411 c = (__v2du)vec_cmpeq (__A, __A); 412 d = (__v2du)vec_cmpeq (__B, __B); 413 #else 414 __v2du a, b; 415 __v2du c, d; 416 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; 417 a = (__v2du)vec_abs ((__v2df)__A); 418 b = (__v2du)vec_abs ((__v2df)__B); 419 c = (__v2du)vec_cmpgt (double_exp_mask, a); 420 d = (__v2du)vec_cmpgt (double_exp_mask, b); 421 #endif 422 /* A != NAN and B != NAN. */ 423 return ((__m128d)vec_and(c, d)); 424 } 425 426 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 427 _mm_cmpunord_pd (__m128d __A, __m128d __B) 428 { 429 #if _ARCH_PWR8 430 __v2du c, d; 431 /* Compare against self will return false (0's) if NAN. */ 432 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 433 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 434 /* A == NAN OR B == NAN converts too: 435 NOT(A != NAN) OR NOT(B != NAN). */ 436 c = vec_nor (c, c); 437 return ((__m128d)vec_orc(c, d)); 438 #else 439 __v2du c, d; 440 /* Compare against self will return false (0's) if NAN. */ 441 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); 442 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); 443 /* Convert the true ('1's) is NAN. */ 444 c = vec_nor (c, c); 445 d = vec_nor (d, d); 446 return ((__m128d)vec_or(c, d)); 447 #endif 448 } 449 450 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 451 _mm_cmpeq_sd(__m128d __A, __m128d __B) 452 { 453 __v2df a, b, c; 454 /* PowerISA VSX does not allow partial (for just lower double) 455 results. So to insure we don't generate spurious exceptions 456 (from the upper double values) we splat the lower double 457 before we do the operation. */ 458 a = vec_splats (__A[0]); 459 b = vec_splats (__B[0]); 460 c = (__v2df) vec_cmpeq(a, b); 461 /* Then we merge the lower double result with the original upper 462 double from __A. 
*/ 463 return (__m128d) _mm_setr_pd (c[0], __A[1]); 464 } 465 466 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 467 _mm_cmplt_sd (__m128d __A, __m128d __B) 468 { 469 __v2df a, b, c; 470 a = vec_splats (__A[0]); 471 b = vec_splats (__B[0]); 472 c = (__v2df) vec_cmplt(a, b); 473 return (__m128d) _mm_setr_pd (c[0], __A[1]); 474 } 475 476 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 477 _mm_cmple_sd (__m128d __A, __m128d __B) 478 { 479 __v2df a, b, c; 480 a = vec_splats (__A[0]); 481 b = vec_splats (__B[0]); 482 c = (__v2df) vec_cmple(a, b); 483 return (__m128d) _mm_setr_pd (c[0], __A[1]); 484 } 485 486 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 487 _mm_cmpgt_sd (__m128d __A, __m128d __B) 488 { 489 __v2df a, b, c; 490 a = vec_splats (__A[0]); 491 b = vec_splats (__B[0]); 492 c = (__v2df) vec_cmpgt(a, b); 493 return (__m128d) _mm_setr_pd (c[0], __A[1]); 494 } 495 496 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 497 _mm_cmpge_sd (__m128d __A, __m128d __B) 498 { 499 __v2df a, b, c; 500 a = vec_splats (__A[0]); 501 b = vec_splats (__B[0]); 502 c = (__v2df) vec_cmpge(a, b); 503 return (__m128d) _mm_setr_pd (c[0], __A[1]); 504 } 505 506 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 507 _mm_cmpneq_sd (__m128d __A, __m128d __B) 508 { 509 __v2df a, b, c; 510 a = vec_splats (__A[0]); 511 b = vec_splats (__B[0]); 512 c = (__v2df) vec_cmpeq(a, b); 513 c = vec_nor (c, c); 514 return (__m128d) _mm_setr_pd (c[0], __A[1]); 515 } 516 517 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 518 _mm_cmpnlt_sd (__m128d __A, __m128d __B) 519 { 520 __v2df a, b, c; 521 a = vec_splats (__A[0]); 522 b = vec_splats (__B[0]); 523 /* Not less than is just greater than or equal. 
*/ 524 c = (__v2df) vec_cmpge(a, b); 525 return (__m128d) _mm_setr_pd (c[0], __A[1]); 526 } 527 528 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 529 _mm_cmpnle_sd (__m128d __A, __m128d __B) 530 { 531 __v2df a, b, c; 532 a = vec_splats (__A[0]); 533 b = vec_splats (__B[0]); 534 /* Not less than or equal is just greater than. */ 535 c = (__v2df) vec_cmpge(a, b); 536 return (__m128d) _mm_setr_pd (c[0], __A[1]); 537 } 538 539 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 540 _mm_cmpngt_sd (__m128d __A, __m128d __B) 541 { 542 __v2df a, b, c; 543 a = vec_splats (__A[0]); 544 b = vec_splats (__B[0]); 545 /* Not greater than is just less than or equal. */ 546 c = (__v2df) vec_cmple(a, b); 547 return (__m128d) _mm_setr_pd (c[0], __A[1]); 548 } 549 550 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 551 _mm_cmpnge_sd (__m128d __A, __m128d __B) 552 { 553 __v2df a, b, c; 554 a = vec_splats (__A[0]); 555 b = vec_splats (__B[0]); 556 /* Not greater than or equal is just less than. */ 557 c = (__v2df) vec_cmplt(a, b); 558 return (__m128d) _mm_setr_pd (c[0], __A[1]); 559 } 560 561 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 562 _mm_cmpord_sd (__m128d __A, __m128d __B) 563 { 564 __v2df r; 565 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 566 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); 567 } 568 569 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 570 _mm_cmpunord_sd (__m128d __A, __m128d __B) 571 { 572 __v2df r; 573 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); 574 return (__m128d) _mm_setr_pd (r[0], __A[1]); 575 } 576 577 /* FIXME 578 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are 579 exactly the same because GCC for PowerPC only generates unordered 580 compares (scalar and vector). 
581 Technically __mm_comieq_sp et all should be using the ordered 582 compare and signal for QNaNs. The __mm_ucomieq_sd et all should 583 be OK. */ 584 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 585 _mm_comieq_sd (__m128d __A, __m128d __B) 586 { 587 return (__A[0] == __B[0]); 588 } 589 590 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 591 _mm_comilt_sd (__m128d __A, __m128d __B) 592 { 593 return (__A[0] < __B[0]); 594 } 595 596 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 597 _mm_comile_sd (__m128d __A, __m128d __B) 598 { 599 return (__A[0] <= __B[0]); 600 } 601 602 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 603 _mm_comigt_sd (__m128d __A, __m128d __B) 604 { 605 return (__A[0] > __B[0]); 606 } 607 608 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 609 _mm_comige_sd (__m128d __A, __m128d __B) 610 { 611 return (__A[0] >= __B[0]); 612 } 613 614 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 615 _mm_comineq_sd (__m128d __A, __m128d __B) 616 { 617 return (__A[0] != __B[0]); 618 } 619 620 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 621 _mm_ucomieq_sd (__m128d __A, __m128d __B) 622 { 623 return (__A[0] == __B[0]); 624 } 625 626 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 627 _mm_ucomilt_sd (__m128d __A, __m128d __B) 628 { 629 return (__A[0] < __B[0]); 630 } 631 632 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 633 _mm_ucomile_sd (__m128d __A, __m128d __B) 634 { 635 return (__A[0] <= __B[0]); 636 } 637 638 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 639 _mm_ucomigt_sd (__m128d __A, __m128d __B) 640 { 641 return (__A[0] > __B[0]); 642 } 643 644 extern __inline 
int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 645 _mm_ucomige_sd (__m128d __A, __m128d __B) 646 { 647 return (__A[0] >= __B[0]); 648 } 649 650 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 651 _mm_ucomineq_sd (__m128d __A, __m128d __B) 652 { 653 return (__A[0] != __B[0]); 654 } 655 656 /* Create a vector of Qi, where i is the element number. */ 657 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 658 _mm_set_epi64x (long long __q1, long long __q0) 659 { 660 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 661 } 662 663 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 664 _mm_set_epi64 (__m64 __q1, __m64 __q0) 665 { 666 return _mm_set_epi64x ((long long)__q1, (long long)__q0); 667 } 668 669 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 670 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) 671 { 672 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; 673 } 674 675 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 676 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, 677 short __q3, short __q2, short __q1, short __q0) 678 { 679 return __extension__ (__m128i)(__v8hi){ 680 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; 681 } 682 683 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 684 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, 685 char __q11, char __q10, char __q09, char __q08, 686 char __q07, char __q06, char __q05, char __q04, 687 char __q03, char __q02, char __q01, char __q00) 688 { 689 return __extension__ (__m128i)(__v16qi){ 690 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 691 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 692 }; 693 } 694 695 /* Set all of the elements of the vector to A. 
*/ 696 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 697 _mm_set1_epi64x (long long __A) 698 { 699 return _mm_set_epi64x (__A, __A); 700 } 701 702 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 703 _mm_set1_epi64 (__m64 __A) 704 { 705 return _mm_set_epi64 (__A, __A); 706 } 707 708 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 709 _mm_set1_epi32 (int __A) 710 { 711 return _mm_set_epi32 (__A, __A, __A, __A); 712 } 713 714 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 715 _mm_set1_epi16 (short __A) 716 { 717 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); 718 } 719 720 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 721 _mm_set1_epi8 (char __A) 722 { 723 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, 724 __A, __A, __A, __A, __A, __A, __A, __A); 725 } 726 727 /* Create a vector of Qi, where i is the element number. 728 The parameter order is reversed from the _mm_set_epi* functions. 
*/ 729 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 730 _mm_setr_epi64 (__m64 __q0, __m64 __q1) 731 { 732 return _mm_set_epi64 (__q1, __q0); 733 } 734 735 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 736 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) 737 { 738 return _mm_set_epi32 (__q3, __q2, __q1, __q0); 739 } 740 741 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 742 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, 743 short __q4, short __q5, short __q6, short __q7) 744 { 745 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 746 } 747 748 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 749 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, 750 char __q04, char __q05, char __q06, char __q07, 751 char __q08, char __q09, char __q10, char __q11, 752 char __q12, char __q13, char __q14, char __q15) 753 { 754 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 755 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 756 } 757 758 /* Create a vector with element 0 as *P and the rest zero. 
*/ 759 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 760 _mm_load_si128 (__m128i const *__P) 761 { 762 return *__P; 763 } 764 765 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 766 _mm_loadu_si128 (__m128i_u const *__P) 767 { 768 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); 769 } 770 771 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 772 _mm_loadl_epi64 (__m128i_u const *__P) 773 { 774 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); 775 } 776 777 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 778 _mm_store_si128 (__m128i *__P, __m128i __B) 779 { 780 vec_st ((__v16qu) __B, 0, (__v16qu*)__P); 781 } 782 783 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 784 _mm_storeu_si128 (__m128i_u *__P, __m128i __B) 785 { 786 *__P = __B; 787 } 788 789 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 790 _mm_storel_epi64 (__m128i_u *__P, __m128i __B) 791 { 792 *(long long *)__P = ((__v2di)__B)[0]; 793 } 794 795 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 796 _mm_movepi64_pi64 (__m128i_u __B) 797 { 798 return (__m64) ((__v2di)__B)[0]; 799 } 800 801 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 802 _mm_movpi64_epi64 (__m64 __A) 803 { 804 return _mm_set_epi64 ((__m64)0LL, __A); 805 } 806 807 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 808 _mm_move_epi64 (__m128i __A) 809 { 810 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); 811 } 812 813 /* Create an undefined vector. */ 814 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 815 _mm_undefined_si128 (void) 816 { 817 __m128i __Y = __Y; 818 return __Y; 819 } 820 821 /* Create a vector of zeros. 
*/ 822 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 823 _mm_setzero_si128 (void) 824 { 825 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; 826 } 827 828 #ifdef _ARCH_PWR8 829 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 830 _mm_cvtepi32_pd (__m128i __A) 831 { 832 __v2di val; 833 /* For LE need to generate Vector Unpack Low Signed Word. 834 Which is generated from unpackh. */ 835 val = (__v2di)vec_unpackh ((__v4si)__A); 836 837 return (__m128d)vec_ctf (val, 0); 838 } 839 #endif 840 841 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 842 _mm_cvtepi32_ps (__m128i __A) 843 { 844 return ((__m128)vec_ctf((__v4si)__A, 0)); 845 } 846 847 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 848 _mm_cvtpd_epi32 (__m128d __A) 849 { 850 __v2df rounded = vec_rint (__A); 851 __v4si result, temp; 852 const __v4si vzero = 853 { 0, 0, 0, 0 }; 854 855 /* VSX Vector truncate Double-Precision to integer and Convert to 856 Signed Integer Word format with Saturate. 
*/ 857 __asm__( 858 "xvcvdpsxws %x0,%x1" 859 : "=wa" (temp) 860 : "wa" (rounded) 861 : ); 862 863 #ifdef _ARCH_PWR8 864 temp = vec_mergeo (temp, temp); 865 result = (__v4si) vec_vpkudum ((__vector long long) temp, 866 (__vector long long) vzero); 867 #else 868 { 869 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 870 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 871 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 872 } 873 #endif 874 return (__m128i) result; 875 } 876 877 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 878 _mm_cvtpd_pi32 (__m128d __A) 879 { 880 __m128i result = _mm_cvtpd_epi32(__A); 881 882 return (__m64) result[0]; 883 } 884 885 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 886 _mm_cvtpd_ps (__m128d __A) 887 { 888 __v4sf result; 889 __v4si temp; 890 const __v4si vzero = { 0, 0, 0, 0 }; 891 892 __asm__( 893 "xvcvdpsp %x0,%x1" 894 : "=wa" (temp) 895 : "wa" (__A) 896 : ); 897 898 #ifdef _ARCH_PWR8 899 temp = vec_mergeo (temp, temp); 900 result = (__v4sf) vec_vpkudum ((__vector long long) temp, 901 (__vector long long) vzero); 902 #else 903 { 904 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 905 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 906 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 907 } 908 #endif 909 return ((__m128)result); 910 } 911 912 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 913 _mm_cvttpd_epi32 (__m128d __A) 914 { 915 __v4si result; 916 __v4si temp; 917 const __v4si vzero = { 0, 0, 0, 0 }; 918 919 /* VSX Vector truncate Double-Precision to integer and Convert to 920 Signed Integer Word format with Saturate. 
*/ 921 __asm__( 922 "xvcvdpsxws %x0,%x1" 923 : "=wa" (temp) 924 : "wa" (__A) 925 : ); 926 927 #ifdef _ARCH_PWR8 928 temp = vec_mergeo (temp, temp); 929 result = (__v4si) vec_vpkudum ((__vector long long) temp, 930 (__vector long long) vzero); 931 #else 932 { 933 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 934 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f }; 935 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm); 936 } 937 #endif 938 939 return ((__m128i) result); 940 } 941 942 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 943 _mm_cvttpd_pi32 (__m128d __A) 944 { 945 __m128i result = _mm_cvttpd_epi32 (__A); 946 947 return (__m64) result[0]; 948 } 949 950 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 951 _mm_cvtsi128_si32 (__m128i __A) 952 { 953 return ((__v4si)__A)[0]; 954 } 955 956 #ifdef _ARCH_PWR8 957 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 958 _mm_cvtpi32_pd (__m64 __A) 959 { 960 __v4si temp; 961 __v2di tmp2; 962 __v2df result; 963 964 temp = (__v4si)vec_splats (__A); 965 tmp2 = (__v2di)vec_unpackl (temp); 966 result = vec_ctf ((__vector signed long long) tmp2, 0); 967 return (__m128d)result; 968 } 969 #endif 970 971 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 972 _mm_cvtps_epi32 (__m128 __A) 973 { 974 __v4sf rounded; 975 __v4si result; 976 977 rounded = vec_rint((__v4sf) __A); 978 result = vec_cts (rounded, 0); 979 return (__m128i) result; 980 } 981 982 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 983 _mm_cvttps_epi32 (__m128 __A) 984 { 985 __v4si result; 986 987 result = vec_cts ((__v4sf) __A, 0); 988 return (__m128i) result; 989 } 990 991 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 992 _mm_cvtps_pd (__m128 __A) 993 { 994 /* Check if vec_doubleh 
is defined by <altivec.h>. If so use that. */ 995 #ifdef vec_doubleh 996 return (__m128d) vec_doubleh ((__v4sf)__A); 997 #else 998 /* Otherwise the compiler is not current and so need to generate the 999 equivalent code. */ 1000 __v4sf a = (__v4sf)__A; 1001 __v4sf temp; 1002 __v2df result; 1003 #ifdef __LITTLE_ENDIAN__ 1004 /* The input float values are in elements {[0], [1]} but the convert 1005 instruction needs them in elements {[1], [3]}, So we use two 1006 shift left double vector word immediates to get the elements 1007 lined up. */ 1008 temp = __builtin_vsx_xxsldwi (a, a, 3); 1009 temp = __builtin_vsx_xxsldwi (a, temp, 2); 1010 #else 1011 /* The input float values are in elements {[0], [1]} but the convert 1012 instruction needs them in elements {[0], [2]}, So we use two 1013 shift left double vector word immediates to get the elements 1014 lined up. */ 1015 temp = vec_vmrghw (a, a); 1016 #endif 1017 __asm__( 1018 " xvcvspdp %x0,%x1" 1019 : "=wa" (result) 1020 : "wa" (temp) 1021 : ); 1022 return (__m128d) result; 1023 #endif 1024 } 1025 1026 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1027 _mm_cvtsd_si32 (__m128d __A) 1028 { 1029 __v2df rounded = vec_rint((__v2df) __A); 1030 int result = ((__v2df)rounded)[0]; 1031 1032 return result; 1033 } 1034 /* Intel intrinsic. */ 1035 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1036 _mm_cvtsd_si64 (__m128d __A) 1037 { 1038 __v2df rounded = vec_rint ((__v2df) __A ); 1039 long long result = ((__v2df) rounded)[0]; 1040 1041 return result; 1042 } 1043 1044 /* Microsoft intrinsic. 
*/ 1045 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1046 _mm_cvtsd_si64x (__m128d __A) 1047 { 1048 return _mm_cvtsd_si64 ((__v2df)__A); 1049 } 1050 1051 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1052 _mm_cvttsd_si32 (__m128d __A) 1053 { 1054 int result = ((__v2df)__A)[0]; 1055 1056 return result; 1057 } 1058 1059 /* Intel intrinsic. */ 1060 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1061 _mm_cvttsd_si64 (__m128d __A) 1062 { 1063 long long result = ((__v2df)__A)[0]; 1064 1065 return result; 1066 } 1067 1068 /* Microsoft intrinsic. */ 1069 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1070 _mm_cvttsd_si64x (__m128d __A) 1071 { 1072 return _mm_cvttsd_si64 (__A); 1073 } 1074 1075 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1076 _mm_cvtsd_ss (__m128 __A, __m128d __B) 1077 { 1078 __v4sf result = (__v4sf)__A; 1079 1080 #ifdef __LITTLE_ENDIAN__ 1081 __v4sf temp_s; 1082 /* Copy double element[0] to element [1] for conversion. */ 1083 __v2df temp_b = vec_splat((__v2df)__B, 0); 1084 1085 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1086 result = __builtin_vsx_xxsldwi (result, result, 3); 1087 /* Convert double to single float scalar in a vector. */ 1088 __asm__( 1089 "xscvdpsp %x0,%x1" 1090 : "=wa" (temp_s) 1091 : "wa" (temp_b) 1092 : ); 1093 /* Shift the resulting scalar into vector element [0]. 
*/ 1094 result = __builtin_vsx_xxsldwi (result, temp_s, 1); 1095 #else 1096 result [0] = ((__v2df)__B)[0]; 1097 #endif 1098 return (__m128) result; 1099 } 1100 1101 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1102 _mm_cvtsi32_sd (__m128d __A, int __B) 1103 { 1104 __v2df result = (__v2df)__A; 1105 double db = __B; 1106 result [0] = db; 1107 return (__m128d)result; 1108 } 1109 1110 /* Intel intrinsic. */ 1111 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1112 _mm_cvtsi64_sd (__m128d __A, long long __B) 1113 { 1114 __v2df result = (__v2df)__A; 1115 double db = __B; 1116 result [0] = db; 1117 return (__m128d)result; 1118 } 1119 1120 /* Microsoft intrinsic. */ 1121 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1122 _mm_cvtsi64x_sd (__m128d __A, long long __B) 1123 { 1124 return _mm_cvtsi64_sd (__A, __B); 1125 } 1126 1127 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1128 _mm_cvtss_sd (__m128d __A, __m128 __B) 1129 { 1130 #ifdef __LITTLE_ENDIAN__ 1131 /* Use splat to move element [0] into position for the convert. */ 1132 __v4sf temp = vec_splat ((__v4sf)__B, 0); 1133 __v2df res; 1134 /* Convert single float scalar to double in a vector. 
*/ 1135 __asm__( 1136 "xscvspdp %x0,%x1" 1137 : "=wa" (res) 1138 : "wa" (temp) 1139 : ); 1140 return (__m128d) vec_mergel (res, (__v2df)__A); 1141 #else 1142 __v2df res = (__v2df)__A; 1143 res [0] = ((__v4sf)__B) [0]; 1144 return (__m128d) res; 1145 #endif 1146 } 1147 1148 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1149 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) 1150 { 1151 __vector double result; 1152 const int litmsk = __mask & 0x3; 1153 1154 if (litmsk == 0) 1155 result = vec_mergeh (__A, __B); 1156 #if __GNUC__ < 6 1157 else if (litmsk == 1) 1158 result = vec_xxpermdi (__B, __A, 2); 1159 else if (litmsk == 2) 1160 result = vec_xxpermdi (__B, __A, 1); 1161 #else 1162 else if (litmsk == 1) 1163 result = vec_xxpermdi (__A, __B, 2); 1164 else if (litmsk == 2) 1165 result = vec_xxpermdi (__A, __B, 1); 1166 #endif 1167 else 1168 result = vec_mergel (__A, __B); 1169 1170 return result; 1171 } 1172 1173 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1174 _mm_unpackhi_pd (__m128d __A, __m128d __B) 1175 { 1176 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); 1177 } 1178 1179 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1180 _mm_unpacklo_pd (__m128d __A, __m128d __B) 1181 { 1182 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); 1183 } 1184 1185 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1186 _mm_loadh_pd (__m128d __A, double const *__B) 1187 { 1188 __v2df result = (__v2df)__A; 1189 result [1] = *__B; 1190 return (__m128d)result; 1191 } 1192 1193 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1194 _mm_loadl_pd (__m128d __A, double const *__B) 1195 { 1196 __v2df result = (__v2df)__A; 1197 result [0] = *__B; 1198 return (__m128d)result; 1199 } 1200 1201 #ifdef _ARCH_PWR8 1202 /* Intrinsic functions that require PowerISA 
2.07 minimum. */ 1203 1204 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */ 1205 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1206 _mm_movemask_pd (__m128d __A) 1207 { 1208 __vector unsigned long long result; 1209 static const __vector unsigned int perm_mask = 1210 { 1211 #ifdef __LITTLE_ENDIAN__ 1212 0x80800040, 0x80808080, 0x80808080, 0x80808080 1213 #else 1214 0x80808080, 0x80808080, 0x80808080, 0x80804000 1215 #endif 1216 }; 1217 1218 result = ((__vector unsigned long long) 1219 vec_vbpermq ((__vector unsigned char) __A, 1220 (__vector unsigned char) perm_mask)); 1221 1222 #ifdef __LITTLE_ENDIAN__ 1223 return result[1]; 1224 #else 1225 return result[0]; 1226 #endif 1227 } 1228 #endif /* _ARCH_PWR8 */ 1229 1230 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1231 _mm_packs_epi16 (__m128i __A, __m128i __B) 1232 { 1233 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); 1234 } 1235 1236 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1237 _mm_packs_epi32 (__m128i __A, __m128i __B) 1238 { 1239 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); 1240 } 1241 1242 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1243 _mm_packus_epi16 (__m128i __A, __m128i __B) 1244 { 1245 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); 1246 } 1247 1248 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1249 _mm_unpackhi_epi8 (__m128i __A, __m128i __B) 1250 { 1251 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); 1252 } 1253 1254 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1255 _mm_unpackhi_epi16 (__m128i __A, __m128i __B) 1256 { 1257 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); 1258 } 1259 1260 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) 1261 _mm_unpackhi_epi32 (__m128i __A, __m128i __B) 1262 { 1263 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); 1264 } 1265 1266 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1267 _mm_unpackhi_epi64 (__m128i __A, __m128i __B) 1268 { 1269 return (__m128i) vec_mergel ((__vector long long) __A, 1270 (__vector long long) __B); 1271 } 1272 1273 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1274 _mm_unpacklo_epi8 (__m128i __A, __m128i __B) 1275 { 1276 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); 1277 } 1278 1279 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1280 _mm_unpacklo_epi16 (__m128i __A, __m128i __B) 1281 { 1282 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); 1283 } 1284 1285 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1286 _mm_unpacklo_epi32 (__m128i __A, __m128i __B) 1287 { 1288 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); 1289 } 1290 1291 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1292 _mm_unpacklo_epi64 (__m128i __A, __m128i __B) 1293 { 1294 return (__m128i) vec_mergeh ((__vector long long) __A, 1295 (__vector long long) __B); 1296 } 1297 1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1299 _mm_add_epi8 (__m128i __A, __m128i __B) 1300 { 1301 return (__m128i) ((__v16qu)__A + (__v16qu)__B); 1302 } 1303 1304 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1305 _mm_add_epi16 (__m128i __A, __m128i __B) 1306 { 1307 return (__m128i) ((__v8hu)__A + (__v8hu)__B); 1308 } 1309 1310 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1311 _mm_add_epi32 (__m128i __A, __m128i __B) 1312 { 1313 return (__m128i) ((__v4su)__A + (__v4su)__B); 1314 } 1315 1316 extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1317 _mm_add_epi64 (__m128i __A, __m128i __B) 1318 { 1319 return (__m128i) ((__v2du)__A + (__v2du)__B); 1320 } 1321 1322 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1323 _mm_adds_epi8 (__m128i __A, __m128i __B) 1324 { 1325 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); 1326 } 1327 1328 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1329 _mm_adds_epi16 (__m128i __A, __m128i __B) 1330 { 1331 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); 1332 } 1333 1334 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1335 _mm_adds_epu8 (__m128i __A, __m128i __B) 1336 { 1337 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); 1338 } 1339 1340 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1341 _mm_adds_epu16 (__m128i __A, __m128i __B) 1342 { 1343 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); 1344 } 1345 1346 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1347 _mm_sub_epi8 (__m128i __A, __m128i __B) 1348 { 1349 return (__m128i) ((__v16qu)__A - (__v16qu)__B); 1350 } 1351 1352 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1353 _mm_sub_epi16 (__m128i __A, __m128i __B) 1354 { 1355 return (__m128i) ((__v8hu)__A - (__v8hu)__B); 1356 } 1357 1358 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1359 _mm_sub_epi32 (__m128i __A, __m128i __B) 1360 { 1361 return (__m128i) ((__v4su)__A - (__v4su)__B); 1362 } 1363 1364 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1365 _mm_sub_epi64 (__m128i __A, __m128i __B) 1366 { 1367 return (__m128i) ((__v2du)__A - (__v2du)__B); 1368 } 1369 1370 extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) 1371 _mm_subs_epi8 (__m128i __A, __m128i __B) 1372 { 1373 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); 1374 } 1375 1376 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1377 _mm_subs_epi16 (__m128i __A, __m128i __B) 1378 { 1379 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); 1380 } 1381 1382 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1383 _mm_subs_epu8 (__m128i __A, __m128i __B) 1384 { 1385 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); 1386 } 1387 1388 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1389 _mm_subs_epu16 (__m128i __A, __m128i __B) 1390 { 1391 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); 1392 } 1393 1394 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1395 _mm_madd_epi16 (__m128i __A, __m128i __B) 1396 { 1397 __vector signed int zero = {0, 0, 0, 0}; 1398 1399 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); 1400 } 1401 1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1403 _mm_mulhi_epi16 (__m128i __A, __m128i __B) 1404 { 1405 __vector signed int w0, w1; 1406 1407 __vector unsigned char xform1 = { 1408 #ifdef __LITTLE_ENDIAN__ 1409 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1410 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1411 #else 1412 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 1413 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1414 #endif 1415 }; 1416 1417 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); 1418 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); 1419 return (__m128i) vec_perm (w0, w1, xform1); 1420 } 1421 1422 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1423 _mm_mullo_epi16 (__m128i __A, __m128i __B) 1424 { 1425 return (__m128i) ((__v8hi)__A * (__v8hi)__B); 1426 } 1427 1428 extern 
__inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1429 _mm_mul_su32 (__m64 __A, __m64 __B) 1430 { 1431 unsigned int a = __A; 1432 unsigned int b = __B; 1433 1434 return ((__m64)a * (__m64)b); 1435 } 1436 1437 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1438 _mm_mul_epu32 (__m128i __A, __m128i __B) 1439 { 1440 #if __GNUC__ < 8 1441 __v2du result; 1442 1443 #ifdef __LITTLE_ENDIAN__ 1444 /* VMX Vector Multiply Odd Unsigned Word. */ 1445 __asm__( 1446 "vmulouw %0,%1,%2" 1447 : "=v" (result) 1448 : "v" (__A), "v" (__B) 1449 : ); 1450 #else 1451 /* VMX Vector Multiply Even Unsigned Word. */ 1452 __asm__( 1453 "vmuleuw %0,%1,%2" 1454 : "=v" (result) 1455 : "v" (__A), "v" (__B) 1456 : ); 1457 #endif 1458 return (__m128i) result; 1459 #else 1460 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); 1461 #endif 1462 } 1463 1464 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1465 _mm_slli_epi16 (__m128i __A, int __B) 1466 { 1467 __v8hu lshift; 1468 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1469 1470 if (__B >= 0 && __B < 16) 1471 { 1472 if (__builtin_constant_p(__B)) 1473 lshift = (__v8hu) vec_splat_s16(__B); 1474 else 1475 lshift = vec_splats ((unsigned short) __B); 1476 1477 result = vec_sl ((__v8hi) __A, lshift); 1478 } 1479 1480 return (__m128i) result; 1481 } 1482 1483 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1484 _mm_slli_epi32 (__m128i __A, int __B) 1485 { 1486 __v4su lshift; 1487 __v4si result = { 0, 0, 0, 0 }; 1488 1489 if (__B >= 0 && __B < 32) 1490 { 1491 if (__builtin_constant_p(__B) && __B < 16) 1492 lshift = (__v4su) vec_splat_s32(__B); 1493 else 1494 lshift = vec_splats ((unsigned int) __B); 1495 1496 result = vec_sl ((__v4si) __A, lshift); 1497 } 1498 1499 return (__m128i) result; 1500 } 1501 1502 #ifdef _ARCH_PWR8 1503 extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) 1504 _mm_slli_epi64 (__m128i __A, int __B) 1505 { 1506 __v2du lshift; 1507 __v2di result = { 0, 0 }; 1508 1509 if (__B >= 0 && __B < 64) 1510 { 1511 if (__builtin_constant_p(__B) && __B < 16) 1512 lshift = (__v2du) vec_splat_s32(__B); 1513 else 1514 lshift = (__v2du) vec_splats ((unsigned int) __B); 1515 1516 result = vec_sl ((__v2di) __A, lshift); 1517 } 1518 1519 return (__m128i) result; 1520 } 1521 #endif 1522 1523 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1524 _mm_srai_epi16 (__m128i __A, int __B) 1525 { 1526 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1527 __v8hi result; 1528 1529 if (__B < 16) 1530 { 1531 if (__builtin_constant_p(__B)) 1532 rshift = (__v8hu) vec_splat_s16(__B); 1533 else 1534 rshift = vec_splats ((unsigned short) __B); 1535 } 1536 result = vec_sra ((__v8hi) __A, rshift); 1537 1538 return (__m128i) result; 1539 } 1540 1541 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1542 _mm_srai_epi32 (__m128i __A, int __B) 1543 { 1544 __v4su rshift = { 31, 31, 31, 31 }; 1545 __v4si result; 1546 1547 if (__B < 32) 1548 { 1549 if (__builtin_constant_p(__B)) 1550 { 1551 if (__B < 16) 1552 rshift = (__v4su) vec_splat_s32(__B); 1553 else 1554 rshift = (__v4su) vec_splats((unsigned int)__B); 1555 } 1556 else 1557 rshift = vec_splats ((unsigned int) __B); 1558 } 1559 result = vec_sra ((__v4si) __A, rshift); 1560 1561 return (__m128i) result; 1562 } 1563 1564 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1565 _mm_bslli_si128 (__m128i __A, const int __N) 1566 { 1567 __v16qu result; 1568 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1569 1570 if (__N < 16) 1571 result = vec_sld ((__v16qu) __A, zeros, __N); 1572 else 1573 result = zeros; 1574 1575 return (__m128i) result; 1576 } 1577 1578 extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) 1579 _mm_bsrli_si128 (__m128i __A, const int __N) 1580 { 1581 __v16qu result; 1582 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1583 1584 if (__N < 16) 1585 #ifdef __LITTLE_ENDIAN__ 1586 if (__builtin_constant_p(__N)) 1587 /* Would like to use Vector Shift Left Double by Octet 1588 Immediate here to use the immediate form and avoid 1589 load of __N * 8 value into a separate VR. */ 1590 result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); 1591 else 1592 #endif 1593 { 1594 __v16qu shift = vec_splats((unsigned char)(__N*8)); 1595 #ifdef __LITTLE_ENDIAN__ 1596 result = vec_sro ((__v16qu)__A, shift); 1597 #else 1598 result = vec_slo ((__v16qu)__A, shift); 1599 #endif 1600 } 1601 else 1602 result = zeros; 1603 1604 return (__m128i) result; 1605 } 1606 1607 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1608 _mm_srli_si128 (__m128i __A, const int __N) 1609 { 1610 return _mm_bsrli_si128 (__A, __N); 1611 } 1612 1613 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1614 _mm_slli_si128 (__m128i __A, const int _imm5) 1615 { 1616 __v16qu result; 1617 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 1618 1619 if (_imm5 < 16) 1620 #ifdef __LITTLE_ENDIAN__ 1621 result = vec_sld ((__v16qu) __A, zeros, _imm5); 1622 #else 1623 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); 1624 #endif 1625 else 1626 result = zeros; 1627 1628 return (__m128i) result; 1629 } 1630 1631 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1632 1633 _mm_srli_epi16 (__m128i __A, int __B) 1634 { 1635 __v8hu rshift; 1636 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; 1637 1638 if (__B < 16) 1639 { 1640 if (__builtin_constant_p(__B)) 1641 rshift = (__v8hu) vec_splat_s16(__B); 1642 else 1643 rshift = vec_splats ((unsigned short) __B); 1644 1645 result = vec_sr ((__v8hi) __A, rshift); 1646 } 1647 
1648 return (__m128i) result; 1649 } 1650 1651 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1652 _mm_srli_epi32 (__m128i __A, int __B) 1653 { 1654 __v4su rshift; 1655 __v4si result = { 0, 0, 0, 0 }; 1656 1657 if (__B < 32) 1658 { 1659 if (__builtin_constant_p(__B)) 1660 { 1661 if (__B < 16) 1662 rshift = (__v4su) vec_splat_s32(__B); 1663 else 1664 rshift = (__v4su) vec_splats((unsigned int)__B); 1665 } 1666 else 1667 rshift = vec_splats ((unsigned int) __B); 1668 1669 result = vec_sr ((__v4si) __A, rshift); 1670 } 1671 1672 return (__m128i) result; 1673 } 1674 1675 #ifdef _ARCH_PWR8 1676 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1677 _mm_srli_epi64 (__m128i __A, int __B) 1678 { 1679 __v2du rshift; 1680 __v2di result = { 0, 0 }; 1681 1682 if (__B < 64) 1683 { 1684 if (__builtin_constant_p(__B)) 1685 { 1686 if (__B < 16) 1687 rshift = (__v2du) vec_splat_s32(__B); 1688 else 1689 rshift = (__v2du) vec_splats((unsigned long long)__B); 1690 } 1691 else 1692 rshift = (__v2du) vec_splats ((unsigned int) __B); 1693 1694 result = vec_sr ((__v2di) __A, rshift); 1695 } 1696 1697 return (__m128i) result; 1698 } 1699 #endif 1700 1701 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1702 _mm_sll_epi16 (__m128i __A, __m128i __B) 1703 { 1704 __v8hu lshift; 1705 __vector __bool short shmask; 1706 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1707 __v8hu result; 1708 1709 #ifdef __LITTLE_ENDIAN__ 1710 lshift = vec_splat ((__v8hu) __B, 0); 1711 #else 1712 lshift = vec_splat ((__v8hu) __B, 3); 1713 #endif 1714 shmask = vec_cmple (lshift, shmax); 1715 result = vec_sl ((__v8hu) __A, lshift); 1716 result = vec_sel ((__v8hu) shmask, result, shmask); 1717 1718 return (__m128i) result; 1719 } 1720 1721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1722 _mm_sll_epi32 (__m128i __A, __m128i __B) 1723 { 
1724 __v4su lshift; 1725 __vector __bool int shmask; 1726 const __v4su shmax = { 32, 32, 32, 32 }; 1727 __v4su result; 1728 #ifdef __LITTLE_ENDIAN__ 1729 lshift = vec_splat ((__v4su) __B, 0); 1730 #else 1731 lshift = vec_splat ((__v4su) __B, 1); 1732 #endif 1733 shmask = vec_cmplt (lshift, shmax); 1734 result = vec_sl ((__v4su) __A, lshift); 1735 result = vec_sel ((__v4su) shmask, result, shmask); 1736 1737 return (__m128i) result; 1738 } 1739 1740 #ifdef _ARCH_PWR8 1741 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1742 _mm_sll_epi64 (__m128i __A, __m128i __B) 1743 { 1744 __v2du lshift; 1745 __vector __bool long long shmask; 1746 const __v2du shmax = { 64, 64 }; 1747 __v2du result; 1748 1749 lshift = vec_splat ((__v2du) __B, 0); 1750 shmask = vec_cmplt (lshift, shmax); 1751 result = vec_sl ((__v2du) __A, lshift); 1752 result = vec_sel ((__v2du) shmask, result, shmask); 1753 1754 return (__m128i) result; 1755 } 1756 #endif 1757 1758 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1759 _mm_sra_epi16 (__m128i __A, __m128i __B) 1760 { 1761 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1762 __v8hu rshift; 1763 __v8hi result; 1764 1765 #ifdef __LITTLE_ENDIAN__ 1766 rshift = vec_splat ((__v8hu)__B, 0); 1767 #else 1768 rshift = vec_splat ((__v8hu)__B, 3); 1769 #endif 1770 rshift = vec_min (rshift, rshmax); 1771 result = vec_sra ((__v8hi) __A, rshift); 1772 1773 return (__m128i) result; 1774 } 1775 1776 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1777 _mm_sra_epi32 (__m128i __A, __m128i __B) 1778 { 1779 const __v4su rshmax = { 31, 31, 31, 31 }; 1780 __v4su rshift; 1781 __v4si result; 1782 1783 #ifdef __LITTLE_ENDIAN__ 1784 rshift = vec_splat ((__v4su)__B, 0); 1785 #else 1786 rshift = vec_splat ((__v4su)__B, 1); 1787 #endif 1788 rshift = vec_min (rshift, rshmax); 1789 result = vec_sra ((__v4si) __A, rshift); 1790 1791 return 
(__m128i) result; 1792 } 1793 1794 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1795 _mm_srl_epi16 (__m128i __A, __m128i __B) 1796 { 1797 __v8hu rshift; 1798 __vector __bool short shmask; 1799 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; 1800 __v8hu result; 1801 1802 #ifdef __LITTLE_ENDIAN__ 1803 rshift = vec_splat ((__v8hu) __B, 0); 1804 #else 1805 rshift = vec_splat ((__v8hu) __B, 3); 1806 #endif 1807 shmask = vec_cmple (rshift, shmax); 1808 result = vec_sr ((__v8hu) __A, rshift); 1809 result = vec_sel ((__v8hu) shmask, result, shmask); 1810 1811 return (__m128i) result; 1812 } 1813 1814 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1815 _mm_srl_epi32 (__m128i __A, __m128i __B) 1816 { 1817 __v4su rshift; 1818 __vector __bool int shmask; 1819 const __v4su shmax = { 32, 32, 32, 32 }; 1820 __v4su result; 1821 1822 #ifdef __LITTLE_ENDIAN__ 1823 rshift = vec_splat ((__v4su) __B, 0); 1824 #else 1825 rshift = vec_splat ((__v4su) __B, 1); 1826 #endif 1827 shmask = vec_cmplt (rshift, shmax); 1828 result = vec_sr ((__v4su) __A, rshift); 1829 result = vec_sel ((__v4su) shmask, result, shmask); 1830 1831 return (__m128i) result; 1832 } 1833 1834 #ifdef _ARCH_PWR8 1835 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1836 _mm_srl_epi64 (__m128i __A, __m128i __B) 1837 { 1838 __v2du rshift; 1839 __vector __bool long long shmask; 1840 const __v2du shmax = { 64, 64 }; 1841 __v2du result; 1842 1843 rshift = vec_splat ((__v2du) __B, 0); 1844 shmask = vec_cmplt (rshift, shmax); 1845 result = vec_sr ((__v2du) __A, rshift); 1846 result = vec_sel ((__v2du) shmask, result, shmask); 1847 1848 return (__m128i) result; 1849 } 1850 #endif 1851 1852 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1853 _mm_and_pd (__m128d __A, __m128d __B) 1854 { 1855 return (vec_and ((__v2df) __A, (__v2df) __B)); 1856 } 
1857 1858 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1859 _mm_andnot_pd (__m128d __A, __m128d __B) 1860 { 1861 return (vec_andc ((__v2df) __B, (__v2df) __A)); 1862 } 1863 1864 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1865 _mm_or_pd (__m128d __A, __m128d __B) 1866 { 1867 return (vec_or ((__v2df) __A, (__v2df) __B)); 1868 } 1869 1870 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1871 _mm_xor_pd (__m128d __A, __m128d __B) 1872 { 1873 return (vec_xor ((__v2df) __A, (__v2df) __B)); 1874 } 1875 1876 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1877 _mm_and_si128 (__m128i __A, __m128i __B) 1878 { 1879 return (__m128i)vec_and ((__v2di) __A, (__v2di) __B); 1880 } 1881 1882 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1883 _mm_andnot_si128 (__m128i __A, __m128i __B) 1884 { 1885 return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A); 1886 } 1887 1888 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1889 _mm_or_si128 (__m128i __A, __m128i __B) 1890 { 1891 return (__m128i)vec_or ((__v2di) __A, (__v2di) __B); 1892 } 1893 1894 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1895 _mm_xor_si128 (__m128i __A, __m128i __B) 1896 { 1897 return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B); 1898 } 1899 1900 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1901 _mm_cmpeq_epi8 (__m128i __A, __m128i __B) 1902 { 1903 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B); 1904 } 1905 1906 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1907 _mm_cmpeq_epi16 (__m128i __A, __m128i __B) 1908 { 1909 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B); 1910 } 1911 1912 extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1913 _mm_cmpeq_epi32 (__m128i __A, __m128i __B) 1914 { 1915 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B); 1916 } 1917 1918 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1919 _mm_cmplt_epi8 (__m128i __A, __m128i __B) 1920 { 1921 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B); 1922 } 1923 1924 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1925 _mm_cmplt_epi16 (__m128i __A, __m128i __B) 1926 { 1927 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B); 1928 } 1929 1930 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1931 _mm_cmplt_epi32 (__m128i __A, __m128i __B) 1932 { 1933 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B); 1934 } 1935 1936 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1937 _mm_cmpgt_epi8 (__m128i __A, __m128i __B) 1938 { 1939 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B); 1940 } 1941 1942 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1943 _mm_cmpgt_epi16 (__m128i __A, __m128i __B) 1944 { 1945 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B); 1946 } 1947 1948 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1949 _mm_cmpgt_epi32 (__m128i __A, __m128i __B) 1950 { 1951 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B); 1952 } 1953 1954 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1955 _mm_extract_epi16 (__m128i const __A, int const __N) 1956 { 1957 return (unsigned short) ((__v8hi)__A)[__N & 7]; 1958 } 1959 1960 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1961 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N) 1962 { 1963 __v8hi result = (__v8hi)__A; 1964 1965 result [(__N & 
7)] = __D; 1966 1967 return (__m128i) result; 1968 } 1969 1970 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1971 _mm_max_epi16 (__m128i __A, __m128i __B) 1972 { 1973 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B); 1974 } 1975 1976 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1977 _mm_max_epu8 (__m128i __A, __m128i __B) 1978 { 1979 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B); 1980 } 1981 1982 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1983 _mm_min_epi16 (__m128i __A, __m128i __B) 1984 { 1985 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B); 1986 } 1987 1988 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1989 _mm_min_epu8 (__m128i __A, __m128i __B) 1990 { 1991 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B); 1992 } 1993 1994 1995 #ifdef _ARCH_PWR8 1996 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1997 1998 /* Creates a 4-bit mask from the most significant bits of the SPFP values. 
*/ 1999 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2000 _mm_movemask_epi8 (__m128i __A) 2001 { 2002 __vector unsigned long long result; 2003 static const __vector unsigned char perm_mask = 2004 { 2005 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, 2006 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 2007 }; 2008 2009 result = ((__vector unsigned long long) 2010 vec_vbpermq ((__vector unsigned char) __A, 2011 (__vector unsigned char) perm_mask)); 2012 2013 #ifdef __LITTLE_ENDIAN__ 2014 return result[1]; 2015 #else 2016 return result[0]; 2017 #endif 2018 } 2019 #endif /* _ARCH_PWR8 */ 2020 2021 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2022 _mm_mulhi_epu16 (__m128i __A, __m128i __B) 2023 { 2024 __v4su w0, w1; 2025 __v16qu xform1 = { 2026 #ifdef __LITTLE_ENDIAN__ 2027 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 2028 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 2029 #else 2030 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 2031 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 2032 #endif 2033 }; 2034 2035 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B); 2036 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B); 2037 return (__m128i) vec_perm (w0, w1, xform1); 2038 } 2039 2040 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2041 _mm_shufflehi_epi16 (__m128i __A, const int __mask) 2042 { 2043 unsigned long element_selector_98 = __mask & 0x03; 2044 unsigned long element_selector_BA = (__mask >> 2) & 0x03; 2045 unsigned long element_selector_DC = (__mask >> 4) & 0x03; 2046 unsigned long element_selector_FE = (__mask >> 6) & 0x03; 2047 static const unsigned short permute_selectors[4] = 2048 { 2049 #ifdef __LITTLE_ENDIAN__ 2050 0x0908, 0x0B0A, 0x0D0C, 0x0F0E 2051 #else 2052 0x0809, 0x0A0B, 0x0C0D, 0x0E0F 2053 #endif 2054 }; 2055 __v2du pmask = 2056 #ifdef __LITTLE_ENDIAN__ 2057 { 0x1716151413121110UL, 0UL}; 2058 #else 2059 { 0x1011121314151617UL, 
0UL}; 2060 #endif 2061 __m64_union t; 2062 __v2du a, r; 2063 2064 t.as_short[0] = permute_selectors[element_selector_98]; 2065 t.as_short[1] = permute_selectors[element_selector_BA]; 2066 t.as_short[2] = permute_selectors[element_selector_DC]; 2067 t.as_short[3] = permute_selectors[element_selector_FE]; 2068 pmask[1] = t.as_m64; 2069 a = (__v2du)__A; 2070 r = vec_perm (a, a, (__vector unsigned char)pmask); 2071 return (__m128i) r; 2072 } 2073 2074 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2075 _mm_shufflelo_epi16 (__m128i __A, const int __mask) 2076 { 2077 unsigned long element_selector_10 = __mask & 0x03; 2078 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 2079 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 2080 unsigned long element_selector_76 = (__mask >> 6) & 0x03; 2081 static const unsigned short permute_selectors[4] = 2082 { 2083 #ifdef __LITTLE_ENDIAN__ 2084 0x0100, 0x0302, 0x0504, 0x0706 2085 #else 2086 0x0001, 0x0203, 0x0405, 0x0607 2087 #endif 2088 }; 2089 __v2du pmask = 2090 #ifdef __LITTLE_ENDIAN__ 2091 { 0UL, 0x1f1e1d1c1b1a1918UL}; 2092 #else 2093 { 0UL, 0x18191a1b1c1d1e1fUL}; 2094 #endif 2095 __m64_union t; 2096 __v2du a, r; 2097 t.as_short[0] = permute_selectors[element_selector_10]; 2098 t.as_short[1] = permute_selectors[element_selector_32]; 2099 t.as_short[2] = permute_selectors[element_selector_54]; 2100 t.as_short[3] = permute_selectors[element_selector_76]; 2101 pmask[0] = t.as_m64; 2102 a = (__v2du)__A; 2103 r = vec_perm (a, a, (__vector unsigned char)pmask); 2104 return (__m128i) r; 2105 } 2106 2107 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2108 _mm_shuffle_epi32 (__m128i __A, const int __mask) 2109 { 2110 unsigned long element_selector_10 = __mask & 0x03; 2111 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 2112 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 2113 unsigned long element_selector_76 = 
(__mask >> 6) & 0x03; 2114 static const unsigned int permute_selectors[4] = 2115 { 2116 #ifdef __LITTLE_ENDIAN__ 2117 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 2118 #else 2119 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F 2120 #endif 2121 }; 2122 __v4su t; 2123 2124 t[0] = permute_selectors[element_selector_10]; 2125 t[1] = permute_selectors[element_selector_32]; 2126 t[2] = permute_selectors[element_selector_54] + 0x10101010; 2127 t[3] = permute_selectors[element_selector_76] + 0x10101010; 2128 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); 2129 } 2130 2131 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2132 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) 2133 { 2134 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; 2135 __v16qu mask, tmp; 2136 __m128i_u *p = (__m128i_u*)__C; 2137 2138 tmp = (__v16qu)_mm_loadu_si128(p); 2139 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); 2140 tmp = vec_sel (tmp, (__v16qu)__A, mask); 2141 _mm_storeu_si128 (p, (__m128i)tmp); 2142 } 2143 2144 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2145 _mm_avg_epu8 (__m128i __A, __m128i __B) 2146 { 2147 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B); 2148 } 2149 2150 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2151 _mm_avg_epu16 (__m128i __A, __m128i __B) 2152 { 2153 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B); 2154 } 2155 2156 2157 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2158 _mm_sad_epu8 (__m128i __A, __m128i __B) 2159 { 2160 __v16qu a, b; 2161 __v16qu vmin, vmax, vabsdiff; 2162 __v4si vsum; 2163 const __v4su zero = { 0, 0, 0, 0 }; 2164 __v4si result; 2165 2166 a = (__v16qu) __A; 2167 b = (__v16qu) __B; 2168 vmin = vec_min (a, b); 2169 vmax = vec_max (a, b); 2170 vabsdiff = vec_sub (vmax, vmin); 2171 /* Sum four groups of 
bytes into integers. */ 2172 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); 2173 /* Sum across four integers with two integer results. */ 2174 result = vec_sum2s (vsum, (__vector signed int) zero); 2175 /* Rotate the sums into the correct position. */ 2176 #ifdef __LITTLE_ENDIAN__ 2177 result = vec_sld (result, result, 4); 2178 #else 2179 result = vec_sld (result, result, 6); 2180 #endif 2181 /* Rotate the sums into the correct position. */ 2182 return (__m128i) result; 2183 } 2184 2185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2186 _mm_stream_si32 (int *__A, int __B) 2187 { 2188 /* Use the data cache block touch for store transient. */ 2189 __asm__ ( 2190 "dcbtstt 0,%0" 2191 : 2192 : "b" (__A) 2193 : "memory" 2194 ); 2195 *__A = __B; 2196 } 2197 2198 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2199 _mm_stream_si64 (long long int *__A, long long int __B) 2200 { 2201 /* Use the data cache block touch for store transient. */ 2202 __asm__ ( 2203 " dcbtstt 0,%0" 2204 : 2205 : "b" (__A) 2206 : "memory" 2207 ); 2208 *__A = __B; 2209 } 2210 2211 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2212 _mm_stream_si128 (__m128i *__A, __m128i __B) 2213 { 2214 /* Use the data cache block touch for store transient. */ 2215 __asm__ ( 2216 "dcbtstt 0,%0" 2217 : 2218 : "b" (__A) 2219 : "memory" 2220 ); 2221 *__A = __B; 2222 } 2223 2224 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2225 _mm_stream_pd (double *__A, __m128d __B) 2226 { 2227 /* Use the data cache block touch for store transient. */ 2228 __asm__ ( 2229 "dcbtstt 0,%0" 2230 : 2231 : "b" (__A) 2232 : "memory" 2233 ); 2234 *(__m128d*)__A = __B; 2235 } 2236 2237 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2238 _mm_clflush (void const *__A) 2239 { 2240 /* Use the data cache block flush. 
*/ 2241 __asm__ ( 2242 "dcbf 0,%0" 2243 : 2244 : "b" (__A) 2245 : "memory" 2246 ); 2247 } 2248 2249 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2250 _mm_lfence (void) 2251 { 2252 /* Use light weight sync for load to load ordering. */ 2253 __atomic_thread_fence (__ATOMIC_RELEASE); 2254 } 2255 2256 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2257 _mm_mfence (void) 2258 { 2259 /* Use heavy weight sync for any to any ordering. */ 2260 __atomic_thread_fence (__ATOMIC_SEQ_CST); 2261 } 2262 2263 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2264 _mm_cvtsi32_si128 (int __A) 2265 { 2266 return _mm_set_epi32 (0, 0, 0, __A); 2267 } 2268 2269 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2270 _mm_cvtsi64_si128 (long long __A) 2271 { 2272 return __extension__ (__m128i)(__v2di){ __A, 0LL }; 2273 } 2274 2275 /* Microsoft intrinsic. */ 2276 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2277 _mm_cvtsi64x_si128 (long long __A) 2278 { 2279 return __extension__ (__m128i)(__v2di){ __A, 0LL }; 2280 } 2281 2282 /* Casts between various SP, DP, INT vector types. Note that these do no 2283 conversion of values, they just change the type. 
*/ 2284 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2285 _mm_castpd_ps(__m128d __A) 2286 { 2287 return (__m128) __A; 2288 } 2289 2290 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2291 _mm_castpd_si128(__m128d __A) 2292 { 2293 return (__m128i) __A; 2294 } 2295 2296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2297 _mm_castps_pd(__m128 __A) 2298 { 2299 return (__m128d) __A; 2300 } 2301 2302 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2303 _mm_castps_si128(__m128 __A) 2304 { 2305 return (__m128i) __A; 2306 } 2307 2308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2309 _mm_castsi128_ps(__m128i __A) 2310 { 2311 return (__m128) __A; 2312 } 2313 2314 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 2315 _mm_castsi128_pd(__m128i __A) 2316 { 2317 return (__m128d) __A; 2318 } 2319 2320 #else 2321 #include_next <emmintrin.h> 2322 #endif /* defined(__linux__) && defined(__ppc64__) */ 2323 2324 #endif /* EMMINTRIN_H_ */ 2325