/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help with porting code that uses Intel
   intrinsics from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
   However, scalar double operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level. There are differences in data
   format and in the placement of double scalars in the vector register,
   which require extra steps to match SSE2 scalar double semantics on POWER.

   It should be noted that the X86_64 MXCSR and the PowerISA FPSCR/VSCR
   registers differ substantially. It is recommended to use the portable
   <fenv.h> interface instead of accessing the MXCSR directly.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.
*/
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

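/* Illustrative note (an editorial sketch, not part of this header's API):
   as suggested above, rounding-mode manipulation that would touch the MXCSR
   on x86 is better expressed through the portable <fenv.h> interface when
   porting to PowerPC.  A minimal sketch, assuming C99 <fenv.h> support:

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON

     void round_toward_zero_example (void)
     {
       int saved = fegetround ();   // save the current rounding mode
       fesetround (FE_TOWARDZERO);  // instead of poking the MXCSR directly
       // ... perform double/float computations here ...
       fesetround (saved);          // restore the previous mode
     }
*/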

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return (__m128d) vec_splats (0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  __v2df result = (__v2df) __A;
  result [0] = ((__v2df) __B)[0];
  return (__m128d) result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  return ((__m128d)vec_ld(0, (__v16qu*)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return (vec_splats (*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __v2df __tmp = _mm_load_pd (__P);
  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_splat (__A, 0));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (vec_sqrt (__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df c;
  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (vec_min (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_min (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (vec_max (__A, __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = vec_max (a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
  return ((__m128d)vec_nor (temp, temp));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq (__A, __A);
  d = (__v2du)vec_cmpeq (__B, __B);
#else
  __v2du a, b;
  __v2du c, d;
  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
  a = (__v2du)vec_abs ((__v2df)__A);
  b = (__v2du)vec_abs ((__v2df)__B);
  c = (__v2du)vec_cmpgt (double_exp_mask, a);
  d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(c, d));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Compare against self will return false (0's) if NAN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) indicates a NAN.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results. So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than is just greater than or equal.  */
  c = (__v2df) vec_cmpge(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than is just less than or equal.  */
  c = (__v2df) vec_cmple(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  __v2df a, b, c;
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  /* Not greater than or equal is just less than.  */
  c = (__v2df) vec_cmplt(a, b);
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  __v2df r;
  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
  return (__m128d) _mm_setr_pd (r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
               short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
              char __q11, char __q10, char __q09, char __q08,
              char __q07, char __q06, char __q05, char __q04,
              char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                       __A, __A, __A, __A, __A, __A, __A, __A);
}

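/* Illustrative note (an editorial example, not part of the API): the
   _mm_set_epi* functions above take arguments from the highest-numbered
   element down to element 0, while the _mm_setr_epi* functions below take
   them in reverse (element 0 first).  For example, both of the following
   produce a vector whose element [0] is 0 and element [3] is 3:

     __m128i a = _mm_set_epi32 (3, 2, 1, 0);
     __m128i b = _mm_setr_epi32 (0, 1, 2, 3);
*/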

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
                short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
               char __q04, char __q05, char __q06, char __q07,
               char __q08, char __q09, char __q10, char __q11,
               char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i_u __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  __v2di val;
  /* For LE we need the Vector Unpack Low Signed Word instruction,
     which is generated here from vec_unpackh.  */
  val = (__v2di)vec_unpackh ((__v4si)__A);

  return (__m128d)vec_ctf (val, 0);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  __v2df rounded = vec_rint (__A);
  __v4si result, temp;
  const __v4si vzero =
    { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (rounded)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                            0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return (__m128i) result;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvtpd_epi32(__A);

  return (__m64) result[0];
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  __v4sf result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  __asm__(
      "xvcvdpsp %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                            0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif
  return ((__m128)result);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  __v4si result;
  __v4si temp;
  const __v4si vzero = { 0, 0, 0, 0 };

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__(
      "xvcvdpsxws %x0,%x1"
      : "=wa" (temp)
      : "wa" (__A)
      : );

#ifdef _ARCH_PWR8
  temp = vec_mergeo (temp, temp);
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
#else
  {
    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                            0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  }
#endif

  return ((__m128i) result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  __m128i result = _mm_cvttpd_epi32 (__A);

  return (__m64) result[0];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  __v4si temp;
  __v2di tmp2;
  __v2df result;

  temp = (__v4si)vec_splats (__A);
  tmp2 = (__v2di)vec_unpackl (temp);
  result = vec_ctf ((__vector signed long long) tmp2, 0);
  return (__m128d)result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  __v4sf rounded;
  __v4si result;

  rounded = vec_rint((__v4sf) __A);
  result = vec_cts (rounded, 0);
  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  __v4si result;

  result = vec_cts ((__v4sf) __A, 0);
  return (__m128i) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d) vec_doubleh ((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
  __v4sf a = (__v4sf)__A;
  __v4sf temp;
  __v2df result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = __builtin_vsx_xxsldwi (a, a, 3);
  temp = __builtin_vsx_xxsldwi (a, temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = vec_vmrghw (a, a);
#endif
  __asm__(
      " xvcvspdp %x0,%x1"
      : "=wa" (result)
      : "wa" (temp)
      : );
  return (__m128d) result;
#endif
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  __v2df rounded = vec_rint((__v2df) __A);
  int result = ((__v2df)rounded)[0];

  return result;
}

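/* Editorial note (illustrative, not part of the API): _mm_cvtsd_si32 above
   and _mm_cvtsd_si64 below round using the current rounding mode (via
   vec_rint), while the _mm_cvttsd_* variants truncate toward zero.  For
   example, under the default round-to-nearest-even mode:

     _mm_cvtsd_si32 (_mm_set_sd (2.5));   // 2 (ties round to even)
     _mm_cvtsd_si32 (_mm_set_sd (3.5));   // 4
     _mm_cvttsd_si32 (_mm_set_sd (2.9));  // 2 (truncated)
*/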
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  __v2df rounded = vec_rint ((__v2df) __A );
  long long result = ((__v2df) rounded)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return _mm_cvtsd_si64 ((__v2df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  int result = ((__v2df)__A)[0];

  return result;
}

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  long long result = ((__v2df)__A)[0];

  return result;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return _mm_cvttsd_si64 (__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  __v4sf result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  result = __builtin_vsx_xxsldwi (result, result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__(
      "xscvdpsp %x0,%x1"
      : "=wa" (temp_s)
      : "wa" (temp_b)
      : );
  /* Shift the resulting scalar into vector element [0].  */
  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
#else
  result [0] = ((__v2df)__B)[0];
#endif
  return (__m128) result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  __v2df result = (__v2df)__A;
  double db = __B;
  result [0] = db;
  return (__m128d)result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return _mm_cvtsi64_sd (__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf temp = vec_splat ((__v4sf)__B, 0);
  __v2df res;
  /* Convert single float scalar to double in a vector.  */
  __asm__(
      "xscvspdp %x0,%x1"
      : "=wa" (res)
      : "wa" (temp)
      : );
  return (__m128d) vec_mergel (res, (__v2df)__A);
#else
  __v2df res = (__v2df)__A;
  res [0] = ((__v4sf)__B) [0];
  return (__m128d) res;
#endif
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  __vector double result;
  const int litmsk = __mask & 0x3;

  if (litmsk == 0)
    result = vec_mergeh (__A, __B);
#if __GNUC__ < 6
  else if (litmsk == 1)
    result = vec_xxpermdi (__B, __A, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__B, __A, 1);
#else
  else if (litmsk == 1)
    result = vec_xxpermdi (__A, __B, 2);
  else if (litmsk == 2)
    result = vec_xxpermdi (__A, __B, 1);
#endif
  else
    result = vec_mergel (__A, __B);

  return result;
}

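/* Editorial note (illustrative, not part of the API): the __mask argument
   of _mm_shuffle_pd selects the lower element of the result from __A
   (bit 0) and the upper element from __B (bit 1); the _MM_SHUFFLE2 macro
   defined above can be used to build it.  A sketch with hypothetical
   inputs a and b:

     __m128d a = _mm_set_pd (2.0, 1.0);            // {1.0, 2.0}
     __m128d b = _mm_set_pd (4.0, 3.0);            // {3.0, 4.0}
     __m128d r = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (0, 1));
     // r is {2.0, 3.0}: element [0] = a[1], element [1] = b[0].
*/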

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [1] = *__B;
  return (__m128d)result;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  __v2df result = (__v2df)__A;
  result [0] = *__B;
  return (__m128d)result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
        0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
        0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

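/* Editorial note (illustrative, not part of the API): bit 0 of the result
   of _mm_movemask_pd is the sign bit of element [0] and bit 1 is the sign
   bit of element [1].  For example:

     _mm_movemask_pd (_mm_set_pd (2.0, -1.0));   // 0x1
     _mm_movemask_pd (_mm_set_pd (-2.0, -1.0));  // 0x3
*/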

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergel ((__vector long long) __A,
                               (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_mergeh ((__vector long long) __A,
                               (__vector long long) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int zero = {0, 0, 0, 0};

  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  __vector signed int w0, w1;

  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  unsigned int a = __A;
  unsigned int b = __B;

  return ((__m64)a * (__m64)b);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
#if __GNUC__ < 8
  __v2du result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word.  */
  __asm__(
      "vmulouw %0,%1,%2"
      : "=v" (result)
      : "v" (__A), "v" (__B)
      : );
#else
  /* VMX Vector Multiply Even Unsigned Word.  */
  __asm__(
      "vmuleuw %0,%1,%2"
      : "=v" (result)
      : "v" (__A), "v" (__B)
      : );
#endif
  return (__m128i) result;
#else
  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  __v8hu lshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B >= 0 && __B < 16)
    {
      if (__builtin_constant_p(__B))
        lshift = (__v8hu) vec_splat_s16(__B);
      else
        lshift = vec_splats ((unsigned short) __B);

      result = vec_sl ((__v8hi) __A, lshift);
    }

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  __v4su lshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B >= 0 && __B < 32)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v4su) vec_splat_s32(__B);
      else
        lshift = vec_splats ((unsigned int) __B);

      result = vec_sl ((__v4si) __A, lshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  __v2du lshift;
  __v2di result = { 0, 0 };

  if (__B >= 0 && __B < 64)
    {
      if (__builtin_constant_p(__B) && __B < 16)
        lshift = (__v2du) vec_splat_s32(__B);
      else
        lshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_sl ((__v2di) __A, lshift);
    }

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hi result;

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);
    }
  result = vec_sra ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  __v4su rshift = { 31, 31, 31, 31 };
  __v4si result;

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);
    }
  result = vec_sra ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
    result = vec_sld ((__v16qu) __A, zeros, __N);
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR.  */
      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
    else
#endif
      {
        __v16qu shift = vec_splats((unsigned char)(__N*8));
#ifdef __LITTLE_ENDIAN__
        result = vec_sro ((__v16qu)__A, shift);
#else
        result = vec_slo ((__v16qu)__A, shift);
#endif
      }
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return _mm_bsrli_si128 (__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int _imm5)
{
  __v16qu result;
  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    result = vec_sld ((__v16qu) __A, zeros, _imm5);
#else
    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
#endif
  else
    result = zeros;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  __v8hu rshift;
  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };

  if (__B < 16)
    {
      if (__builtin_constant_p(__B))
        rshift = (__v8hu) vec_splat_s16(__B);
      else
        rshift = vec_splats ((unsigned short) __B);

      result = vec_sr ((__v8hi) __A, rshift);
    }

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  __v4su rshift;
  __v4si result = { 0, 0, 0, 0 };

  if (__B < 32)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v4su) vec_splat_s32(__B);
          else
            rshift = (__v4su) vec_splats((unsigned int)__B);
        }
      else
        rshift = vec_splats ((unsigned int) __B);

      result = vec_sr ((__v4si) __A, rshift);
    }

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  __v2du rshift;
  __v2di result = { 0, 0 };

  if (__B < 64)
    {
      if (__builtin_constant_p(__B))
        {
          if (__B < 16)
            rshift = (__v2du) vec_splat_s32(__B);
          else
            rshift = (__v2du) vec_splats((unsigned long long)__B);
        }
      else
        rshift = (__v2du) vec_splats ((unsigned int) __B);

      result = vec_sr ((__v2di) __A, rshift);
    }

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  __v8hu lshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v8hu) __B, 0);
#else
  lshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (lshift, shmax);
  result = vec_sl ((__v8hu) __A, lshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  __v4su lshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;
#ifdef __LITTLE_ENDIAN__
  lshift = vec_splat ((__v4su) __B, 0);
#else
  lshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v4su) __A, lshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v2du) __A, lshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu rshift;
  __v8hi result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu)__B, 0);
#else
  rshift = vec_splat ((__v8hu)__B, 3);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v8hi) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#else
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  const __v4su rshmax = { 31, 31, 31, 31 };
  __v4su rshift;
  __v4si result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su)__B, 0);
#else
  rshift = vec_splat ((__v4su)__B, 1);
#endif
  rshift = vec_min (rshift, rshmax);
  result = vec_sra ((__v4si) __A, rshift);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  __v8hu rshift;
  __vector __bool short shmask;
  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
  __v8hu result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v8hu) __B, 0);
#else
  rshift = vec_splat ((__v8hu) __B, 3);
#endif
  shmask = vec_cmple (rshift, shmax);
  result = vec_sr ((__v8hu) __A, rshift);
  result = vec_sel ((__v8hu) shmask, result, shmask);

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  __v4su rshift;
  __vector __bool int shmask;
  const __v4su shmax = { 32, 32, 32, 32 };
  __v4su result;

#ifdef __LITTLE_ENDIAN__
  rshift = vec_splat ((__v4su) __B, 0);
#else
  rshift = vec_splat ((__v4su) __B, 1);
#endif
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v4su) __A, rshift);
  result = vec_sel ((__v4su) shmask, result, shmask);

  return (__m128i) result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (vec_and ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (vec_andc ((__v2df) __B, (__v2df) __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (vec_or ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (vec_xor ((__v2df) __A, (__v2df) __B));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
}
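/* Illustrative sketch, not part of the header: note the operand order of the
   andnot forms -- _mm_andnot_si128 (__A, __B) computes (~__A) & __B, which is
   why the vec_andc arguments above are swapped.  A common use is a branchless
   select (x, y, a, b below are caller-supplied __m128i values):

     __m128i mask = _mm_cmpeq_epi32 (x, y);          // all-ones where equal
     __m128i sel  = _mm_or_si128 (_mm_and_si128 (mask, a),
                                  _mm_andnot_si128 (mask, b));

   which picks a in lanes where x == y and b elsewhere.  */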
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) ((__v8hi)__A)[__N & 7];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  __v8hi result = (__v8hi)__A;

  result [(__N & 7)] = __D;

  return (__m128i) result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
}
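/* Illustrative sketch, not part of the header: the compares above return an
   all-ones or all-zeros mask per element rather than a single flag, and the
   extract/insert forms reduce the index modulo 8.  For example:

     __m128i v  = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, 0);
     int e2  = _mm_extract_epi16 (v, 2);       // 2
     int e10 = _mm_extract_epi16 (v, 10);      // also 2, since 10 & 7 == 2
     __m128i w  = _mm_insert_epi16 (v, 99, 5); // element 5 becomes 99

   Note that _mm_max_epu8 and _mm_min_epu8 treat bytes as unsigned, while
   _mm_max_epi16 and _mm_min_epi16 treat halfwords as signed.  */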
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 16-bit mask from the most significant bits of the 16
   signed or unsigned 8-bit integers in A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  __v4su w0, w1;
  __v16qu xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
  return (__m128i) vec_perm (w0, w1, xform1);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_98 = __mask & 0x03;
  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0x1716151413121110UL, 0UL};
#else
      { 0x1011121314151617UL, 0UL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_98];
  t.as_short[1] = permute_selectors[element_selector_BA];
  t.as_short[2] = permute_selectors[element_selector_DC];
  t.as_short[3] = permute_selectors[element_selector_FE];
  pmask[1] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}
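/* Illustrative sketch, not part of the header: the 8-bit __mask packs four
   2-bit element selectors, lowest pair first.  _mm_shufflehi_epi16 applies
   them to halfwords 4-7 and leaves 0-3 untouched; _mm_shufflelo_epi16 below
   does the opposite.  For example, reversing the upper four halfwords:

     const int mask = (0 << 6) | (1 << 4) | (2 << 2) | 3;  // selectors 3,2,1,0
     __m128i r = _mm_shufflehi_epi16 (v, mask);            // v: caller-supplied

   The same encoding (conveniently built with _MM_SHUFFLE from xmmintrin.h)
   drives _mm_shuffle_epi32 further down.  */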
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
    };
  __v2du pmask =
#ifdef __LITTLE_ENDIAN__
      { 0UL, 0x1f1e1d1c1b1a1918UL};
#else
      { 0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union t;
  __v2du a, r;

  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
  pmask[0] = t.as_m64;
  a = (__v2du)__A;
  r = vec_perm (a, a, (__vector unsigned char)pmask);
  return (__m128i) r;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __v4su t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu mask, tmp;
  __m128i_u *p = (__m128i_u*)__C;

  tmp = (__v16qu)_mm_loadu_si128(p);
  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
  tmp = vec_sel (tmp, (__v16qu)__A, mask);
  _mm_storeu_si128 (p, (__m128i)tmp);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  __v16qu a, b;
  __v16qu vmin, vmax, vabsdiff;
  __v4si vsum;
  const __v4su zero = { 0, 0, 0, 0 };
  __v4si result;

  a = (__v16qu) __A;
  b = (__v16qu) __B;
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with two integer results.  */
  result = vec_sum2s (vsum, (__vector signed int) zero);
  /* Rotate the sums into the correct position.  */
#ifdef __LITTLE_ENDIAN__
  result = vec_sld (result, result, 4);
#else
  result = vec_sld (result, result, 6);
#endif
  return (__m128i) result;
}
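/* Illustrative sketch, not part of the header: _mm_sad_epu8 produces two sums
   of absolute byte differences, one per 64-bit half, each held in the low 16
   bits of its half with the remaining bits zero.  For example:

     __m128i a = _mm_set1_epi8 (10);
     __m128i b = _mm_set1_epi8 (3);
     __m128i s = _mm_sad_epu8 (a, b);
     int lo = _mm_cvtsi128_si32 (s);   // 8 bytes * |10 - 3| == 56

   This layout is why the vec_sum2s result above needs the final vec_sld
   rotate.  */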
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    "dcbtstt 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
  *(__m128d*)__A = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  /* Use the data cache block flush.  */
  __asm__ (
    "dcbf 0,%0"
    :
    : "b" (__A)
    : "memory"
    );
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use lightweight sync for load-to-load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavyweight sync for any-to-any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return __extension__ (__m128i)(__v2di){ __A, 0LL };
}
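/* Illustrative note, not part of the header: on POWER the "non-temporal"
   stores above are ordinary stores preceded by a dcbtstt cache hint, and the
   fences are built on __atomic_thread_fence, so x86-style streaming code such
   as (dst, src, n supplied by the caller):

     for (long i = 0; i < n; i++)
       _mm_stream_si128 (&dst[i], src[i]);
     _mm_mfence ();

   ports directly, but the cache-bypassing behaviour of x86 movnt stores
   should not be assumed here.  */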
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#endif /* EMMINTRIN_H_ */