/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help port code that uses Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are also
   differences in the data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
   registers differ substantially.  It is recommended to use the
   portable <fenv.h> interface instead of accessing the MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector float __v4f;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned versions of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define a two-value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.
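   As an illustrative sketch of the element layout produced by this
   family of set intrinsics (the numeric values are arbitrary examples):

     _mm_set_sd(1.0)        yields {1.0, 0.0}
     _mm_set1_pd(1.0)       yields {1.0, 1.0}
     _mm_set_pd(2.0, 1.0)   yields {1.0, 2.0}   (element [0] is the last argument)
     _mm_setr_pd(1.0, 2.0)  yields {1.0, 2.0}   (element [0] is the first argument)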
*/ 81 extern __inline __m128d 82 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_pd(double __F)83 _mm_set1_pd(double __F) { 84 return __extension__(__m128d){__F, __F}; 85 } 86 87 extern __inline __m128d 88 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pd1(double __F)89 _mm_set_pd1(double __F) { 90 return _mm_set1_pd(__F); 91 } 92 93 /* Create a vector with the lower value X and upper value W. */ 94 extern __inline __m128d 95 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_pd(double __W,double __X)96 _mm_set_pd(double __W, double __X) { 97 return __extension__(__m128d){__X, __W}; 98 } 99 100 /* Create a vector with the lower value W and upper value X. */ 101 extern __inline __m128d 102 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_pd(double __W,double __X)103 _mm_setr_pd(double __W, double __X) { 104 return __extension__(__m128d){__W, __X}; 105 } 106 107 /* Create an undefined vector. */ 108 extern __inline __m128d 109 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_undefined_pd(void)110 _mm_undefined_pd(void) { 111 __m128d __Y = __Y; 112 return __Y; 113 } 114 115 /* Create a vector of zeros. */ 116 extern __inline __m128d 117 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setzero_pd(void)118 _mm_setzero_pd(void) { 119 return (__m128d)vec_splats(0); 120 } 121 122 /* Sets the low DPFP value of A from the low value of B. */ 123 extern __inline __m128d 124 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_sd(__m128d __A,__m128d __B)125 _mm_move_sd(__m128d __A, __m128d __B) { 126 __v2df __result = (__v2df)__A; 127 __result[0] = ((__v2df)__B)[0]; 128 return (__m128d)__result; 129 } 130 131 /* Load two DPFP values from P. The address must be 16-byte aligned. */ 132 extern __inline __m128d 133 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_pd(double const * __P)134 _mm_load_pd(double const *__P) { 135 return ((__m128d)vec_ld(0, (__v16qu *)__P)); 136 } 137 138 /* Load two DPFP values from P. The address need not be 16-byte aligned. */ 139 extern __inline __m128d 140 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadu_pd(double const * __P)141 _mm_loadu_pd(double const *__P) { 142 return (vec_vsx_ld(0, __P)); 143 } 144 145 /* Create a vector with all two elements equal to *P. */ 146 extern __inline __m128d 147 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load1_pd(double const * __P)148 _mm_load1_pd(double const *__P) { 149 return (vec_splats(*__P)); 150 } 151 152 /* Create a vector with element 0 as *P and the rest zero. */ 153 extern __inline __m128d 154 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_sd(double const * __P)155 _mm_load_sd(double const *__P) { 156 return _mm_set_sd(*__P); 157 } 158 159 extern __inline __m128d 160 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_pd1(double const * __P)161 _mm_load_pd1(double const *__P) { 162 return _mm_load1_pd(__P); 163 } 164 165 /* Load two DPFP values in reverse order. The address must be aligned. */ 166 extern __inline __m128d 167 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadr_pd(double const * __P)168 _mm_loadr_pd(double const *__P) { 169 __v2df __tmp = _mm_load_pd(__P); 170 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2); 171 } 172 173 /* Store two DPFP values. The address must be 16-byte aligned. 
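   Note that vec_st (stvx) silently ignores the low four bits of the
   address, so a misaligned pointer here stores into the containing
   16-byte block instead of faulting the way x86 MOVAPD would.  Use
   _mm_storeu_pd when alignment is not guaranteed.  A minimal sketch
   (the local names are hypothetical):

     double __dst[2] __attribute__((__aligned__(16)));
     _mm_store_pd(__dst, _mm_set1_pd(1.0));    // aligned store
     _mm_storeu_pd(__dst, _mm_set1_pd(2.0));   // no alignment requirement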
*/ 174 extern __inline void 175 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_pd(double * __P,__m128d __A)176 _mm_store_pd(double *__P, __m128d __A) { 177 vec_st((__v16qu)__A, 0, (__v16qu *)__P); 178 } 179 180 /* Store two DPFP values. The address need not be 16-byte aligned. */ 181 extern __inline void 182 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeu_pd(double * __P,__m128d __A)183 _mm_storeu_pd(double *__P, __m128d __A) { 184 *(__m128d_u *)__P = __A; 185 } 186 187 /* Stores the lower DPFP value. */ 188 extern __inline void 189 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_sd(double * __P,__m128d __A)190 _mm_store_sd(double *__P, __m128d __A) { 191 *__P = ((__v2df)__A)[0]; 192 } 193 194 extern __inline double 195 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_f64(__m128d __A)196 _mm_cvtsd_f64(__m128d __A) { 197 return ((__v2df)__A)[0]; 198 } 199 200 extern __inline void 201 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storel_pd(double * __P,__m128d __A)202 _mm_storel_pd(double *__P, __m128d __A) { 203 _mm_store_sd(__P, __A); 204 } 205 206 /* Stores the upper DPFP value. */ 207 extern __inline void 208 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeh_pd(double * __P,__m128d __A)209 _mm_storeh_pd(double *__P, __m128d __A) { 210 *__P = ((__v2df)__A)[1]; 211 } 212 /* Store the lower DPFP value across two words. 213 The address must be 16-byte aligned. */ 214 extern __inline void 215 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store1_pd(double * __P,__m128d __A)216 _mm_store1_pd(double *__P, __m128d __A) { 217 _mm_store_pd(__P, vec_splat(__A, 0)); 218 } 219 220 extern __inline void 221 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_pd1(double * __P,__m128d __A)222 _mm_store_pd1(double *__P, __m128d __A) { 223 _mm_store1_pd(__P, __A); 224 } 225 226 /* Store two DPFP values in reverse order. The address must be aligned. */ 227 extern __inline void 228 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storer_pd(double * __P,__m128d __A)229 _mm_storer_pd(double *__P, __m128d __A) { 230 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2)); 231 } 232 233 /* Intel intrinsic. */ 234 extern __inline long long 235 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi128_si64(__m128i __A)236 _mm_cvtsi128_si64(__m128i __A) { 237 return ((__v2di)__A)[0]; 238 } 239 240 /* Microsoft intrinsic. */ 241 extern __inline long long 242 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi128_si64x(__m128i __A)243 _mm_cvtsi128_si64x(__m128i __A) { 244 return ((__v2di)__A)[0]; 245 } 246 247 extern __inline __m128d 248 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_pd(__m128d __A,__m128d __B)249 _mm_add_pd(__m128d __A, __m128d __B) { 250 return (__m128d)((__v2df)__A + (__v2df)__B); 251 } 252 253 /* Add the lower double-precision (64-bit) floating-point element in 254 a and b, store the result in the lower element of dst, and copy 255 the upper element from a to the upper element of dst. 
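   For example (an illustrative sketch; the locals are hypothetical):

     __m128d __a = _mm_set_pd(10.0, 1.0);   // __a = {1.0, 10.0}
     __m128d __b = _mm_set_pd(20.0, 2.0);   // __b = {2.0, 20.0}
     __m128d __r = _mm_add_sd(__a, __b);    // __r = {3.0, 10.0}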
*/ 256 extern __inline __m128d 257 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_sd(__m128d __A,__m128d __B)258 _mm_add_sd(__m128d __A, __m128d __B) { 259 __A[0] = __A[0] + __B[0]; 260 return (__A); 261 } 262 263 extern __inline __m128d 264 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_pd(__m128d __A,__m128d __B)265 _mm_sub_pd(__m128d __A, __m128d __B) { 266 return (__m128d)((__v2df)__A - (__v2df)__B); 267 } 268 269 extern __inline __m128d 270 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_sd(__m128d __A,__m128d __B)271 _mm_sub_sd(__m128d __A, __m128d __B) { 272 __A[0] = __A[0] - __B[0]; 273 return (__A); 274 } 275 276 extern __inline __m128d 277 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_pd(__m128d __A,__m128d __B)278 _mm_mul_pd(__m128d __A, __m128d __B) { 279 return (__m128d)((__v2df)__A * (__v2df)__B); 280 } 281 282 extern __inline __m128d 283 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_sd(__m128d __A,__m128d __B)284 _mm_mul_sd(__m128d __A, __m128d __B) { 285 __A[0] = __A[0] * __B[0]; 286 return (__A); 287 } 288 289 extern __inline __m128d 290 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_pd(__m128d __A,__m128d __B)291 _mm_div_pd(__m128d __A, __m128d __B) { 292 return (__m128d)((__v2df)__A / (__v2df)__B); 293 } 294 295 extern __inline __m128d 296 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_sd(__m128d __A,__m128d __B)297 _mm_div_sd(__m128d __A, __m128d __B) { 298 __A[0] = __A[0] / __B[0]; 299 return (__A); 300 } 301 302 extern __inline __m128d 303 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_pd(__m128d __A)304 _mm_sqrt_pd(__m128d __A) { 305 return (vec_sqrt(__A)); 306 } 307 308 /* Return pair {sqrt (B[0]), A[1]}. 
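   For example (an illustrative sketch; the locals are hypothetical):

     __m128d __a = _mm_set_pd(7.0, 5.0);   // __a = {5.0, 7.0}
     __m128d __b = _mm_set_pd(9.0, 4.0);   // __b = {4.0, 9.0}
     __m128d __r = _mm_sqrt_sd(__a, __b);  // __r = {2.0, 7.0}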
*/ 309 extern __inline __m128d 310 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_sd(__m128d __A,__m128d __B)311 _mm_sqrt_sd(__m128d __A, __m128d __B) { 312 __v2df __c; 313 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0])); 314 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 315 } 316 317 extern __inline __m128d 318 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_pd(__m128d __A,__m128d __B)319 _mm_min_pd(__m128d __A, __m128d __B) { 320 return (vec_min(__A, __B)); 321 } 322 323 extern __inline __m128d 324 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_sd(__m128d __A,__m128d __B)325 _mm_min_sd(__m128d __A, __m128d __B) { 326 __v2df __a, __b, __c; 327 __a = vec_splats(__A[0]); 328 __b = vec_splats(__B[0]); 329 __c = vec_min(__a, __b); 330 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 331 } 332 333 extern __inline __m128d 334 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_pd(__m128d __A,__m128d __B)335 _mm_max_pd(__m128d __A, __m128d __B) { 336 return (vec_max(__A, __B)); 337 } 338 339 extern __inline __m128d 340 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_sd(__m128d __A,__m128d __B)341 _mm_max_sd(__m128d __A, __m128d __B) { 342 __v2df __a, __b, __c; 343 __a = vec_splats(__A[0]); 344 __b = vec_splats(__B[0]); 345 __c = vec_max(__a, __b); 346 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 347 } 348 349 extern __inline __m128d 350 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_pd(__m128d __A,__m128d __B)351 _mm_cmpeq_pd(__m128d __A, __m128d __B) { 352 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B)); 353 } 354 355 extern __inline __m128d 356 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_pd(__m128d __A,__m128d __B)357 _mm_cmplt_pd(__m128d __A, __m128d __B) { 358 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B)); 359 } 360 361 extern __inline __m128d 362 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_pd(__m128d __A,__m128d __B)363 _mm_cmple_pd(__m128d __A, __m128d __B) { 364 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B)); 365 } 366 367 extern __inline __m128d 368 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_pd(__m128d __A,__m128d __B)369 _mm_cmpgt_pd(__m128d __A, __m128d __B) { 370 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B)); 371 } 372 373 extern __inline __m128d 374 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_pd(__m128d __A,__m128d __B)375 _mm_cmpge_pd(__m128d __A, __m128d __B) { 376 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B)); 377 } 378 379 extern __inline __m128d 380 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_pd(__m128d __A,__m128d __B)381 _mm_cmpneq_pd(__m128d __A, __m128d __B) { 382 __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B); 383 return ((__m128d)vec_nor(__temp, __temp)); 384 } 385 386 extern __inline __m128d 387 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_pd(__m128d __A,__m128d __B)388 _mm_cmpnlt_pd(__m128d __A, __m128d __B) { 389 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B)); 390 } 391 392 extern __inline __m128d 393 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_pd(__m128d __A,__m128d __B)394 _mm_cmpnle_pd(__m128d __A, __m128d __B) { 395 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B)); 396 } 397 398 extern __inline __m128d 399 
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Comparing a value against itself returns false (all 0s) if it is a
     NaN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* Ordered means A != NaN and B != NaN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Comparing a value against itself returns false (all 0s) if it is a
     NaN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Unordered means A == NaN or B == NaN, i.e.
     NOT(A != NaN) OR NOT(B != NaN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Comparing a value against itself returns false (all 0s) if it is a
     NaN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true (all 1s) marks the NaN lanes.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results, so to ensure we do not generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.
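     The lower result element is a 64-bit mask of all 1s or all 0s
     (reinterpreted as a double), matching the x86 CMPEQSD convention.
     An illustrative sketch (the locals are hypothetical):

       __m128d __a = _mm_set_pd(8.0, 3.0);   // __a = {3.0, 8.0}
       __m128d __b = _mm_set_pd(9.0, 3.0);   // __b = {3.0, 9.0}
       __m128d __r = _mm_cmpeq_sd(__a, __b);
       // __r element [0] is all 1s (3.0 == 3.0); element [1] is 8.0.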
*/ 458 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 459 } 460 461 extern __inline __m128d 462 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_sd(__m128d __A,__m128d __B)463 _mm_cmplt_sd(__m128d __A, __m128d __B) { 464 __v2df __a, __b, __c; 465 __a = vec_splats(__A[0]); 466 __b = vec_splats(__B[0]); 467 __c = (__v2df)vec_cmplt(__a, __b); 468 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 469 } 470 471 extern __inline __m128d 472 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmple_sd(__m128d __A,__m128d __B)473 _mm_cmple_sd(__m128d __A, __m128d __B) { 474 __v2df __a, __b, __c; 475 __a = vec_splats(__A[0]); 476 __b = vec_splats(__B[0]); 477 __c = (__v2df)vec_cmple(__a, __b); 478 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 479 } 480 481 extern __inline __m128d 482 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_sd(__m128d __A,__m128d __B)483 _mm_cmpgt_sd(__m128d __A, __m128d __B) { 484 __v2df __a, __b, __c; 485 __a = vec_splats(__A[0]); 486 __b = vec_splats(__B[0]); 487 __c = (__v2df)vec_cmpgt(__a, __b); 488 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 489 } 490 491 extern __inline __m128d 492 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpge_sd(__m128d __A,__m128d __B)493 _mm_cmpge_sd(__m128d __A, __m128d __B) { 494 __v2df __a, __b, __c; 495 __a = vec_splats(__A[0]); 496 __b = vec_splats(__B[0]); 497 __c = (__v2df)vec_cmpge(__a, __b); 498 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 499 } 500 501 extern __inline __m128d 502 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpneq_sd(__m128d __A,__m128d __B)503 _mm_cmpneq_sd(__m128d __A, __m128d __B) { 504 __v2df __a, __b, __c; 505 __a = vec_splats(__A[0]); 506 __b = vec_splats(__B[0]); 507 __c = (__v2df)vec_cmpeq(__a, __b); 508 __c = vec_nor(__c, __c); 509 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 510 } 511 512 extern __inline __m128d 513 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnlt_sd(__m128d __A,__m128d __B)514 _mm_cmpnlt_sd(__m128d __A, __m128d __B) { 515 __v2df __a, __b, __c; 516 __a = vec_splats(__A[0]); 517 __b = vec_splats(__B[0]); 518 /* Not less than is just greater than or equal. */ 519 __c = (__v2df)vec_cmpge(__a, __b); 520 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 521 } 522 523 extern __inline __m128d 524 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnle_sd(__m128d __A,__m128d __B)525 _mm_cmpnle_sd(__m128d __A, __m128d __B) { 526 __v2df __a, __b, __c; 527 __a = vec_splats(__A[0]); 528 __b = vec_splats(__B[0]); 529 /* Not less than or equal is just greater than. */ 530 __c = (__v2df)vec_cmpge(__a, __b); 531 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 532 } 533 534 extern __inline __m128d 535 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpngt_sd(__m128d __A,__m128d __B)536 _mm_cmpngt_sd(__m128d __A, __m128d __B) { 537 __v2df __a, __b, __c; 538 __a = vec_splats(__A[0]); 539 __b = vec_splats(__B[0]); 540 /* Not greater than is just less than or equal. */ 541 __c = (__v2df)vec_cmple(__a, __b); 542 return (__m128d)_mm_setr_pd(__c[0], __A[1]); 543 } 544 545 extern __inline __m128d 546 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpnge_sd(__m128d __A,__m128d __B)547 _mm_cmpnge_sd(__m128d __A, __m128d __B) { 548 __v2df __a, __b, __c; 549 __a = vec_splats(__A[0]); 550 __b = vec_splats(__B[0]); 551 /* Not greater than or equal is just less than. 
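     This matches x86 for ordered inputs.  Note, however, that the x86
     "not" predicates (NGE, NGT, NLE, NLT, NEQ) are unordered compares
     that return all 1s when either input is a NaN, while vec_cmplt and
     friends return all 0s for NaN inputs, so results may differ for
     NaN operands.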
  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd and friends should use the ordered
   compare, which signals on QNaNs.  The _mm_ucomieq_sd variants
   should be OK as is.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline
int 646 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomineq_sd(__m128d __A,__m128d __B)647 _mm_ucomineq_sd(__m128d __A, __m128d __B) { 648 return (__A[0] != __B[0]); 649 } 650 651 /* Create a vector of Qi, where i is the element number. */ 652 extern __inline __m128i 653 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_epi64x(long long __q1,long long __q0)654 _mm_set_epi64x(long long __q1, long long __q0) { 655 return __extension__(__m128i)(__v2di){__q0, __q1}; 656 } 657 658 extern __inline __m128i 659 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_epi64(__m64 __q1,__m64 __q0)660 _mm_set_epi64(__m64 __q1, __m64 __q0) { 661 return _mm_set_epi64x((long long)__q1, (long long)__q0); 662 } 663 664 extern __inline __m128i 665 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_epi32(int __q3,int __q2,int __q1,int __q0)666 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { 667 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; 668 } 669 670 extern __inline __m128i 671 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_epi16(short __q7,short __q6,short __q5,short __q4,short __q3,short __q2,short __q1,short __q0)672 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3, 673 short __q2, short __q1, short __q0) { 674 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, 675 __q4, __q5, __q6, __q7}; 676 } 677 678 extern __inline __m128i 679 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_epi8(char __q15,char __q14,char __q13,char __q12,char __q11,char __q10,char __q09,char __q08,char __q07,char __q06,char __q05,char __q04,char __q03,char __q02,char __q01,char __q00)680 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11, 681 char __q10, char __q09, char __q08, char __q07, char __q06, 682 char __q05, char __q04, char __q03, char __q02, char __q01, 683 char __q00) { 684 return __extension__(__m128i)(__v16qi){ 685 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, 686 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; 687 } 688 689 /* Set all of the elements of the vector to A. */ 690 extern __inline __m128i 691 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_epi64x(long long __A)692 _mm_set1_epi64x(long long __A) { 693 return _mm_set_epi64x(__A, __A); 694 } 695 696 extern __inline __m128i 697 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_epi64(__m64 __A)698 _mm_set1_epi64(__m64 __A) { 699 return _mm_set_epi64(__A, __A); 700 } 701 702 extern __inline __m128i 703 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_epi32(int __A)704 _mm_set1_epi32(int __A) { 705 return _mm_set_epi32(__A, __A, __A, __A); 706 } 707 708 extern __inline __m128i 709 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_epi16(short __A)710 _mm_set1_epi16(short __A) { 711 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); 712 } 713 714 extern __inline __m128i 715 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_set1_epi8(char __A)716 _mm_set1_epi8(char __A) { 717 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, 718 __A, __A, __A, __A, __A); 719 } 720 721 /* Create a vector of Qi, where i is the element number. 722 The parameter order is reversed from the _mm_set_epi* functions. 
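   For example (an illustrative sketch):

     _mm_set_epi32(3, 2, 1, 0)   and   _mm_setr_epi32(0, 1, 2, 3)

   both produce a vector whose element [0] is 0 and element [3] is 3;
   the _mm_setr_* variants take their arguments in element-index order.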
*/ 723 extern __inline __m128i 724 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_epi64(__m64 __q0,__m64 __q1)725 _mm_setr_epi64(__m64 __q0, __m64 __q1) { 726 return _mm_set_epi64(__q1, __q0); 727 } 728 729 extern __inline __m128i 730 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_epi32(int __q0,int __q1,int __q2,int __q3)731 _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) { 732 return _mm_set_epi32(__q3, __q2, __q1, __q0); 733 } 734 735 extern __inline __m128i 736 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_epi16(short __q0,short __q1,short __q2,short __q3,short __q4,short __q5,short __q6,short __q7)737 _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4, 738 short __q5, short __q6, short __q7) { 739 return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); 740 } 741 742 extern __inline __m128i 743 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_epi8(char __q00,char __q01,char __q02,char __q03,char __q04,char __q05,char __q06,char __q07,char __q08,char __q09,char __q10,char __q11,char __q12,char __q13,char __q14,char __q15)744 _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04, 745 char __q05, char __q06, char __q07, char __q08, char __q09, 746 char __q10, char __q11, char __q12, char __q13, char __q14, 747 char __q15) { 748 return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, 749 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); 750 } 751 752 /* Create a vector with element 0 as *P and the rest zero. */ 753 extern __inline __m128i 754 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_si128(__m128i const * __P)755 _mm_load_si128(__m128i const *__P) { 756 return *__P; 757 } 758 759 extern __inline __m128i 760 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadu_si128(__m128i_u const * __P)761 _mm_loadu_si128(__m128i_u const *__P) { 762 return (__m128i)(vec_vsx_ld(0, (signed int const *)__P)); 763 } 764 765 extern __inline __m128i 766 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_epi64(__m128i_u const * __P)767 _mm_loadl_epi64(__m128i_u const *__P) { 768 return _mm_set_epi64((__m64)0LL, *(__m64 *)__P); 769 } 770 771 extern __inline void 772 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_si128(__m128i * __P,__m128i __B)773 _mm_store_si128(__m128i *__P, __m128i __B) { 774 vec_st((__v16qu)__B, 0, (__v16qu *)__P); 775 } 776 777 extern __inline void 778 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeu_si128(__m128i_u * __P,__m128i __B)779 _mm_storeu_si128(__m128i_u *__P, __m128i __B) { 780 *__P = __B; 781 } 782 783 extern __inline void 784 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storel_epi64(__m128i_u * __P,__m128i __B)785 _mm_storel_epi64(__m128i_u *__P, __m128i __B) { 786 *(long long *)__P = ((__v2di)__B)[0]; 787 } 788 789 extern __inline __m64 790 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movepi64_pi64(__m128i_u __B)791 _mm_movepi64_pi64(__m128i_u __B) { 792 return (__m64)((__v2di)__B)[0]; 793 } 794 795 extern __inline __m128i 796 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movpi64_epi64(__m64 __A)797 _mm_movpi64_epi64(__m64 __A) { 798 return _mm_set_epi64((__m64)0LL, __A); 799 } 800 801 extern __inline __m128i 802 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) _mm_move_epi64(__m128i __A)803 _mm_move_epi64(__m128i __A) { 804 return _mm_set_epi64((__m64)0LL, (__m64)__A[0]); 805 } 806 807 /* Create an undefined vector. */ 808 extern __inline __m128i 809 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_undefined_si128(void)810 _mm_undefined_si128(void) { 811 __m128i __Y = __Y; 812 return __Y; 813 } 814 815 /* Create a vector of zeros. */ 816 extern __inline __m128i 817 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setzero_si128(void)818 _mm_setzero_si128(void) { 819 return __extension__(__m128i)(__v4si){0, 0, 0, 0}; 820 } 821 822 #ifdef _ARCH_PWR8 823 extern __inline __m128d 824 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtepi32_pd(__m128i __A)825 _mm_cvtepi32_pd(__m128i __A) { 826 __v2di __val; 827 /* For LE need to generate Vector Unpack Low Signed Word. 828 Which is generated from unpackh. */ 829 __val = (__v2di)vec_unpackh((__v4si)__A); 830 831 return (__m128d)vec_ctf(__val, 0); 832 } 833 #endif 834 835 extern __inline __m128 836 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtepi32_ps(__m128i __A)837 _mm_cvtepi32_ps(__m128i __A) { 838 return ((__m128)vec_ctf((__v4si)__A, 0)); 839 } 840 841 extern __inline __m128i 842 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_epi32(__m128d __A)843 _mm_cvtpd_epi32(__m128d __A) { 844 __v2df __rounded = vec_rint(__A); 845 __v4si __result, __temp; 846 const __v4si __vzero = {0, 0, 0, 0}; 847 848 /* VSX Vector truncate Double-Precision to integer and Convert to 849 Signed Integer Word format with Saturate. */ 850 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :); 851 852 #ifdef _ARCH_PWR8 853 #ifdef __LITTLE_ENDIAN__ 854 __temp = vec_mergeo(__temp, __temp); 855 #else 856 __temp = vec_mergee(__temp, __temp); 857 #endif 858 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 859 (__vector long long)__vzero); 860 #else 861 { 862 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 863 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 864 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 865 } 866 #endif 867 return (__m128i)__result; 868 } 869 870 extern __inline __m64 871 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_pi32(__m128d __A)872 _mm_cvtpd_pi32(__m128d __A) { 873 __m128i __result = _mm_cvtpd_epi32(__A); 874 875 return (__m64)__result[0]; 876 } 877 878 extern __inline __m128 879 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpd_ps(__m128d __A)880 _mm_cvtpd_ps(__m128d __A) { 881 __v4sf __result; 882 __v4si __temp; 883 const __v4si __vzero = {0, 0, 0, 0}; 884 885 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 886 887 #ifdef _ARCH_PWR8 888 #ifdef __LITTLE_ENDIAN__ 889 __temp = vec_mergeo(__temp, __temp); 890 #else 891 __temp = vec_mergee(__temp, __temp); 892 #endif 893 __result = (__v4sf)vec_vpkudum((__vector long long)__temp, 894 (__vector long long)__vzero); 895 #else 896 { 897 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 898 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 899 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 900 } 901 #endif 902 return ((__m128)__result); 903 } 904 905 extern __inline __m128i 906 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_epi32(__m128d __A)907 _mm_cvttpd_epi32(__m128d __A) { 908 __v4si 
__result; 909 __v4si __temp; 910 const __v4si __vzero = {0, 0, 0, 0}; 911 912 /* VSX Vector truncate Double-Precision to integer and Convert to 913 Signed Integer Word format with Saturate. */ 914 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :); 915 916 #ifdef _ARCH_PWR8 917 #ifdef __LITTLE_ENDIAN__ 918 __temp = vec_mergeo(__temp, __temp); 919 #else 920 __temp = vec_mergee(__temp, __temp); 921 #endif 922 __result = (__v4si)vec_vpkudum((__vector long long)__temp, 923 (__vector long long)__vzero); 924 #else 925 { 926 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 927 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f}; 928 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm); 929 } 930 #endif 931 932 return ((__m128i)__result); 933 } 934 935 extern __inline __m64 936 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttpd_pi32(__m128d __A)937 _mm_cvttpd_pi32(__m128d __A) { 938 __m128i __result = _mm_cvttpd_epi32(__A); 939 940 return (__m64)__result[0]; 941 } 942 943 extern __inline int 944 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi128_si32(__m128i __A)945 _mm_cvtsi128_si32(__m128i __A) { 946 return ((__v4si)__A)[0]; 947 } 948 949 #ifdef _ARCH_PWR8 950 extern __inline __m128d 951 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtpi32_pd(__m64 __A)952 _mm_cvtpi32_pd(__m64 __A) { 953 __v4si __temp; 954 __v2di __tmp2; 955 __v4f __result; 956 957 __temp = (__v4si)vec_splats(__A); 958 __tmp2 = (__v2di)vec_unpackl(__temp); 959 __result = vec_ctf((__vector signed long long)__tmp2, 0); 960 return (__m128d)__result; 961 } 962 #endif 963 964 extern __inline __m128i 965 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_epi32(__m128 __A)966 _mm_cvtps_epi32(__m128 __A) { 967 __v4sf __rounded; 968 __v4si __result; 969 970 __rounded = vec_rint((__v4sf)__A); 971 __result = vec_cts(__rounded, 0); 972 return (__m128i)__result; 973 } 974 975 extern __inline __m128i 976 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttps_epi32(__m128 __A)977 _mm_cvttps_epi32(__m128 __A) { 978 __v4si __result; 979 980 __result = vec_cts((__v4sf)__A, 0); 981 return (__m128i)__result; 982 } 983 984 extern __inline __m128d 985 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtps_pd(__m128 __A)986 _mm_cvtps_pd(__m128 __A) { 987 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */ 988 #ifdef vec_doubleh 989 return (__m128d)vec_doubleh((__v4sf)__A); 990 #else 991 /* Otherwise the compiler is not current and so need to generate the 992 equivalent code. */ 993 __v4sf __a = (__v4sf)__A; 994 __v4sf __temp; 995 __v2df __result; 996 #ifdef __LITTLE_ENDIAN__ 997 /* The input float values are in elements {[0], [1]} but the convert 998 instruction needs them in elements {[1], [3]}, So we use two 999 shift left double vector word immediates to get the elements 1000 lined up. */ 1001 __temp = __builtin_vsx_xxsldwi(__a, __a, 3); 1002 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2); 1003 #else 1004 /* The input float values are in elements {[0], [1]} but the convert 1005 instruction needs them in elements {[0], [2]}, So we use two 1006 shift left double vector word immediates to get the elements 1007 lined up. 
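     (On big endian a single vec_vmrghw of __a with itself appears to
     be sufficient: it duplicates the low two floats so the original
     elements {[0], [1]} land in the even positions {[0], [2]} that the
     convert reads.)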
*/ 1008 __temp = vec_vmrghw(__a, __a); 1009 #endif 1010 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :); 1011 return (__m128d)__result; 1012 #endif 1013 } 1014 1015 extern __inline int 1016 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si32(__m128d __A)1017 _mm_cvtsd_si32(__m128d __A) { 1018 __v2df __rounded = vec_rint((__v2df)__A); 1019 int __result = ((__v2df)__rounded)[0]; 1020 1021 return __result; 1022 } 1023 /* Intel intrinsic. */ 1024 extern __inline long long 1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si64(__m128d __A)1026 _mm_cvtsd_si64(__m128d __A) { 1027 __v2df __rounded = vec_rint((__v2df)__A); 1028 long long __result = ((__v2df)__rounded)[0]; 1029 1030 return __result; 1031 } 1032 1033 /* Microsoft intrinsic. */ 1034 extern __inline long long 1035 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_si64x(__m128d __A)1036 _mm_cvtsd_si64x(__m128d __A) { 1037 return _mm_cvtsd_si64((__v2df)__A); 1038 } 1039 1040 extern __inline int 1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si32(__m128d __A)1042 _mm_cvttsd_si32(__m128d __A) { 1043 int __result = ((__v2df)__A)[0]; 1044 1045 return __result; 1046 } 1047 1048 /* Intel intrinsic. */ 1049 extern __inline long long 1050 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si64(__m128d __A)1051 _mm_cvttsd_si64(__m128d __A) { 1052 long long __result = ((__v2df)__A)[0]; 1053 1054 return __result; 1055 } 1056 1057 /* Microsoft intrinsic. */ 1058 extern __inline long long 1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvttsd_si64x(__m128d __A)1060 _mm_cvttsd_si64x(__m128d __A) { 1061 return _mm_cvttsd_si64(__A); 1062 } 1063 1064 extern __inline __m128 1065 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_ss(__m128 __A,__m128d __B)1066 _mm_cvtsd_ss(__m128 __A, __m128d __B) { 1067 __v4sf __result = (__v4sf)__A; 1068 1069 #ifdef __LITTLE_ENDIAN__ 1070 __v4sf __temp_s; 1071 /* Copy double element[0] to element [1] for conversion. */ 1072 __v2df __temp_b = vec_splat((__v2df)__B, 0); 1073 1074 /* Pre-rotate __A left 3 (logically right 1) elements. */ 1075 __result = __builtin_vsx_xxsldwi(__result, __result, 3); 1076 /* Convert double to single float scalar in a vector. */ 1077 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :); 1078 /* Shift the resulting scalar into vector element [0]. */ 1079 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1); 1080 #else 1081 __result[0] = ((__v2df)__B)[0]; 1082 #endif 1083 return (__m128)__result; 1084 } 1085 1086 extern __inline __m128d 1087 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi32_sd(__m128d __A,int __B)1088 _mm_cvtsi32_sd(__m128d __A, int __B) { 1089 __v2df __result = (__v2df)__A; 1090 double __db = __B; 1091 __result[0] = __db; 1092 return (__m128d)__result; 1093 } 1094 1095 /* Intel intrinsic. */ 1096 extern __inline __m128d 1097 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64_sd(__m128d __A,long long __B)1098 _mm_cvtsi64_sd(__m128d __A, long long __B) { 1099 __v2df __result = (__v2df)__A; 1100 double __db = __B; 1101 __result[0] = __db; 1102 return (__m128d)__result; 1103 } 1104 1105 /* Microsoft intrinsic. 
*/ 1106 extern __inline __m128d 1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi64x_sd(__m128d __A,long long __B)1108 _mm_cvtsi64x_sd(__m128d __A, long long __B) { 1109 return _mm_cvtsi64_sd(__A, __B); 1110 } 1111 1112 extern __inline __m128d 1113 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_sd(__m128d __A,__m128 __B)1114 _mm_cvtss_sd(__m128d __A, __m128 __B) { 1115 #ifdef __LITTLE_ENDIAN__ 1116 /* Use splat to move element [0] into position for the convert. */ 1117 __v4sf __temp = vec_splat((__v4sf)__B, 0); 1118 __v2df __res; 1119 /* Convert single float scalar to double in a vector. */ 1120 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :); 1121 return (__m128d)vec_mergel(__res, (__v2df)__A); 1122 #else 1123 __v2df __res = (__v2df)__A; 1124 __res[0] = ((__v4sf)__B)[0]; 1125 return (__m128d)__res; 1126 #endif 1127 } 1128 1129 extern __inline __m128d 1130 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shuffle_pd(__m128d __A,__m128d __B,const int __mask)1131 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { 1132 __vector double __result; 1133 const int __litmsk = __mask & 0x3; 1134 1135 if (__litmsk == 0) 1136 __result = vec_mergeh(__A, __B); 1137 #if __GNUC__ < 6 1138 else if (__litmsk == 1) 1139 __result = vec_xxpermdi(__B, __A, 2); 1140 else if (__litmsk == 2) 1141 __result = vec_xxpermdi(__B, __A, 1); 1142 #else 1143 else if (__litmsk == 1) 1144 __result = vec_xxpermdi(__A, __B, 2); 1145 else if (__litmsk == 2) 1146 __result = vec_xxpermdi(__A, __B, 1); 1147 #endif 1148 else 1149 __result = vec_mergel(__A, __B); 1150 1151 return __result; 1152 } 1153 1154 extern __inline __m128d 1155 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pd(__m128d __A,__m128d __B)1156 _mm_unpackhi_pd(__m128d __A, __m128d __B) { 1157 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B); 1158 } 1159 1160 extern __inline __m128d 1161 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pd(__m128d __A,__m128d __B)1162 _mm_unpacklo_pd(__m128d __A, __m128d __B) { 1163 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B); 1164 } 1165 1166 extern __inline __m128d 1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd(__m128d __A,double const * __B)1168 _mm_loadh_pd(__m128d __A, double const *__B) { 1169 __v2df __result = (__v2df)__A; 1170 __result[1] = *__B; 1171 return (__m128d)__result; 1172 } 1173 1174 extern __inline __m128d 1175 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd(__m128d __A,double const * __B)1176 _mm_loadl_pd(__m128d __A, double const *__B) { 1177 __v2df __result = (__v2df)__A; 1178 __result[0] = *__B; 1179 return (__m128d)__result; 1180 } 1181 1182 #ifdef _ARCH_PWR8 1183 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1184 1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values. 
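   For example (an illustrative sketch; the locals are hypothetical):

     __m128d __v = _mm_set_pd(-1.0, 2.0);  // element [0] = 2.0, [1] = -1.0
     int __m = _mm_movemask_pd(__v);       // __m = 2: bit 0 = sign of [0],
                                           //          bit 1 = sign of [1]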
*/ 1186 extern __inline int 1187 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd(__m128d __A)1188 _mm_movemask_pd(__m128d __A) { 1189 #ifdef _ARCH_PWR10 1190 return vec_extractm((__v2du)__A); 1191 #else 1192 __vector unsigned long long __result; 1193 static const __vector unsigned int __perm_mask = { 1194 #ifdef __LITTLE_ENDIAN__ 1195 0x80800040, 0x80808080, 0x80808080, 0x80808080 1196 #else 1197 0x80808080, 0x80808080, 0x80808080, 0x80804000 1198 #endif 1199 }; 1200 1201 __result = ((__vector unsigned long long)vec_vbpermq( 1202 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); 1203 1204 #ifdef __LITTLE_ENDIAN__ 1205 return __result[1]; 1206 #else 1207 return __result[0]; 1208 #endif 1209 #endif /* !_ARCH_PWR10 */ 1210 } 1211 #endif /* _ARCH_PWR8 */ 1212 1213 extern __inline __m128i 1214 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_epi16(__m128i __A,__m128i __B)1215 _mm_packs_epi16(__m128i __A, __m128i __B) { 1216 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B); 1217 } 1218 1219 extern __inline __m128i 1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_epi32(__m128i __A,__m128i __B)1221 _mm_packs_epi32(__m128i __A, __m128i __B) { 1222 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B); 1223 } 1224 1225 extern __inline __m128i 1226 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packus_epi16(__m128i __A,__m128i __B)1227 _mm_packus_epi16(__m128i __A, __m128i __B) { 1228 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B); 1229 } 1230 1231 extern __inline __m128i 1232 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_epi8(__m128i __A,__m128i __B)1233 _mm_unpackhi_epi8(__m128i __A, __m128i __B) { 1234 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B); 1235 } 1236 1237 extern __inline __m128i 1238 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_epi16(__m128i __A,__m128i __B)1239 _mm_unpackhi_epi16(__m128i __A, __m128i __B) { 1240 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B); 1241 } 1242 1243 extern __inline __m128i 1244 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_epi32(__m128i __A,__m128i __B)1245 _mm_unpackhi_epi32(__m128i __A, __m128i __B) { 1246 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B); 1247 } 1248 1249 extern __inline __m128i 1250 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_epi64(__m128i __A,__m128i __B)1251 _mm_unpackhi_epi64(__m128i __A, __m128i __B) { 1252 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B); 1253 } 1254 1255 extern __inline __m128i 1256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_epi8(__m128i __A,__m128i __B)1257 _mm_unpacklo_epi8(__m128i __A, __m128i __B) { 1258 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B); 1259 } 1260 1261 extern __inline __m128i 1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_epi16(__m128i __A,__m128i __B)1263 _mm_unpacklo_epi16(__m128i __A, __m128i __B) { 1264 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B); 1265 } 1266 1267 extern __inline __m128i 1268 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_epi32(__m128i __A,__m128i __B)1269 _mm_unpacklo_epi32(__m128i __A, __m128i __B) { 1270 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B); 1271 } 1272 1273 extern __inline __m128i 1274 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_epi64(__m128i __A,__m128i __B)1275 _mm_unpacklo_epi64(__m128i __A, __m128i __B) { 1276 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B); 1277 } 1278 1279 extern __inline __m128i 1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi8(__m128i __A,__m128i __B)1281 _mm_add_epi8(__m128i __A, __m128i __B) { 1282 return (__m128i)((__v16qu)__A + (__v16qu)__B); 1283 } 1284 1285 extern __inline __m128i 1286 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi16(__m128i __A,__m128i __B)1287 _mm_add_epi16(__m128i __A, __m128i __B) { 1288 return (__m128i)((__v8hu)__A + (__v8hu)__B); 1289 } 1290 1291 extern __inline __m128i 1292 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi32(__m128i __A,__m128i __B)1293 _mm_add_epi32(__m128i __A, __m128i __B) { 1294 return (__m128i)((__v4su)__A + (__v4su)__B); 1295 } 1296 1297 extern __inline __m128i 1298 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi64(__m128i __A,__m128i __B)1299 _mm_add_epi64(__m128i __A, __m128i __B) { 1300 return (__m128i)((__v2du)__A + (__v2du)__B); 1301 } 1302 1303 extern __inline __m128i 1304 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epi8(__m128i __A,__m128i __B)1305 _mm_adds_epi8(__m128i __A, __m128i __B) { 1306 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B); 1307 } 1308 1309 extern __inline __m128i 1310 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epi16(__m128i __A,__m128i __B)1311 _mm_adds_epi16(__m128i __A, __m128i __B) { 1312 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B); 1313 } 1314 1315 extern __inline __m128i 1316 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epu8(__m128i __A,__m128i __B)1317 _mm_adds_epu8(__m128i __A, __m128i __B) { 1318 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B); 1319 } 1320 1321 extern __inline __m128i 1322 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epu16(__m128i __A,__m128i __B)1323 _mm_adds_epu16(__m128i __A, __m128i __B) { 1324 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B); 1325 } 1326 1327 extern __inline __m128i 1328 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi8(__m128i __A,__m128i __B)1329 _mm_sub_epi8(__m128i __A, __m128i __B) { 1330 return (__m128i)((__v16qu)__A - (__v16qu)__B); 1331 } 1332 1333 extern __inline __m128i 1334 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi16(__m128i __A,__m128i __B)1335 _mm_sub_epi16(__m128i __A, __m128i __B) { 1336 return (__m128i)((__v8hu)__A - (__v8hu)__B); 1337 } 1338 1339 extern __inline __m128i 1340 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi32(__m128i __A,__m128i __B)1341 _mm_sub_epi32(__m128i __A, __m128i __B) { 1342 return (__m128i)((__v4su)__A - (__v4su)__B); 1343 } 1344 1345 extern __inline __m128i 1346 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi64(__m128i __A,__m128i __B)1347 _mm_sub_epi64(__m128i __A, __m128i __B) { 1348 return (__m128i)((__v2du)__A - (__v2du)__B); 1349 } 1350 1351 extern __inline __m128i 1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epi8(__m128i __A,__m128i __B)1353 _mm_subs_epi8(__m128i __A, __m128i __B) { 1354 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B); 1355 } 1356 1357 extern __inline __m128i 
1358 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epi16(__m128i __A,__m128i __B)1359 _mm_subs_epi16(__m128i __A, __m128i __B) { 1360 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B); 1361 } 1362 1363 extern __inline __m128i 1364 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epu8(__m128i __A,__m128i __B)1365 _mm_subs_epu8(__m128i __A, __m128i __B) { 1366 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B); 1367 } 1368 1369 extern __inline __m128i 1370 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epu16(__m128i __A,__m128i __B)1371 _mm_subs_epu16(__m128i __A, __m128i __B) { 1372 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B); 1373 } 1374 1375 extern __inline __m128i 1376 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_madd_epi16(__m128i __A,__m128i __B)1377 _mm_madd_epi16(__m128i __A, __m128i __B) { 1378 __vector signed int __zero = {0, 0, 0, 0}; 1379 1380 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero); 1381 } 1382 1383 extern __inline __m128i 1384 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epi16(__m128i __A,__m128i __B)1385 _mm_mulhi_epi16(__m128i __A, __m128i __B) { 1386 __vector signed int __w0, __w1; 1387 1388 __vector unsigned char __xform1 = { 1389 #ifdef __LITTLE_ENDIAN__ 1390 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, 1391 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1392 #else 1393 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, 1394 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D 1395 #endif 1396 }; 1397 1398 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B); 1399 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B); 1400 return (__m128i)vec_perm(__w0, __w1, __xform1); 1401 } 1402 1403 extern __inline __m128i 1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_epi16(__m128i __A,__m128i __B)1405 _mm_mullo_epi16(__m128i __A, __m128i __B) { 1406 return (__m128i)((__v8hi)__A * (__v8hi)__B); 1407 } 1408 1409 extern __inline __m64 1410 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_su32(__m64 __A,__m64 __B)1411 _mm_mul_su32(__m64 __A, __m64 __B) { 1412 unsigned int __a = __A; 1413 unsigned int __b = __B; 1414 1415 return ((__m64)__a * (__m64)__b); 1416 } 1417 1418 #ifdef _ARCH_PWR8 1419 extern __inline __m128i 1420 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32(__m128i __A,__m128i __B)1421 _mm_mul_epu32(__m128i __A, __m128i __B) { 1422 #if __GNUC__ < 8 1423 __v2du __result; 1424 1425 #ifdef __LITTLE_ENDIAN__ 1426 /* VMX Vector Multiply Odd Unsigned Word. */ 1427 __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1428 #else 1429 /* VMX Vector Multiply Even Unsigned Word. 
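     In all cases the intent is the PMULUDQ semantics: the 32-bit values
     in Intel-numbered elements [0] and [2] of each operand are widened
     and multiplied into two 64-bit products.  An illustrative sketch
     (the locals are hypothetical):

       __m128i __a = _mm_set_epi32(0, 3, 0, 2);
       __m128i __b = _mm_set_epi32(0, 5, 0, 4);
       __m128i __r = _mm_mul_epu32(__a, __b);   // 64-bit elements {8, 15}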
*/ 1430 __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); 1431 #endif 1432 return (__m128i)__result; 1433 #else 1434 return (__m128i)vec_mule((__v4su)__A, (__v4su)__B); 1435 #endif 1436 } 1437 #endif 1438 1439 extern __inline __m128i 1440 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16(__m128i __A,int __B)1441 _mm_slli_epi16(__m128i __A, int __B) { 1442 __v8hu __lshift; 1443 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; 1444 1445 if (__B >= 0 && __B < 16) { 1446 if (__builtin_constant_p(__B)) 1447 __lshift = (__v8hu)vec_splat_s16(__B); 1448 else 1449 __lshift = vec_splats((unsigned short)__B); 1450 1451 __result = vec_sl((__v8hi)__A, __lshift); 1452 } 1453 1454 return (__m128i)__result; 1455 } 1456 1457 extern __inline __m128i 1458 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi32(__m128i __A,int __B)1459 _mm_slli_epi32(__m128i __A, int __B) { 1460 __v4su __lshift; 1461 __v4si __result = {0, 0, 0, 0}; 1462 1463 if (__B >= 0 && __B < 32) { 1464 if (__builtin_constant_p(__B) && __B < 16) 1465 __lshift = (__v4su)vec_splat_s32(__B); 1466 else 1467 __lshift = vec_splats((unsigned int)__B); 1468 1469 __result = vec_sl((__v4si)__A, __lshift); 1470 } 1471 1472 return (__m128i)__result; 1473 } 1474 1475 #ifdef _ARCH_PWR8 1476 extern __inline __m128i 1477 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi64(__m128i __A,int __B)1478 _mm_slli_epi64(__m128i __A, int __B) { 1479 __v2du __lshift; 1480 __v2di __result = {0, 0}; 1481 1482 if (__B >= 0 && __B < 64) { 1483 if (__builtin_constant_p(__B) && __B < 16) 1484 __lshift = (__v2du)vec_splat_s32(__B); 1485 else 1486 __lshift = (__v2du)vec_splats((unsigned int)__B); 1487 1488 __result = vec_sl((__v2di)__A, __lshift); 1489 } 1490 1491 return (__m128i)__result; 1492 } 1493 #endif 1494 1495 extern __inline __m128i 1496 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi16(__m128i __A,int __B)1497 _mm_srai_epi16(__m128i __A, int __B) { 1498 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15}; 1499 __v8hi __result; 1500 1501 if (__B < 16) { 1502 if (__builtin_constant_p(__B)) 1503 __rshift = (__v8hu)vec_splat_s16(__B); 1504 else 1505 __rshift = vec_splats((unsigned short)__B); 1506 } 1507 __result = vec_sra((__v8hi)__A, __rshift); 1508 1509 return (__m128i)__result; 1510 } 1511 1512 extern __inline __m128i 1513 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srai_epi32(__m128i __A,int __B)1514 _mm_srai_epi32(__m128i __A, int __B) { 1515 __v4su __rshift = {31, 31, 31, 31}; 1516 __v4si __result; 1517 1518 if (__B < 32) { 1519 if (__builtin_constant_p(__B)) { 1520 if (__B < 16) 1521 __rshift = (__v4su)vec_splat_s32(__B); 1522 else 1523 __rshift = (__v4su)vec_splats((unsigned int)__B); 1524 } else 1525 __rshift = vec_splats((unsigned int)__B); 1526 } 1527 __result = vec_sra((__v4si)__A, __rshift); 1528 1529 return (__m128i)__result; 1530 } 1531 1532 extern __inline __m128i 1533 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_bslli_si128(__m128i __A,const int __N)1534 _mm_bslli_si128(__m128i __A, const int __N) { 1535 __v16qu __result; 1536 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1537 1538 if (__N < 16) 1539 __result = vec_sld((__v16qu)__A, __zeros, __N); 1540 else 1541 __result = __zeros; 1542 1543 return (__m128i)__result; 1544 } 1545 1546 extern __inline __m128i 1547 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR. */
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
    else
#endif
    {
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
      __result = vec_sro((__v16qu)__A, __shift);
#else
      __result = vec_slo((__v16qu)__A, __shift);
#endif
    }
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128(__m128i __A, const int __N) {
  return _mm_bsrli_si128(__A, __N);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128(__m128i __A, const int _imm5) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16(__m128i __A, int __B) {
  __v8hu __rshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);

    __result = vec_sr((__v8hi)__A, __rshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32(__m128i __A, int __B) {
  __v4su __rshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);

    __result = vec_sr((__v4si)__A, __rshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif
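/* Illustrative usage sketch, not part of the original header: the byte-wise
   shifts above operate on the whole 128-bit value, so shifting right by 8
   bytes moves the upper 64 bits into the lower half.

     __m128i v = _mm_set_epi32(4, 3, 2, 1);   // 32-bit lanes 1,2,3,4 (low to high)
     __m128i t = _mm_srli_si128(v, 8);        // lanes are now 3,4,0,0
     int lo = _mm_cvtsi128_si32(t);           // 3
*/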
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32(__m128i __A, __m128i __B) {
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v4su)__B, 0);
#else
  __lshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v4su)__A, __lshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64(__m128i __A, __m128i __B) {
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __lshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v2du)__A, __lshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16(__m128i __A, __m128i __B) {
  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32(__m128i __A, __m128i __B) {
  const __v4su __rshmax = {31, 31, 31, 31};
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16(__m128i __A, __m128i __B) {
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__rshift, __shmax);
  __result = vec_sr((__v8hu)__A, __rshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}
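/* Illustrative usage sketch, not part of the original header: the integer
   logical intrinsics above combine with a compare result to form the usual
   SSE2 per-byte select (mask ? a : b).  Given __m128i values a and b, this
   yields a signed per-byte maximum, which SSE2 has no direct intrinsic for.

     __m128i mask = _mm_cmpgt_epi8(a, b);                  // 0xFF where a > b
     __m128i maxv = _mm_or_si128(_mm_and_si128(mask, a),
                                 _mm_andnot_si128(mask, b));
*/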
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}
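/* Illustrative usage sketch, not part of the original header: _mm_max_epi16
   and _mm_min_epi16 compose into a per-element clamp.  The value x is assumed
   to be an existing __m128i of signed 16-bit lanes.

     __m128i lo = _mm_set1_epi16(-255);
     __m128i hi = _mm_set1_epi16(255);
     __m128i clamped = _mm_min_epi16(_mm_max_epi16(x, lo), hi);
*/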
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Return a mask created from the most significant bit of each 8-bit
   element in A. */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}
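/* Illustrative usage sketch, not part of the original header: a common use of
   _mm_movemask_epi8 (available here on POWER8 and later) is locating the
   first matching byte after a compare.  The variables chunk and idx are
   assumed to exist in the caller.

     __m128i eq = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\n'));
     int m = _mm_movemask_epi8(eq);   // bit i is set when byte i matched
     if (m != 0)
       idx = __builtin_ctz(m);        // index of the first match in the chunk
*/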
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}
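/* Illustrative usage sketch, not part of the original header: the shuffle
   mask selects source lanes with two bits per destination lane, so the
   _MM_SHUFFLE macro from <xmmintrin.h> can be used to build it;
   _MM_SHUFFLE(3, 2, 1, 0) is the identity and _MM_SHUFFLE(0, 1, 2, 3)
   reverses the four 32-bit lanes.

     __m128i v   = _mm_set_epi32(3, 2, 1, 0);                     // lanes 0,1,2,3
     __m128i rev = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes 3,2,1,0
*/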
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers. */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results. */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
#else
  /* Sum across four integers with two integer results. */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position. */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient. */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush(void const *__A) {
  /* Use the data cache block flush. */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence(void) {
  /* Use light weight sync for load to load ordering. */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
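/* Illustrative usage sketch, not part of the original header: _mm_sad_epu8
   produces two 64-bit lanes, each holding a partial sum of absolute byte
   differences, which callers typically accumulate across a loop and then
   combine.  The inputs a and b are assumed to come from the caller.

     __m128i acc = _mm_setzero_si128();
     acc = _mm_add_epi64(acc, _mm_sad_epu8(a, b));  // partial SADs in lanes 0 and 1
     // after the loop, the total SAD is lane 0 plus lane 1 of acc
*/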
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering. */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic. */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types. Note that these do no
   conversion of values, they just change the type. */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */