/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file helps port code that uses Intel intrinsics
   from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are differences in the data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE scalar float semantics on POWER.

   Note that the X86_64 MXCSR register differs substantially from the
   PowerISA FPSCR/VSCR registers.  It is recommended to use the portable
   <fenv.h> interface instead of accessing the MXCSR directly.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

/* Define a four-value permute mask. */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build. */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
                                 (defined(__STDC_VERSION__) && \
                                  __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

/* We need type definitions from the MMX header file. */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free (). */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef vector float __m128 __attribute__((__may_alias__));

/* Unaligned version of the same type. */
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));

/* Internal data types for implementing the intrinsics. */
typedef vector float __v4sf;

/* Create an undefined vector. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.
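   For example (an illustrative sketch, not from the Intel documentation;
   the 16-byte alignment of the hypothetical buffer is the caller's
   responsibility):

     float buf[4] __attribute__ ((aligned (16))) = { 1.0f, 2.0f, 3.0f, 4.0f };
     __m128 v  = _mm_load_ps (buf);    // aligned load
     __m128 vu = _mm_loadu_ps (buf);   // also valid for unaligned addresses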
*/ 89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 90 _mm_load_ps (float const *__P) 91 { 92 return ((__m128)vec_ld(0, (__v4sf*)__P)); 93 } 94 95 /* Load four SPFP values from P. The address need not be 16-byte aligned. */ 96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 97 _mm_loadu_ps (float const *__P) 98 { 99 return (vec_vsx_ld(0, __P)); 100 } 101 102 /* Load four SPFP values in reverse order. The address must be aligned. */ 103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 104 _mm_loadr_ps (float const *__P) 105 { 106 __v4sf __tmp; 107 __m128 result; 108 static const __vector unsigned char permute_vector = 109 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 110 0x17, 0x10, 0x11, 0x12, 0x13 }; 111 112 __tmp = vec_ld (0, (__v4sf *) __P); 113 result = (__m128) vec_perm (__tmp, __tmp, permute_vector); 114 return result; 115 } 116 117 /* Create a vector with all four elements equal to F. */ 118 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 119 _mm_set1_ps (float __F) 120 { 121 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; 122 } 123 124 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 125 _mm_set_ps1 (float __F) 126 { 127 return _mm_set1_ps (__F); 128 } 129 130 /* Create the vector [Z Y X W]. */ 131 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 132 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) 133 { 134 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; 135 } 136 137 /* Create the vector [W X Y Z]. */ 138 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 139 _mm_setr_ps (float __Z, float __Y, float __X, float __W) 140 { 141 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; 142 } 143 144 /* Store four SPFP values. The address must be 16-byte aligned. */ 145 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 146 _mm_store_ps (float *__P, __m128 __A) 147 { 148 vec_st((__v4sf)__A, 0, (__v4sf*)__P); 149 } 150 151 /* Store four SPFP values. The address need not be 16-byte aligned. */ 152 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 153 _mm_storeu_ps (float *__P, __m128 __A) 154 { 155 *(__m128_u *)__P = __A; 156 } 157 158 /* Store four SPFP values in reverse order. The address must be aligned. */ 159 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 160 _mm_storer_ps (float *__P, __m128 __A) 161 { 162 __v4sf __tmp; 163 static const __vector unsigned char permute_vector = 164 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, 165 0x17, 0x10, 0x11, 0x12, 0x13 }; 166 167 __tmp = (__m128) vec_perm (__A, __A, permute_vector); 168 169 _mm_store_ps (__P, __tmp); 170 } 171 172 /* Store the lower SPFP value across four words. */ 173 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 174 _mm_store1_ps (float *__P, __m128 __A) 175 { 176 __v4sf __va = vec_splat((__v4sf)__A, 0); 177 _mm_store_ps (__P, __va); 178 } 179 180 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 181 _mm_store_ps1 (float *__P, __m128 __A) 182 { 183 _mm_store1_ps (__P, __A); 184 } 185 186 /* Create a vector with element 0 as F and the rest zero. 
 */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A. */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  /* Then we merge the lower float result with the original upper
     float elements from __A.
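     For instance (an illustrative example, not from the original source),
     _mm_mul_ss applied to {1, 2, 3, 4} and {10, 20, 30, 40} yields
     {10, 2, 3, 4}: only element 0 is multiplied, elements 1-3 come
     from __A unchanged.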
 */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel (__A, c, mask));
}

/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.
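   *
   * Note (editorial, not from the original source): like the x86 rcpps
   * instruction, vec_re returns only an estimate of the reciprocal.  A
   * standard Newton-Raphson step can refine it if more accuracy is
   * needed, e.g. for an estimate e of 1/d:
   *
   *   e1 = e * (2.0f - d * e);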
 */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = vec_max (a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}

/* Perform logical bit-wise operations on 128-bit values.
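   A common idiom (shown here only as an illustration; it is not part of
   this header) is to use these to manipulate the sign bit, e.g. a vector
   fabs by clearing the sign bits of a vector v:

     __m128 vabs = _mm_andnot_ps (_mm_set1_ps (-0.0f), v);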
*/ 448 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 449 _mm_and_ps (__m128 __A, __m128 __B) 450 { 451 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); 452 // return __builtin_ia32_andps (__A, __B); 453 } 454 455 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 456 _mm_andnot_ps (__m128 __A, __m128 __B) 457 { 458 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); 459 } 460 461 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 462 _mm_or_ps (__m128 __A, __m128 __B) 463 { 464 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); 465 } 466 467 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 468 _mm_xor_ps (__m128 __A, __m128 __B) 469 { 470 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); 471 } 472 473 /* Perform a comparison on the four SPFP values of A and B. For each 474 element, if the comparison is true, place a mask of all ones in the 475 result, otherwise a mask of zeros. */ 476 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 477 _mm_cmpeq_ps (__m128 __A, __m128 __B) 478 { 479 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); 480 } 481 482 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 483 _mm_cmplt_ps (__m128 __A, __m128 __B) 484 { 485 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); 486 } 487 488 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 489 _mm_cmple_ps (__m128 __A, __m128 __B) 490 { 491 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); 492 } 493 494 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 495 _mm_cmpgt_ps (__m128 __A, __m128 __B) 496 { 497 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); 498 } 499 500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 501 _mm_cmpge_ps (__m128 __A, __m128 __B) 502 { 503 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); 504 } 505 506 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 507 _mm_cmpneq_ps (__m128 __A, __m128 __B) 508 { 509 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); 510 return ((__m128)vec_nor (temp, temp)); 511 } 512 513 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 514 _mm_cmpnlt_ps (__m128 __A, __m128 __B) 515 { 516 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); 517 } 518 519 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 520 _mm_cmpnle_ps (__m128 __A, __m128 __B) 521 { 522 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); 523 } 524 525 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 526 _mm_cmpngt_ps (__m128 __A, __m128 __B) 527 { 528 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); 529 } 530 531 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 532 _mm_cmpnge_ps (__m128 __A, __m128 __B) 533 { 534 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); 535 } 536 537 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 538 _mm_cmpord_ps (__m128 __A, __m128 __B) 539 { 540 __vector unsigned int a, b; 541 __vector unsigned int c, d; 542 static const __vector unsigned int float_exp_mask = 543 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; 
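  /* Editorial note: a value compares as "ordered" here when the bits of
     its absolute value are strictly below 0x7f800000; NaNs (exponent all
     ones, nonzero mantissa) compare above that value. */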

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.
 */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq(a, b);
  c = vec_nor (c, c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation.
 */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results.  So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt(a, b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * the _mm_comi??_ss implementations because GCC for PowerPC only
 * generates unordered compares (scalar and vector).
 * Technically, _mm_comieq_ss et al. should use the ordered compare
 * and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK as is.
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.
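   (Illustrative note, not from the Intel documentation: under the default
   round-to-nearest-even mode, 2.5f converts to 2 and 3.5f converts to 4.)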
 */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  __m64 res = 0;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
    "xxsldwi %x0,%x0,%x0,3;\n"
#endif
    "xscvspdp %x2,%x0;\n"
    "fctiw %2,%2;\n"
    "mfvsrd %1,%x2;\n"
    : "+wa" (__A),
      "=r" (res),
      "=f" (dtmp)
    : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode. */

/* Intel intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  __m64 res = 0;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
    "xxsldwi %x0,%x0,%x0,3;\n"
#endif
    "xscvspdp %x2,%x0;\n"
    "fctid %2,%2;\n"
    "mfvsrd %1,%x2;\n"
    : "+wa" (__A),
      "=r" (res),
      "=f" (dtmp)
    : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}

/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}

/* Constants for use with _mm_prefetch. */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC implementations ignore the hint parameter. */
  __builtin_prefetch (__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves. */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element. */
  float temp = __A[0];
  /* Truncate to 32-bit integer and return. */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Intel intrinsic.
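   Truncate the lower SPFP value to a 64-bit integer (round toward zero);
   for example, 2.9f becomes 2 and -2.9f becomes -2.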
 */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element. */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return. */
  return temp;
}

/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element. */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return. */
  return temp;
}

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves. */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A. */
/* Intel intrinsic. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
          { ((__vector unsigned long long)vf1) [0],
            ((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.
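   For example (illustrative), an __m64 holding the 16-bit values
   { 1, -2, 3, -4 } converts to the vector { 1.0f, -2.0f, 3.0f, -4.0f }.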
*/ 1102 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1103 _mm_cvtpi16_ps (__m64 __A) 1104 { 1105 __vector signed short vs8; 1106 __vector signed int vi4; 1107 __vector float vf1; 1108 1109 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; 1110 vi4 = vec_vupklsh (vs8); 1111 vf1 = (__vector float) vec_ctf (vi4, 0); 1112 1113 return (__m128) vf1; 1114 } 1115 1116 /* Convert the four unsigned 16-bit values in A to SPFP form. */ 1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1118 _mm_cvtpu16_ps (__m64 __A) 1119 { 1120 const __vector unsigned short zero = 1121 { 0, 0, 0, 0, 0, 0, 0, 0 }; 1122 __vector unsigned short vs8; 1123 __vector unsigned int vi4; 1124 __vector float vf1; 1125 1126 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; 1127 vi4 = (__vector unsigned int) vec_mergel 1128 #ifdef __LITTLE_ENDIAN__ 1129 (vs8, zero); 1130 #else 1131 (zero, vs8); 1132 #endif 1133 vf1 = (__vector float) vec_ctf (vi4, 0); 1134 1135 return (__m128) vf1; 1136 } 1137 1138 /* Convert the low four signed 8-bit values in A to SPFP form. */ 1139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1140 _mm_cvtpi8_ps (__m64 __A) 1141 { 1142 __vector signed char vc16; 1143 __vector signed short vs8; 1144 __vector signed int vi4; 1145 __vector float vf1; 1146 1147 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; 1148 vs8 = vec_vupkhsb (vc16); 1149 vi4 = vec_vupkhsh (vs8); 1150 vf1 = (__vector float) vec_ctf (vi4, 0); 1151 1152 return (__m128) vf1; 1153 } 1154 1155 /* Convert the low four unsigned 8-bit values in A to SPFP form. */ 1156 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1157 1158 _mm_cvtpu8_ps (__m64 __A) 1159 { 1160 const __vector unsigned char zero = 1161 { 0, 0, 0, 0, 0, 0, 0, 0 }; 1162 __vector unsigned char vc16; 1163 __vector unsigned short vs8; 1164 __vector unsigned int vi4; 1165 __vector float vf1; 1166 1167 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; 1168 #ifdef __LITTLE_ENDIAN__ 1169 vs8 = (__vector unsigned short) vec_mergel (vc16, zero); 1170 vi4 = (__vector unsigned int) vec_mergeh (vs8, 1171 (__vector unsigned short) zero); 1172 #else 1173 vs8 = (__vector unsigned short) vec_mergel (zero, vc16); 1174 vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, 1175 vs8); 1176 #endif 1177 vf1 = (__vector float) vec_ctf (vi4, 0); 1178 1179 return (__m128) vf1; 1180 } 1181 1182 /* Convert the four signed 32-bit values in A and B to SPFP form. */ 1183 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1184 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B) 1185 { 1186 __vector signed int vi4; 1187 __vector float vf4; 1188 1189 vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; 1190 vf4 = (__vector float) vec_ctf (vi4, 0); 1191 return (__m128) vf4; 1192 } 1193 1194 /* Convert the four SPFP values in A to four signed 16-bit integers. 
*/ 1195 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1196 _mm_cvtps_pi16 (__m128 __A) 1197 { 1198 __v4sf rounded; 1199 __vector signed int temp; 1200 __vector unsigned long long result; 1201 1202 rounded = vec_rint(__A); 1203 temp = vec_cts (rounded, 0); 1204 result = (__vector unsigned long long) vec_pack (temp, temp); 1205 1206 return (__m64) ((__vector long long) result)[0]; 1207 } 1208 1209 /* Convert the four SPFP values in A to four signed 8-bit integers. */ 1210 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1211 _mm_cvtps_pi8 (__m128 __A) 1212 { 1213 __v4sf rounded; 1214 __vector signed int tmp_i; 1215 static const __vector signed int zero = {0, 0, 0, 0}; 1216 __vector signed short tmp_s; 1217 __vector signed char res_v; 1218 1219 rounded = vec_rint(__A); 1220 tmp_i = vec_cts (rounded, 0); 1221 tmp_s = vec_pack (tmp_i, zero); 1222 res_v = vec_pack (tmp_s, tmp_s); 1223 return (__m64) ((__vector long long) res_v)[0]; 1224 } 1225 1226 /* Selects four specific SPFP values from A and B based on MASK. */ 1227 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1228 1229 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) 1230 { 1231 unsigned long element_selector_10 = __mask & 0x03; 1232 unsigned long element_selector_32 = (__mask >> 2) & 0x03; 1233 unsigned long element_selector_54 = (__mask >> 4) & 0x03; 1234 unsigned long element_selector_76 = (__mask >> 6) & 0x03; 1235 static const unsigned int permute_selectors[4] = 1236 { 1237 #ifdef __LITTLE_ENDIAN__ 1238 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 1239 #else 1240 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F 1241 #endif 1242 }; 1243 __vector unsigned int t; 1244 1245 t[0] = permute_selectors[element_selector_10]; 1246 t[1] = permute_selectors[element_selector_32]; 1247 t[2] = permute_selectors[element_selector_54] + 0x10101010; 1248 t[3] = permute_selectors[element_selector_76] + 0x10101010; 1249 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); 1250 } 1251 1252 /* Selects and interleaves the upper two SPFP values from A and B. */ 1253 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1254 _mm_unpackhi_ps (__m128 __A, __m128 __B) 1255 { 1256 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); 1257 } 1258 1259 /* Selects and interleaves the lower two SPFP values from A and B. */ 1260 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1261 _mm_unpacklo_ps (__m128 __A, __m128 __B) 1262 { 1263 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); 1264 } 1265 1266 /* Sets the upper two SPFP values with 64-bits of data loaded from P; 1267 the lower two values are passed through from A. */ 1268 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1269 _mm_loadh_pi (__m128 __A, __m64 const *__P) 1270 { 1271 __vector unsigned long long __a = (__vector unsigned long long)__A; 1272 __vector unsigned long long __p = vec_splats(*__P); 1273 __a [1] = __p [1]; 1274 1275 return (__m128)__a; 1276 } 1277 1278 /* Stores the upper two SPFP values of A into P. 
*/ 1279 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1280 _mm_storeh_pi (__m64 *__P, __m128 __A) 1281 { 1282 __vector unsigned long long __a = (__vector unsigned long long) __A; 1283 1284 *__P = __a[1]; 1285 } 1286 1287 /* Moves the upper two values of B into the lower two values of A. */ 1288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1289 _mm_movehl_ps (__m128 __A, __m128 __B) 1290 { 1291 return (__m128) vec_mergel ((__vector unsigned long long)__B, 1292 (__vector unsigned long long)__A); 1293 } 1294 1295 /* Moves the lower two values of B into the upper two values of A. */ 1296 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1297 _mm_movelh_ps (__m128 __A, __m128 __B) 1298 { 1299 return (__m128) vec_mergeh ((__vector unsigned long long)__A, 1300 (__vector unsigned long long)__B); 1301 } 1302 1303 /* Sets the lower two SPFP values with 64-bits of data loaded from P; 1304 the upper two values are passed through from A. */ 1305 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1306 _mm_loadl_pi (__m128 __A, __m64 const *__P) 1307 { 1308 __vector unsigned long long __a = (__vector unsigned long long)__A; 1309 __vector unsigned long long __p = vec_splats(*__P); 1310 __a [0] = __p [0]; 1311 1312 return (__m128)__a; 1313 } 1314 1315 /* Stores the lower two SPFP values of A into P. */ 1316 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1317 _mm_storel_pi (__m64 *__P, __m128 __A) 1318 { 1319 __vector unsigned long long __a = (__vector unsigned long long) __A; 1320 1321 *__P = __a[0]; 1322 } 1323 1324 #ifdef _ARCH_PWR8 1325 /* Intrinsic functions that require PowerISA 2.07 minimum. */ 1326 1327 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ 1328 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1329 _mm_movemask_ps (__m128 __A) 1330 { 1331 __vector unsigned long long result; 1332 static const __vector unsigned int perm_mask = 1333 { 1334 #ifdef __LITTLE_ENDIAN__ 1335 0x00204060, 0x80808080, 0x80808080, 0x80808080 1336 #else 1337 0x80808080, 0x80808080, 0x80808080, 0x00204060 1338 #endif 1339 }; 1340 1341 result = ((__vector unsigned long long) 1342 vec_vbpermq ((__vector unsigned char) __A, 1343 (__vector unsigned char) perm_mask)); 1344 1345 #ifdef __LITTLE_ENDIAN__ 1346 return result[1]; 1347 #else 1348 return result[0]; 1349 #endif 1350 } 1351 #endif /* _ARCH_PWR8 */ 1352 1353 /* Create a vector with all four elements equal to *P. */ 1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1355 _mm_load1_ps (float const *__P) 1356 { 1357 return _mm_set1_ps (*__P); 1358 } 1359 1360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1361 _mm_load_ps1 (float const *__P) 1362 { 1363 return _mm_load1_ps (__P); 1364 } 1365 1366 /* Extracts one of the four words of A. The selector N must be immediate. 
*/ 1367 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1368 _mm_extract_pi16 (__m64 const __A, int const __N) 1369 { 1370 unsigned int shiftr = __N & 3; 1371 #ifdef __BIG_ENDIAN__ 1372 shiftr = 3 - shiftr; 1373 #endif 1374 1375 return ((__A >> (shiftr * 16)) & 0xffff); 1376 } 1377 1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1379 _m_pextrw (__m64 const __A, int const __N) 1380 { 1381 return _mm_extract_pi16 (__A, __N); 1382 } 1383 1384 /* Inserts word D into one of four words of A. The selector N must be 1385 immediate. */ 1386 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1387 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) 1388 { 1389 const int shiftl = (__N & 3) * 16; 1390 const __m64 shiftD = (const __m64) __D << shiftl; 1391 const __m64 mask = 0xffffUL << shiftl; 1392 __m64 result = (__A & (~mask)) | (shiftD & mask); 1393 1394 return (result); 1395 } 1396 1397 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1398 _m_pinsrw (__m64 const __A, int const __D, int const __N) 1399 { 1400 return _mm_insert_pi16 (__A, __D, __N); 1401 } 1402 1403 /* Compute the element-wise maximum of signed 16-bit values. */ 1404 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1405 1406 _mm_max_pi16 (__m64 __A, __m64 __B) 1407 { 1408 #if _ARCH_PWR8 1409 __vector signed short a, b, r; 1410 __vector __bool short c; 1411 1412 a = (__vector signed short)vec_splats (__A); 1413 b = (__vector signed short)vec_splats (__B); 1414 c = (__vector __bool short)vec_cmpgt (a, b); 1415 r = vec_sel (b, a, c); 1416 return (__m64) ((__vector long long) r)[0]; 1417 #else 1418 __m64_union m1, m2, res; 1419 1420 m1.as_m64 = __A; 1421 m2.as_m64 = __B; 1422 1423 res.as_short[0] = 1424 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; 1425 res.as_short[1] = 1426 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; 1427 res.as_short[2] = 1428 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; 1429 res.as_short[3] = 1430 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; 1431 1432 return (__m64) res.as_m64; 1433 #endif 1434 } 1435 1436 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1437 _m_pmaxsw (__m64 __A, __m64 __B) 1438 { 1439 return _mm_max_pi16 (__A, __B); 1440 } 1441 1442 /* Compute the element-wise maximum of unsigned 8-bit values. */ 1443 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1444 _mm_max_pu8 (__m64 __A, __m64 __B) 1445 { 1446 #if _ARCH_PWR8 1447 __vector unsigned char a, b, r; 1448 __vector __bool char c; 1449 1450 a = (__vector unsigned char)vec_splats (__A); 1451 b = (__vector unsigned char)vec_splats (__B); 1452 c = (__vector __bool char)vec_cmpgt (a, b); 1453 r = vec_sel (b, a, c); 1454 return (__m64) ((__vector long long) r)[0]; 1455 #else 1456 __m64_union m1, m2, res; 1457 long i; 1458 1459 m1.as_m64 = __A; 1460 m2.as_m64 = __B; 1461 1462 1463 for (i = 0; i < 8; i++) 1464 res.as_char[i] = 1465 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? 
1466 m1.as_char[i] : m2.as_char[i]; 1467 1468 return (__m64) res.as_m64; 1469 #endif 1470 } 1471 1472 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1473 _m_pmaxub (__m64 __A, __m64 __B) 1474 { 1475 return _mm_max_pu8 (__A, __B); 1476 } 1477 1478 /* Compute the element-wise minimum of signed 16-bit values. */ 1479 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1480 _mm_min_pi16 (__m64 __A, __m64 __B) 1481 { 1482 #if _ARCH_PWR8 1483 __vector signed short a, b, r; 1484 __vector __bool short c; 1485 1486 a = (__vector signed short)vec_splats (__A); 1487 b = (__vector signed short)vec_splats (__B); 1488 c = (__vector __bool short)vec_cmplt (a, b); 1489 r = vec_sel (b, a, c); 1490 return (__m64) ((__vector long long) r)[0]; 1491 #else 1492 __m64_union m1, m2, res; 1493 1494 m1.as_m64 = __A; 1495 m2.as_m64 = __B; 1496 1497 res.as_short[0] = 1498 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; 1499 res.as_short[1] = 1500 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; 1501 res.as_short[2] = 1502 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; 1503 res.as_short[3] = 1504 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; 1505 1506 return (__m64) res.as_m64; 1507 #endif 1508 } 1509 1510 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1511 _m_pminsw (__m64 __A, __m64 __B) 1512 { 1513 return _mm_min_pi16 (__A, __B); 1514 } 1515 1516 /* Compute the element-wise minimum of unsigned 8-bit values. */ 1517 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1518 _mm_min_pu8 (__m64 __A, __m64 __B) 1519 { 1520 #if _ARCH_PWR8 1521 __vector unsigned char a, b, r; 1522 __vector __bool char c; 1523 1524 a = (__vector unsigned char)vec_splats (__A); 1525 b = (__vector unsigned char)vec_splats (__B); 1526 c = (__vector __bool char)vec_cmplt (a, b); 1527 r = vec_sel (b, a, c); 1528 return (__m64) ((__vector long long) r)[0]; 1529 #else 1530 __m64_union m1, m2, res; 1531 long i; 1532 1533 m1.as_m64 = __A; 1534 m2.as_m64 = __B; 1535 1536 1537 for (i = 0; i < 8; i++) 1538 res.as_char[i] = 1539 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? 1540 m1.as_char[i] : m2.as_char[i]; 1541 1542 return (__m64) res.as_m64; 1543 #endif 1544 } 1545 1546 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1547 _m_pminub (__m64 __A, __m64 __B) 1548 { 1549 return _mm_min_pu8 (__A, __B); 1550 } 1551 1552 /* Create an 8-bit mask of the signs of 8-bit values. */ 1553 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1554 _mm_movemask_pi8 (__m64 __A) 1555 { 1556 unsigned long long p = 1557 #ifdef __LITTLE_ENDIAN__ 1558 0x0008101820283038UL; // permute control for sign bits 1559 #else 1560 0x3830282018100800UL; // permute control for sign bits 1561 #endif 1562 return __builtin_bpermd (p, __A); 1563 } 1564 1565 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1566 _m_pmovmskb (__m64 __A) 1567 { 1568 return _mm_movemask_pi8 (__A); 1569 } 1570 1571 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values 1572 in B and produce the high 16 bits of the 32-bit results. 
*/ 1573 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1574 _mm_mulhi_pu16 (__m64 __A, __m64 __B) 1575 { 1576 __vector unsigned short a, b; 1577 __vector unsigned short c; 1578 __vector unsigned int w0, w1; 1579 __vector unsigned char xform1 = { 1580 #ifdef __LITTLE_ENDIAN__ 1581 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 1582 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F 1583 #else 1584 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 1585 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 1586 #endif 1587 }; 1588 1589 a = (__vector unsigned short)vec_splats (__A); 1590 b = (__vector unsigned short)vec_splats (__B); 1591 1592 w0 = vec_vmuleuh (a, b); 1593 w1 = vec_vmulouh (a, b); 1594 c = (__vector unsigned short)vec_perm (w0, w1, xform1); 1595 1596 return (__m64) ((__vector long long) c)[0]; 1597 } 1598 1599 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1600 _m_pmulhuw (__m64 __A, __m64 __B) 1601 { 1602 return _mm_mulhi_pu16 (__A, __B); 1603 } 1604 1605 /* Return a combination of the four 16-bit values in A. The selector 1606 must be an immediate. */ 1607 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1608 _mm_shuffle_pi16 (__m64 __A, int const __N) 1609 { 1610 unsigned long element_selector_10 = __N & 0x03; 1611 unsigned long element_selector_32 = (__N >> 2) & 0x03; 1612 unsigned long element_selector_54 = (__N >> 4) & 0x03; 1613 unsigned long element_selector_76 = (__N >> 6) & 0x03; 1614 static const unsigned short permute_selectors[4] = 1615 { 1616 #ifdef __LITTLE_ENDIAN__ 1617 0x0908, 0x0B0A, 0x0D0C, 0x0F0E 1618 #else 1619 0x0607, 0x0405, 0x0203, 0x0001 1620 #endif 1621 }; 1622 __m64_union t; 1623 __vector unsigned long long a, p, r; 1624 1625 #ifdef __LITTLE_ENDIAN__ 1626 t.as_short[0] = permute_selectors[element_selector_10]; 1627 t.as_short[1] = permute_selectors[element_selector_32]; 1628 t.as_short[2] = permute_selectors[element_selector_54]; 1629 t.as_short[3] = permute_selectors[element_selector_76]; 1630 #else 1631 t.as_short[3] = permute_selectors[element_selector_10]; 1632 t.as_short[2] = permute_selectors[element_selector_32]; 1633 t.as_short[1] = permute_selectors[element_selector_54]; 1634 t.as_short[0] = permute_selectors[element_selector_76]; 1635 #endif 1636 p = vec_splats (t.as_m64); 1637 a = vec_splats (__A); 1638 r = vec_perm (a, a, (__vector unsigned char)p); 1639 return (__m64) ((__vector long long) r)[0]; 1640 } 1641 1642 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1643 _m_pshufw (__m64 __A, int const __N) 1644 { 1645 return _mm_shuffle_pi16 (__A, __N); 1646 } 1647 1648 /* Conditionally store byte elements of A into P. The high bit of each 1649 byte in the selector N determines whether the corresponding byte from 1650 A is stored. */ 1651 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1652 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) 1653 { 1654 __m64 hibit = 0x8080808080808080UL; 1655 __m64 mask, tmp; 1656 __m64 *p = (__m64*)__P; 1657 1658 tmp = *p; 1659 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); 1660 tmp = (tmp & (~mask)) | (__A & mask); 1661 *p = tmp; 1662 } 1663 1664 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 1665 _m_maskmovq (__m64 __A, __m64 __N, char *__P) 1666 { 1667 _mm_maskmove_si64 (__A, __N, __P); 1668 } 1669 1670 /* Compute the rounded averages of the unsigned 8-bit values in A and B. 
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  __m64_union result = {0};

  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost 32-bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result.as_short[0] = vsum[3];
  return result.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ (
    " dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}
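/* Illustrative usage sketch (not compiled; guarded by `#if 0'): sum of
   absolute differences over two 8-byte blocks, the typical building block of
   block-matching (motion-estimation) loops.  The helper name `block_sad_8'
   and its arguments are assumptions made for this example only; it relies on
   the SSE _mm_extract_pi16 defined earlier in this header.  */
#if 0
#include <string.h>

static unsigned int
block_sad_8 (const unsigned char *p, const unsigned char *q)
{
  __m64 a, b;

  /* __m64 is 8-byte aligned; copy through memcpy so unaligned inputs are
     handled as well.  */
  memcpy (&a, p, sizeof a);
  memcpy (&b, q, sizeof b);

  /* The 16-bit sum lands in element 0 and the other elements are zero,
     so extracting halfword 0 yields the full result (at most 8 * 255).  */
  return (unsigned int) _mm_extract_pi16 (_mm_sad_pu8 (a, b), 0);
}
#endif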
/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a lightweight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't
     know what PRI this thread is running at we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile (
    " mfppr %0;"
    " or 31,31,31;"
    " isync;"
    " lwsync;"
    " isync;"
    " mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors, where we may not even have Program Priority
     controls, we can only depend on Heavy Weight Sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
} while (0)

/* For backward source compatibility.  */
//# include <emmintrin.h>

#else
#include_next <xmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */

#endif /* _XMMINTRIN_H_INCLUDED */
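/* Illustrative usage sketch (not compiled; guarded by `#if 0'): transposing a
   row-major 4x4 float matrix in place with _MM_TRANSPOSE4_PS.  The helper
   name `transpose4x4' is an assumption made for this example only.  */
#if 0
static void
transpose4x4 (float m[4][4])
{
  /* _mm_loadu_ps tolerates unaligned rows; _mm_load_ps could be used
     instead when each row is known to be 16-byte aligned.  */
  __m128 row0 = _mm_loadu_ps (m[0]);
  __m128 row1 = _mm_loadu_ps (m[1]);
  __m128 row2 = _mm_loadu_ps (m[2]);
  __m128 row3 = _mm_loadu_ps (m[3]);

  /* Afterwards row0..row3 hold the columns of the original matrix.  */
  _MM_TRANSPOSE4_PS (row0, row1, row2, row3);

  _mm_storeu_ps (m[0], row0);
  _mm_storeu_ps (m[1], row1);
  _mm_storeu_ps (m[2], row2);
  _mm_storeu_ps (m[3], row3);
}
#endif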