/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into
   a 128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make the implementation more efficient.

   It is the user's responsibility to determine whether the results of such
   a port are acceptable or further changes are needed. Please note that
   much code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar
   operations, or 128-bit SSE/Altivec operations, which are
   recommended instead.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;

/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
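
/* Illustrative sketch (not part of the Intel API; the __example_* name is
   hypothetical): because __m64 is a plain 64-bit scalar here, the
   conversions above are just integer casts.  _mm_cvtsi32_si64 zero-extends,
   so the round trip below returns __x unchanged while the upper 32 bits of
   the temporary stay clear.  */
static __inline int __example_cvt_roundtrip(int __x) {
  __m64 __v = _mm_cvtsi32_si64(__x); /* zero-extended into a 64-bit scalar */
  return _mm_cvtsi64_si32(__v);      /* take back the low 32 bits */
}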

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}
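
/* Illustrative sketch (hypothetical __example_* helper, not part of the
   API): _mm_packs_pi16 narrows each signed 16-bit lane to 8 bits with
   signed saturation, so 0x7fff (32767) becomes 0x7f (127) and 0x8001
   (-32767) becomes 0x80 (-128), independent of lane order.  */
static __inline __m64 __example_packs_saturation(void) {
  __m64 __a = _mm_cvtsi64_m64(0x7fff800100020003LL); /* four signed shorts */
  return _mm_packs_pi16(__a, __a); /* each short saturated to a byte */
}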

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
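
/* Illustrative sketch (hypothetical __example_* helper): with byte lanes
   numbered 0..7 from least significant on powerpc64le, the high halves
   {4,5,6,7} of the two operands are interleaved, which for the inputs
   below is expected to yield 0x0f070e060d050c04.  */
static __inline __m64 __example_unpackhi_bytes(void) {
  __m64 __a = _mm_cvtsi64_m64(0x0706050403020100LL);
  __m64 __b = _mm_cvtsi64_m64(0x0f0e0d0c0b0a0908LL);
  return _mm_unpackhi_pi8(__a, __b);
}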

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}
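
/* Illustrative sketch (hypothetical __example_* helper): _mm_add_pi8 adds
   each byte lane modulo 256, with no carry between lanes.  Every 0xff lane
   below wraps to 0x00 when 0x01 is added, so the packed sum is 0, whereas
   an ordinary 64-bit addition of the two scalars would propagate carries
   across the byte boundaries.  */
static __inline __m64 __example_add_pi8_wraps(void) {
  __m64 __a = _mm_cvtsi64_m64(-1LL); /* every byte lane is 0xff */
  __m64 __b = _mm_cvtsi64_m64(0x0101010101010101LL);
  return _mm_add_pi8(__a, __b); /* all lanes wrap to 0x00 */
}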

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}
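
/* Illustrative sketch (hypothetical __example_* helper): the classic MMX
   blend idiom built from the bitwise intrinsics above.  For each bit set in
   __mask the result takes the bit from __a, otherwise from __b.  */
static __inline __m64 __example_select_bits(__m64 __mask, __m64 __a,
                                            __m64 __b) {
  return _mm_or_si64(_mm_and_si64(__mask, __a), _mm_andnot_si64(__mask, __b));
}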

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}
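
/* Illustrative sketch (hypothetical __example_* helper): unlike the wrapping
   _mm_add_pi8, _mm_adds_pu8 clamps each unsigned byte lane at 0xff, so
   adding 1 to lanes that already hold 0xff leaves them at 0xff.  */
static __inline __m64 __example_adds_pu8_clamps(void) {
  __m64 __a = _mm_cvtsi64_m64(-1LL); /* every byte lane is 0xff */
  __m64 __b = _mm_cvtsi64_m64(0x0101010101010101LL);
  return _mm_adds_pu8(__a, __b); /* every lane stays saturated at 0xff */
}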

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}
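
/* Illustrative sketch (hypothetical __example_* helper): with every 16-bit
   lane of one operand holding 2 and of the other holding 3, _mm_madd_pi16
   forms the four products 2*3 and sums them in pairs, leaving 12 in each
   32-bit half of the result (0x0000000c0000000c).  */
static __inline __m64 __example_madd_pairs(void) {
  __m64 __a = _mm_cvtsi64_m64(0x0002000200020002LL); /* 2 in each lane */
  __m64 __b = _mm_cvtsi64_m64(0x0003000300030003LL); /* 3 in each lane */
  return _mm_madd_pi16(__a, __b); /* 2*3 + 2*3 = 12 per 32-bit half */
}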

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}
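
/* Illustrative sketch (hypothetical __example_* helpers): on 16-bit lanes
   holding 0x8000, an arithmetic right shift by 1 replicates the sign bit
   (0xc000 per lane) while a logical right shift inserts zeros (0x4000 per
   lane).  The lanes are uniform, so the contrast is endian-independent.  */
static __inline __m64 __example_sra_pi16(void) {
  return _mm_srai_pi16(_mm_cvtsi64_m64((long long)0x8000800080008000ULL), 1);
}

static __inline __m64 __example_srl_pi16(void) {
  return _mm_srli_pi16(_mm_cvtsi64_m64((long long)0x8000800080008000ULL), 1);
}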

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}
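
/* Illustrative sketch (hypothetical __example_* helper): _mm_set_pi32 puts
   its last argument in the first 32-bit lane, so on powerpc64le the scalar
   value returned below is expected to be 0x1111111122222222.  */
static __inline long long __example_set_pi32_order(void) {
  return _mm_cvtm64_si64(_mm_set_pi32(0x11111111, 0x22222222));
}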

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
          (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */