/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
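
/* Illustrative note (added, not in the original header): each composite
   macro above just ORs one rounding direction with an exception-control
   bit, e.g. _MM_FROUND_FLOOR is _MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC.
   A minimal usage sketch:

     __m128d __t = _mm_round_pd(__v, _MM_FROUND_FLOOR);

   takes the _MM_FROUND_TO_NEG_INF case in _mm_round_pd below and is
   implemented with vec_floor. */
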
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}
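
/* Illustrative sketch (added, not in the original header), assuming the
   SSE2 wrapper's _mm_set_pd is reachable through <tmmintrin.h>:
   _mm_round_sd rounds every lane of __B and then keeps only lane 0,
   taking lane 1 unchanged from __A.

     __m128d __a = _mm_set_pd(8.0, 7.0);  (lane 1 = 8.0, lane 0 = 7.0)
     __m128d __b = _mm_set_pd(4.0, 1.5);
     __m128d __r = _mm_round_sd(__a, __b, _MM_FROUND_TRUNC);
     (lane 0 = trunc(1.5) = 1.0, lane 1 = 8.0)
 */
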
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}
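
/* Illustrative note (added, not in the original header): when
   _MM_FROUND_NO_EXC is set, _mm_round_pd and _mm_round_ps save the FPSCR
   exception-enable bits, clear them for the duration of the rounding, and
   restore them afterwards, so for example

     __m128 __t = _mm_round_ps(__v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

   truncates each lane with all floating-point exception enables masked,
   while _MM_FROUND_TRUNC performs the same truncation with the current
   enables left in place. */
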
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
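
/* Illustrative note (added, not in the original header): the element
   selector is masked (& 0xf, & 3, & 1 above), so out-of-range selectors
   simply wrap, and _mm_extract_ps returns the raw 32-bit pattern of the
   selected float rather than a converted integer value.  Assuming the
   SSE2 wrapper's _mm_set_epi32 is available:

     __m128i __v = _mm_set_epi32(3, 2, 1, 0);
     int __e = _mm_extract_epi32(__v, 6);   (6 & 3 == 2, so __e == 2)
 */
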
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif
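
/* Illustrative note (added, not in the original header): in the immediate
   forms, bit i of __imm8 selects element i from __B and a clear bit keeps
   the element from __A; in the variable forms, the most significant bit of
   each mask element makes the selection.  Assuming the SSE2 wrapper's
   _mm_set_pd is available:

     __m128d __a = _mm_set_pd(10.0, 20.0);
     __m128d __b = _mm_set_pd(30.0, 40.0);
     __m128d __r = _mm_blend_pd(__a, __b, 0x1);
     (lane 0 = 40.0 from __b, lane 1 = 10.0 from __a)
 */
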
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
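
/* Illustrative note (added, not in the original header): these helpers
   return plain int predicates instead of setting condition flags --
   _mm_testz_si128 is nonzero when (__A & __B) is all zeros, and
   _mm_testc_si128 is nonzero when (~__A & __B) is all zeros.  Assuming
   the SSE2 wrapper's _mm_set1_epi32 and _mm_setzero_si128 are available:

     __m128i __ones = _mm_set1_epi32(-1);
     int __z = _mm_testz_si128(__ones, _mm_setzero_si128());   (== 1)
     int __all = _mm_test_all_ones(__ones);                    (== 1)
 */
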
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}
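
/* Illustrative note (added, not in the original header): the _mm_cvtepi*
   conversions sign-extend the low elements via vec_unpackh, while the
   _mm_cvtepu* conversions zero-extend by merging with a zero vector.
   Assuming the SSE2 wrapper's _mm_set1_epi8 is available:

     __m128i __v = _mm_set1_epi8(-1);
     __m128i __s = _mm_cvtepi8_epi16(__v);   (each halfword == -1)
     __m128i __u = _mm_cvtepu8_epi16(__v);   (each halfword == 255)
 */
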
/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */
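
/* Illustrative usage sketch (added, not in the original header): on a
   powerpc64/powerpc64le target this wrapper is used in place of the x86
   header, and the warning above is silenced with -DNO_WARN_X86_INTRINSICS,
   e.g.

     #include <smmintrin.h>

     int lowest_halfword(__m128i __v) {
       return _mm_extract_epi16(_mm_minpos_epu16(__v), 0);
     }

   assuming the SSE2 wrapper's _mm_extract_epi16 is reachable through
   <tmmintrin.h>. */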