/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }
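
  /* Dispatch on the rounding-mode immediate: round-to-nearest forces the
     FPSCR rounding mode to nearest-even around vec_rint, the directed
     modes map to vec_floor/vec_ceil/vec_trunc, and _MM_FROUND_CUR_DIRECTION
     rounds with whatever mode is currently in the FPSCR.  */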
  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}
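
/* Note: _mm_extract_ps returns the raw bit pattern of the selected
   single-precision element as an int, matching the Intel intrinsic's
   return type.  */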
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}
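
/* As in _mm_blend_ps above, each __pcv entry below is a vec_perm control
   vector: byte indices 0-15 select bytes of __A and indices 16-31 select
   bytes of __B, with one entry per value of the blend immediate.  */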
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}
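
/* _mm_mullo_epi32 keeps the low 32 bits of each 32 x 32-bit product;
   _mm_mul_epi32 below instead returns two full 64-bit products of
   signed 32-bit elements.  */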
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */