/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu zero = { 0 };
          return (__m128i) zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}
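
/* MMX palignr: concatenate __A:__B into a 128-bit value, shift right by
   __count bytes, and keep the low 64 bits.  Counts of 16 or more return
   zero.  */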
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}
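
/* MMX variant of the saturating horizontal add: both 64-bit operands fit
   in one vector register, so a single vec_sum4s/vec_packs pair produces
   the four saturated pair sums.  */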
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}
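
/* pshufb: permute the bytes of __A according to the low bits of each
   control byte in __B; control bytes with the sign bit set select zero
   instead (handled with vec_sel against a zero vector).  */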
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
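
/* pmaddubsw: multiply unsigned bytes of __A by the corresponding signed
   bytes of __B, then add adjacent halfword products with signed
   saturation.  The unsigned operand is widened by masking the
   sign-extended unpack with 0x00ff.  */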
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */

#endif /* TMMINTRIN_H_ */