/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__ppc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi16(__m128i __A) {
  return (__m128i)vec_abs((__v8hi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi32(__m128i __A) {
  return (__m128i)vec_abs((__v4si)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi8(__m128i __A) {
  return (__m128i)vec_abs((__v16qi)__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi16(__m64 __A) {
  __v8hi __B = (__v8hi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi32(__m64 __A) {
  __v4si __B = (__v4si)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi8(__m64 __A) {
  __v16qi __B = (__v16qi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
  if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
    __B = (__m128i)vec_reve((__v16qu)__B);
#endif
    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
#endif
    return __A;
  }

  if (__count == 0)
    return __B;

  if (__count >= 16) {
    if (__count >= 32) {
      const __v16qu __zero = {0};
      return (__m128i)__zero;
    } else {
      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
      return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
      return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
    }
  } else {
    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
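    /* The octet shifts act on the register as a whole, so with the
       little-endian element order the directions applied to __A and __B
       are swapped relative to the big-endian path in the #else branch.  */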
    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
  }
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
  if (__count < 16) {
    __v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
    const __v4su __shift = {__count << 3, 0, 0, 0};
    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
    const __v4su __shift = {0, 0, 0, __count << 3};
    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
    return (__m64)__C[0];
  } else {
    const __m64 __zero = {0};
    return __zero;
  }
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_epi16(__m128i __A, __m128i __B) {
  __v4si __C = {0}, __D = {0};
  __C = vec_sum4s((__v8hi)__A, __C);
  __D = vec_sum4s((__v8hi)__B, __D);
  __C = (__v4si)vec_packs(__C, __D);
  return (__m128i)__C;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_pi16(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v4si __D = vec_sum4s(__C, __zero);
  __C = vec_packs(__D, __D);
  return (__m64)((__v2du)__C)[1];
}
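/* For the horizontal subtracts below, one permute gathers the first element
   of every adjacent pair and the other gathers the second, so a single
   element-wise subtract produces all of the pairwise differences.  */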
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}
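/* The 64-bit variant duplicates both operands into each half of a full
   vector, applies the same permute-and-zero sequence as _mm_shuffle_epi8,
   and returns doubleword element 0 of the result.  */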
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif
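/* _mm_maddubs_epi16: zero-extend the unsigned bytes of __A and sign-extend
   the signed bytes of __B to 16 bits, multiply element-wise, then use the
   two permutes to pair up adjacent products for the final saturating add.  */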
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */