1 /*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 * The Arm C Language Extensions specifications can be found in the following 8 * link: https://github.com/ARM-software/acle/releases 9 * 10 * The ACLE section numbers are subject to change. When consulting the 11 * specifications, it is recommended to search using section titles if 12 * the section numbers look outdated. 13 * 14 *===-----------------------------------------------------------------------=== 15 */ 16 17 #ifndef __ARM_ACLE_H 18 #define __ARM_ACLE_H 19 20 #ifndef __ARM_ACLE 21 #error "ACLE intrinsics support not enabled." 22 #endif 23 24 #include <stdint.h> 25 26 #if defined(__cplusplus) 27 extern "C" { 28 #endif 29 30 /* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */ 31 /* 7.3 Memory barriers */ 32 #if !__has_builtin(__dmb) 33 #define __dmb(i) __builtin_arm_dmb(i) 34 #endif 35 #if !__has_builtin(__dsb) 36 #define __dsb(i) __builtin_arm_dsb(i) 37 #endif 38 #if !__has_builtin(__isb) 39 #define __isb(i) __builtin_arm_isb(i) 40 #endif 41 42 /* 7.4 Hints */ 43 44 #if !__has_builtin(__wfi) 45 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) { 46 __builtin_arm_wfi(); 47 } 48 #endif 49 50 #if !__has_builtin(__wfe) 51 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) { 52 __builtin_arm_wfe(); 53 } 54 #endif 55 56 #if !__has_builtin(__sev) 57 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) { 58 __builtin_arm_sev(); 59 } 60 #endif 61 62 #if !__has_builtin(__sevl) 63 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) { 64 __builtin_arm_sevl(); 65 } 66 #endif 67 68 #if !__has_builtin(__yield) 69 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) { 70 __builtin_arm_yield(); 71 } 72 #endif 73 74 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE 75 #define __dbg(t) __builtin_arm_dbg(t) 76 #endif 77 78 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 79 #define _CHKFEAT_GCS 1 80 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) 81 __chkfeat(uint64_t __features) { 82 return __builtin_arm_chkfeat(__features) ^ __features; 83 } 84 #endif 85 86 /* 7.5 Swap */ 87 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 88 __swp(uint32_t __x, volatile uint32_t *__p) { 89 uint32_t v; 90 do 91 v = __builtin_arm_ldrex(__p); 92 while (__builtin_arm_strex(__x, __p)); 93 return v; 94 } 95 96 /* 7.6 Memory prefetch intrinsics */ 97 /* 7.6.1 Data prefetch */ 98 #define __pld(addr) __pldx(0, 0, 0, addr) 99 100 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE 101 #define __pldx(access_kind, cache_level, retention_policy, addr) \ 102 __builtin_arm_prefetch(addr, access_kind, 1) 103 #else 104 #define __pldx(access_kind, cache_level, retention_policy, addr) \ 105 __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1) 106 #endif 107 108 /* 7.6.2 Instruction prefetch */ 109 #define __pli(addr) __plix(0, 0, addr) 110 111 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE 112 #define __plix(cache_level, retention_policy, addr) \ 113 __builtin_arm_prefetch(addr, 0, 0) 114 #else 115 #define __plix(cache_level, retention_policy, addr) \ 116 __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0) 117 #endif 118 119 /* 7.7 NOP */ 120 #if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__)) 121 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) { 122 __builtin_arm_nop(); 123 } 124 #endif 125 126 /* 8 DATA-PROCESSING INTRINSICS */ 127 /* 8.2 Miscellaneous data-processing intrinsics */ 128 /* ROR */ 129 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 130 __ror(uint32_t __x, uint32_t __y) { 131 __y %= 32; 132 if (__y == 0) 133 return __x; 134 return (__x >> __y) | (__x << (32 - __y)); 135 } 136 137 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) 138 __rorll(uint64_t __x, uint32_t __y) { 139 __y %= 64; 140 if (__y == 0) 141 return __x; 142 return (__x >> __y) | (__x << (64 - __y)); 143 } 144 145 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) 146 __rorl(unsigned long __x, uint32_t __y) { 147 #if __SIZEOF_LONG__ == 4 148 return __ror(__x, __y); 149 #else 150 return __rorll(__x, __y); 151 #endif 152 } 153 154 155 /* CLZ */ 156 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 157 __clz(uint32_t __t) { 158 return __builtin_arm_clz(__t); 159 } 160 161 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 162 __clzl(unsigned long __t) { 163 #if __SIZEOF_LONG__ == 4 164 return __builtin_arm_clz(__t); 165 #else 166 return __builtin_arm_clz64(__t); 167 #endif 168 } 169 170 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 171 __clzll(uint64_t __t) { 172 return __builtin_arm_clz64(__t); 173 } 174 175 /* CLS */ 176 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 177 __cls(uint32_t __t) { 178 return __builtin_arm_cls(__t); 179 } 180 181 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 182 __clsl(unsigned long __t) { 183 #if __SIZEOF_LONG__ == 4 184 return __builtin_arm_cls(__t); 185 #else 186 return __builtin_arm_cls64(__t); 187 #endif 188 } 189 190 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 191 __clsll(uint64_t __t) { 192 return __builtin_arm_cls64(__t); 193 } 194 195 /* REV */ 196 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 197 __rev(uint32_t __t) { 198 return __builtin_bswap32(__t); 199 } 200 201 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) 202 __revl(unsigned long __t) { 203 #if __SIZEOF_LONG__ == 4 204 return __builtin_bswap32(__t); 205 #else 206 return __builtin_bswap64(__t); 207 #endif 208 } 209 210 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) 211 __revll(uint64_t __t) { 212 return __builtin_bswap64(__t); 213 } 214 215 /* REV16 */ 216 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 217 __rev16(uint32_t __t) { 218 return __ror(__rev(__t), 16); 219 } 220 221 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) 222 __rev16ll(uint64_t __t) { 223 return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t); 224 } 225 226 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) 227 __rev16l(unsigned long __t) { 228 #if __SIZEOF_LONG__ == 4 229 return __rev16(__t); 230 #else 231 return __rev16ll(__t); 232 #endif 233 } 234 235 /* REVSH */ 236 static __inline__ int16_t __attribute__((__always_inline__, __nodebug__)) 237 __revsh(int16_t __t) { 238 return (int16_t)__builtin_bswap16((uint16_t)__t); 239 } 240 241 /* RBIT */ 242 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 243 __rbit(uint32_t __t) { 244 return __builtin_arm_rbit(__t); 245 } 246 247 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) 248 __rbitll(uint64_t __t) { 249 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE 250 return (((uint64_t)__builtin_arm_rbit(__t)) << 32) | 251 __builtin_arm_rbit(__t >> 32); 252 #else 253 return __builtin_arm_rbit64(__t); 254 #endif 255 } 256 257 static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) 258 __rbitl(unsigned long __t) { 259 #if __SIZEOF_LONG__ == 4 260 return __rbit(__t); 261 #else 262 return __rbitll(__t); 263 #endif 264 } 265 266 /* 8.3 16-bit multiplications */ 267 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP 268 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 269 __smulbb(int32_t __a, int32_t __b) { 270 return __builtin_arm_smulbb(__a, __b); 271 } 272 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 273 __smulbt(int32_t __a, int32_t __b) { 274 return __builtin_arm_smulbt(__a, __b); 275 } 276 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 277 __smultb(int32_t __a, int32_t __b) { 278 return __builtin_arm_smultb(__a, __b); 279 } 280 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 281 __smultt(int32_t __a, int32_t __b) { 282 return __builtin_arm_smultt(__a, __b); 283 } 284 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 285 __smulwb(int32_t __a, int32_t __b) { 286 return __builtin_arm_smulwb(__a, __b); 287 } 288 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) 289 __smulwt(int32_t __a, int32_t __b) { 290 return __builtin_arm_smulwt(__a, __b); 291 } 292 #endif 293 294 /* 295 * 8.4 Saturating intrinsics 296 * 297 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag 298 * intrinsics are implemented and the flag is enabled. 299 */ 300 /* 8.4.1 Width-specified saturation intrinsics */ 301 #if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT 302 #define __ssat(x, y) __builtin_arm_ssat(x, y) 303 #define __usat(x, y) __builtin_arm_usat(x, y) 304 #endif 305 306 /* 8.4.2 Saturating addition and subtraction intrinsics */ 307 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP 308 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 309 __qadd(int32_t __t, int32_t __v) { 310 return __builtin_arm_qadd(__t, __v); 311 } 312 313 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 314 __qsub(int32_t __t, int32_t __v) { 315 return __builtin_arm_qsub(__t, __v); 316 } 317 318 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 319 __qdbl(int32_t __t) { 320 return __builtin_arm_qadd(__t, __t); 321 } 322 #endif 323 324 /* 8.4.3 Accumulating multiplications */ 325 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP 326 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 327 __smlabb(int32_t __a, int32_t __b, int32_t __c) { 328 return __builtin_arm_smlabb(__a, __b, __c); 329 } 330 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 331 __smlabt(int32_t __a, int32_t __b, int32_t __c) { 332 return __builtin_arm_smlabt(__a, __b, __c); 333 } 334 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 335 __smlatb(int32_t __a, int32_t __b, int32_t __c) { 336 return __builtin_arm_smlatb(__a, __b, __c); 337 } 338 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 339 __smlatt(int32_t __a, int32_t __b, int32_t __c) { 340 return __builtin_arm_smlatt(__a, __b, __c); 341 } 342 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 343 __smlawb(int32_t __a, int32_t __b, int32_t __c) { 344 return __builtin_arm_smlawb(__a, __b, __c); 345 } 346 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 347 __smlawt(int32_t __a, int32_t __b, int32_t __c) { 348 return __builtin_arm_smlawt(__a, __b, __c); 349 } 350 #endif 351 352 353 /* 8.5.4 Parallel 16-bit saturation */ 354 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 355 #define __ssat16(x, y) __builtin_arm_ssat16(x, y) 356 #define __usat16(x, y) __builtin_arm_usat16(x, y) 357 #endif 358 359 /* 8.5.5 Packing and unpacking */ 360 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 361 typedef int32_t int8x4_t; 362 typedef int32_t int16x2_t; 363 typedef uint32_t uint8x4_t; 364 typedef uint32_t uint16x2_t; 365 366 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 367 __sxtab16(int16x2_t __a, int8x4_t __b) { 368 return __builtin_arm_sxtab16(__a, __b); 369 } 370 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 371 __sxtb16(int8x4_t __a) { 372 return __builtin_arm_sxtb16(__a); 373 } 374 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 375 __uxtab16(int16x2_t __a, int8x4_t __b) { 376 return __builtin_arm_uxtab16(__a, __b); 377 } 378 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 379 __uxtb16(int8x4_t __a) { 380 return __builtin_arm_uxtb16(__a); 381 } 382 #endif 383 384 /* 8.5.6 Parallel selection */ 385 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 386 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 387 __sel(uint8x4_t __a, uint8x4_t __b) { 388 return __builtin_arm_sel(__a, __b); 389 } 390 #endif 391 392 /* 8.5.7 Parallel 8-bit addition and subtraction */ 393 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 394 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 395 __qadd8(int8x4_t __a, int8x4_t __b) { 396 return __builtin_arm_qadd8(__a, __b); 397 } 398 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 399 __qsub8(int8x4_t __a, int8x4_t __b) { 400 return __builtin_arm_qsub8(__a, __b); 401 } 402 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 403 __sadd8(int8x4_t __a, int8x4_t __b) { 404 return __builtin_arm_sadd8(__a, __b); 405 } 406 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 407 __shadd8(int8x4_t __a, int8x4_t __b) { 408 return __builtin_arm_shadd8(__a, __b); 409 } 410 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 411 __shsub8(int8x4_t __a, int8x4_t __b) { 412 return __builtin_arm_shsub8(__a, __b); 413 } 414 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) 415 __ssub8(int8x4_t __a, int8x4_t __b) { 416 return __builtin_arm_ssub8(__a, __b); 417 } 418 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 419 __uadd8(uint8x4_t __a, uint8x4_t __b) { 420 return __builtin_arm_uadd8(__a, __b); 421 } 422 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 423 __uhadd8(uint8x4_t __a, uint8x4_t __b) { 424 return __builtin_arm_uhadd8(__a, __b); 425 } 426 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 427 __uhsub8(uint8x4_t __a, uint8x4_t __b) { 428 return __builtin_arm_uhsub8(__a, __b); 429 } 430 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 431 __uqadd8(uint8x4_t __a, uint8x4_t __b) { 432 return __builtin_arm_uqadd8(__a, __b); 433 } 434 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 435 __uqsub8(uint8x4_t __a, uint8x4_t __b) { 436 return __builtin_arm_uqsub8(__a, __b); 437 } 438 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) 439 __usub8(uint8x4_t __a, uint8x4_t __b) { 440 return __builtin_arm_usub8(__a, __b); 441 } 442 #endif 443 444 /* 8.5.8 Sum of 8-bit absolute differences */ 445 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 446 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 447 __usad8(uint8x4_t __a, uint8x4_t __b) { 448 return __builtin_arm_usad8(__a, __b); 449 } 450 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) 451 __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) { 452 return __builtin_arm_usada8(__a, __b, __c); 453 } 454 #endif 455 456 /* 8.5.9 Parallel 16-bit addition and subtraction */ 457 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 458 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 459 __qadd16(int16x2_t __a, int16x2_t __b) { 460 return __builtin_arm_qadd16(__a, __b); 461 } 462 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 463 __qasx(int16x2_t __a, int16x2_t __b) { 464 return __builtin_arm_qasx(__a, __b); 465 } 466 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 467 __qsax(int16x2_t __a, int16x2_t __b) { 468 return __builtin_arm_qsax(__a, __b); 469 } 470 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 471 __qsub16(int16x2_t __a, int16x2_t __b) { 472 return __builtin_arm_qsub16(__a, __b); 473 } 474 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 475 __sadd16(int16x2_t __a, int16x2_t __b) { 476 return __builtin_arm_sadd16(__a, __b); 477 } 478 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 479 __sasx(int16x2_t __a, int16x2_t __b) { 480 return __builtin_arm_sasx(__a, __b); 481 } 482 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 483 __shadd16(int16x2_t __a, int16x2_t __b) { 484 return __builtin_arm_shadd16(__a, __b); 485 } 486 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 487 __shasx(int16x2_t __a, int16x2_t __b) { 488 return __builtin_arm_shasx(__a, __b); 489 } 490 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 491 __shsax(int16x2_t __a, int16x2_t __b) { 492 return __builtin_arm_shsax(__a, __b); 493 } 494 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 495 __shsub16(int16x2_t __a, int16x2_t __b) { 496 return __builtin_arm_shsub16(__a, __b); 497 } 498 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 499 __ssax(int16x2_t __a, int16x2_t __b) { 500 return __builtin_arm_ssax(__a, __b); 501 } 502 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) 503 __ssub16(int16x2_t __a, int16x2_t __b) { 504 return __builtin_arm_ssub16(__a, __b); 505 } 506 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 507 __uadd16(uint16x2_t __a, uint16x2_t __b) { 508 return __builtin_arm_uadd16(__a, __b); 509 } 510 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 511 __uasx(uint16x2_t __a, uint16x2_t __b) { 512 return __builtin_arm_uasx(__a, __b); 513 } 514 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 515 __uhadd16(uint16x2_t __a, uint16x2_t __b) { 516 return __builtin_arm_uhadd16(__a, __b); 517 } 518 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 519 __uhasx(uint16x2_t __a, uint16x2_t __b) { 520 return __builtin_arm_uhasx(__a, __b); 521 } 522 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 523 __uhsax(uint16x2_t __a, uint16x2_t __b) { 524 return __builtin_arm_uhsax(__a, __b); 525 } 526 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 527 __uhsub16(uint16x2_t __a, uint16x2_t __b) { 528 return __builtin_arm_uhsub16(__a, __b); 529 } 530 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 531 __uqadd16(uint16x2_t __a, uint16x2_t __b) { 532 return __builtin_arm_uqadd16(__a, __b); 533 } 534 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 535 __uqasx(uint16x2_t __a, uint16x2_t __b) { 536 return __builtin_arm_uqasx(__a, __b); 537 } 538 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 539 __uqsax(uint16x2_t __a, uint16x2_t __b) { 540 return __builtin_arm_uqsax(__a, __b); 541 } 542 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 543 __uqsub16(uint16x2_t __a, uint16x2_t __b) { 544 return __builtin_arm_uqsub16(__a, __b); 545 } 546 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 547 __usax(uint16x2_t __a, uint16x2_t __b) { 548 return __builtin_arm_usax(__a, __b); 549 } 550 static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) 551 __usub16(uint16x2_t __a, uint16x2_t __b) { 552 return __builtin_arm_usub16(__a, __b); 553 } 554 #endif 555 556 /* 8.5.10 Parallel 16-bit multiplication */ 557 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 558 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 559 __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) { 560 return __builtin_arm_smlad(__a, __b, __c); 561 } 562 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 563 __smladx(int16x2_t __a, int16x2_t __b, int32_t __c) { 564 return __builtin_arm_smladx(__a, __b, __c); 565 } 566 static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) 567 __smlald(int16x2_t __a, int16x2_t __b, int64_t __c) { 568 return __builtin_arm_smlald(__a, __b, __c); 569 } 570 static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) 571 __smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) { 572 return __builtin_arm_smlaldx(__a, __b, __c); 573 } 574 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 575 __smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) { 576 return __builtin_arm_smlsd(__a, __b, __c); 577 } 578 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 579 __smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) { 580 return __builtin_arm_smlsdx(__a, __b, __c); 581 } 582 static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) 583 __smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) { 584 return __builtin_arm_smlsld(__a, __b, __c); 585 } 586 static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) 587 __smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) { 588 return __builtin_arm_smlsldx(__a, __b, __c); 589 } 590 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 591 __smuad(int16x2_t __a, int16x2_t __b) { 592 return __builtin_arm_smuad(__a, __b); 593 } 594 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 595 __smuadx(int16x2_t __a, int16x2_t __b) { 596 return __builtin_arm_smuadx(__a, __b); 597 } 598 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 599 __smusd(int16x2_t __a, int16x2_t __b) { 600 return __builtin_arm_smusd(__a, __b); 601 } 602 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) 603 __smusdx(int16x2_t __a, int16x2_t __b) { 604 return __builtin_arm_smusdx(__a, __b); 605 } 606 #endif 607 608 /* 8.6 Floating-point data-processing intrinsics */ 609 #if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \ 610 (__ARM_FEATURE_DIRECTED_ROUNDING)) && \ 611 (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE) 612 static __inline__ double __attribute__((__always_inline__, __nodebug__)) 613 __rintn(double __a) { 614 return __builtin_roundeven(__a); 615 } 616 617 static __inline__ float __attribute__((__always_inline__, __nodebug__)) 618 __rintnf(float __a) { 619 return __builtin_roundevenf(__a); 620 } 621 #endif 622 623 /* 8.8 CRC32 intrinsics */ 624 #if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \ 625 (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE) 626 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 627 __crc32b(uint32_t __a, uint8_t __b) { 628 return __builtin_arm_crc32b(__a, __b); 629 } 630 631 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 632 __crc32h(uint32_t __a, uint16_t __b) { 633 return __builtin_arm_crc32h(__a, __b); 634 } 635 636 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 637 __crc32w(uint32_t __a, uint32_t __b) { 638 return __builtin_arm_crc32w(__a, __b); 639 } 640 641 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 642 __crc32d(uint32_t __a, uint64_t __b) { 643 return __builtin_arm_crc32d(__a, __b); 644 } 645 646 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 647 __crc32cb(uint32_t __a, uint8_t __b) { 648 return __builtin_arm_crc32cb(__a, __b); 649 } 650 651 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 652 __crc32ch(uint32_t __a, uint16_t __b) { 653 return __builtin_arm_crc32ch(__a, __b); 654 } 655 656 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 657 __crc32cw(uint32_t __a, uint32_t __b) { 658 return __builtin_arm_crc32cw(__a, __b); 659 } 660 661 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) 662 __crc32cd(uint32_t __a, uint64_t __b) { 663 return __builtin_arm_crc32cd(__a, __b); 664 } 665 #endif 666 667 /* 8.6 Floating-point data-processing intrinsics */ 668 /* Armv8.3-A Javascript conversion intrinsic */ 669 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 670 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a"))) 671 __jcvt(double __a) { 672 return __builtin_arm_jcvt(__a); 673 } 674 #endif 675 676 /* Armv8.5-A FP rounding intrinsics */ 677 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 678 static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 679 __rint32zf(float __a) { 680 return __builtin_arm_rint32zf(__a); 681 } 682 683 static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 684 __rint32z(double __a) { 685 return __builtin_arm_rint32z(__a); 686 } 687 688 static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 689 __rint64zf(float __a) { 690 return __builtin_arm_rint64zf(__a); 691 } 692 693 static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 694 __rint64z(double __a) { 695 return __builtin_arm_rint64z(__a); 696 } 697 698 static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 699 __rint32xf(float __a) { 700 return __builtin_arm_rint32xf(__a); 701 } 702 703 static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 704 __rint32x(double __a) { 705 return __builtin_arm_rint32x(__a); 706 } 707 708 static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 709 __rint64xf(float __a) { 710 return __builtin_arm_rint64xf(__a); 711 } 712 713 static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a"))) 714 __rint64x(double __a) { 715 return __builtin_arm_rint64x(__a); 716 } 717 #endif 718 719 /* 8.9 Armv8.7-A load/store 64-byte intrinsics */ 720 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 721 typedef struct { 722 uint64_t val[8]; 723 } data512_t; 724 725 static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64"))) 726 __arm_ld64b(const void *__addr) { 727 data512_t __value; 728 __builtin_arm_ld64b(__addr, __value.val); 729 return __value; 730 } 731 static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64"))) 732 __arm_st64b(void *__addr, data512_t __value) { 733 __builtin_arm_st64b(__addr, __value.val); 734 } 735 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64"))) 736 __arm_st64bv(void *__addr, data512_t __value) { 737 return __builtin_arm_st64bv(__addr, __value.val); 738 } 739 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64"))) 740 __arm_st64bv0(void *__addr, data512_t __value) { 741 return __builtin_arm_st64bv0(__addr, __value.val); 742 } 743 #endif 744 745 /* 11.1 Special register intrinsics */ 746 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg) 747 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg) 748 #define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg) 749 #define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg) 750 #define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg)) 751 #define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg)) 752 #define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v) 753 #define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v) 754 #define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v) 755 #define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v) 756 #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v)) 757 #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v)) 758 759 /* 10.3 MTE intrinsics */ 760 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 761 #define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) 762 #define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) 763 #define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded) 764 #define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr) 765 #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr) 766 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) 767 768 /* 18 memcpy family of operations intrinsics - MOPS */ 769 #define __arm_mops_memset_tag(__tagged_address, __value, __size) \ 770 __builtin_arm_mops_memset_tag(__tagged_address, __value, __size) 771 #endif 772 773 /* 11.3 Coprocessor Intrinsics */ 774 #if defined(__ARM_FEATURE_COPROC) 775 776 #if (__ARM_FEATURE_COPROC & 0x1) 777 778 #if (__ARM_ARCH < 8) 779 #define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \ 780 __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) 781 #endif /* __ARM_ARCH < 8 */ 782 783 #define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p) 784 #define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p) 785 786 #define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \ 787 __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2) 788 #define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \ 789 __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2) 790 791 #if (__ARM_ARCH != 4) && (__ARM_ARCH < 8) 792 #define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p) 793 #define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p) 794 #endif /* (__ARM_ARCH != 4) && (__ARM_ARCH != 8) */ 795 796 #if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__) 797 #define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \ 798 __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) 799 #define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p) 800 #define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p) 801 #endif /* ___ARM_ARCH_8M_MAIN__ */ 802 803 #endif /* __ARM_FEATURE_COPROC & 0x1 */ 804 805 #if (__ARM_FEATURE_COPROC & 0x2) 806 #define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \ 807 __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) 808 #define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p) 809 #define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p) 810 #define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p) 811 #define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p) 812 #define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \ 813 __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) 814 #define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \ 815 __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2) 816 #endif 817 818 #if (__ARM_FEATURE_COPROC & 0x4) 819 #define __arm_mcrr(coproc, opc1, value, CRm) \ 820 __builtin_arm_mcrr(coproc, opc1, value, CRm) 821 #define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm) 822 #endif 823 824 #if (__ARM_FEATURE_COPROC & 0x8) 825 #define __arm_mcrr2(coproc, opc1, value, CRm) \ 826 __builtin_arm_mcrr2(coproc, opc1, value, CRm) 827 #define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm) 828 #endif 829 830 #endif // __ARM_FEATURE_COPROC 831 832 /* 17 Transactional Memory Extension (TME) Intrinsics */ 833 #if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME 834 835 #define _TMFAILURE_REASON 0x00007fffu 836 #define _TMFAILURE_RTRY 0x00008000u 837 #define _TMFAILURE_CNCL 0x00010000u 838 #define _TMFAILURE_MEM 0x00020000u 839 #define _TMFAILURE_IMP 0x00040000u 840 #define _TMFAILURE_ERR 0x00080000u 841 #define _TMFAILURE_SIZE 0x00100000u 842 #define _TMFAILURE_NEST 0x00200000u 843 #define _TMFAILURE_DBG 0x00400000u 844 #define _TMFAILURE_INT 0x00800000u 845 #define _TMFAILURE_TRIVIAL 0x01000000u 846 847 #define __tstart() __builtin_arm_tstart() 848 #define __tcommit() __builtin_arm_tcommit() 849 #define __tcancel(__arg) __builtin_arm_tcancel(__arg) 850 #define __ttest() __builtin_arm_ttest() 851 852 #endif /* __ARM_FEATURE_TME */ 853 854 /* 8.7 Armv8.5-A Random number generation intrinsics */ 855 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 856 static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand"))) 857 __rndr(uint64_t *__p) { 858 return __builtin_arm_rndr(__p); 859 } 860 static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand"))) 861 __rndrrs(uint64_t *__p) { 862 return __builtin_arm_rndrrs(__p); 863 } 864 #endif 865 866 /* 11.2 Guarded Control Stack intrinsics */ 867 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE 868 static __inline__ void * __attribute__((__always_inline__, __nodebug__)) 869 __gcspr() { 870 return (void *)__builtin_arm_rsr64("gcspr_el0"); 871 } 872 873 static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs"))) 874 __gcspopm() { 875 return __builtin_arm_gcspopm(0); 876 } 877 878 static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs"))) 879 __gcsss(const void *__stack) { 880 return __builtin_arm_gcsss(__stack); 881 } 882 #endif 883 884 #if defined(__cplusplus) 885 } 886 #endif 887 888 #endif /* __ARM_ACLE_H */ 889