/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * The Arm C Language Extensions specifications can be found in the following
 * link: https://github.com/ARM-software/acle/releases
 *
 * The ACLE section numbers are subject to change. When consulting the
 * specifications, it is recommended to search using section titles if
 * the section numbers look outdated.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H

#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
#if !__has_builtin(__dmb)
#define __dmb(i) __builtin_arm_dmb(i)
#endif
#if !__has_builtin(__dsb)
#define __dsb(i) __builtin_arm_dsb(i)
#endif
#if !__has_builtin(__isb)
#define __isb(i) __builtin_arm_isb(i)
#endif

/* 7.4 Hints */

#if !__has_builtin(__wfi)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
  __builtin_arm_wfi();
}
#endif

#if !__has_builtin(__wfe)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
  __builtin_arm_wfe();
}
#endif

#if !__has_builtin(__sev)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
  __builtin_arm_sev();
}
#endif

#if !__has_builtin(__sevl)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
  __builtin_arm_sevl();
}
#endif

#if !__has_builtin(__yield)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
  __builtin_arm_yield();
}
#endif

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

/* 7.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t __v;
  do
    __v = __builtin_arm_ldrex(__p);
  while (__builtin_arm_strex(__x, __p));
  return __v;
}

/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif

/* 7.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif

/* 7.7 NOP */
#if !defined(_MSC_VER) || !defined(__aarch64__)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif
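
/*
 * Illustrative sketch (not part of ACLE): __swp together with the barrier
 * and hint intrinsics above can build a simple spinlock on cores without
 * richer atomics. All __example_* names below are hypothetical.
 *
 *   static volatile uint32_t __example_lock = 0;
 *
 *   static void __example_lock_acquire(void) {
 *     while (__swp(1, &__example_lock) != 0)
 *       __wfe();                // sleep until another core signals an event
 *     __dmb(0xF);               // full-system barrier before the critical section
 *   }
 *
 *   static void __example_lock_release(void) {
 *     __dmb(0xF);               // make critical-section writes visible first
 *     __example_lock = 0;
 *     __sev();                  // wake cores waiting in __wfe()
 *   }
 */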

/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  __y %= 32;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (32 - __y));
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  __y %= 64;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (64 - __y));
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}

/* CLZ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}

/* CLS */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}

/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}

/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}

/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  return (int16_t)__builtin_bswap16((uint16_t)__t);
}
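
/*
 * Illustrative sketch (not part of ACLE): __rev performs a 32-bit byte
 * reversal, so on a little-endian core it converts big-endian (network
 * order) data to host order. The helper name is hypothetical.
 *
 *   static uint32_t __example_load_be32(const uint8_t *__buf) {
 *     uint32_t __word;
 *     __builtin_memcpy(&__word, __buf, sizeof(__word));
 *     return __rev(__word);    // byte-swap: big-endian wire -> host order
 *   }
 */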

/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}

/* 8.3 16-bit multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif

/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
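
/*
 * Illustrative sketch (not part of ACLE): the saturating intrinsics clamp
 * on overflow instead of wrapping, which is the usual requirement in
 * fixed-point DSP code. Mixing two Q31 audio samples with __qadd pins the
 * result at INT32_MAX/INT32_MIN rather than wrapping around:
 *
 *   static int32_t __example_mix(int32_t __s0, int32_t __s1) {
 *     return __qadd(__s0, __s1);   // saturates; plain + would wrap
 *   }
 */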

/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif

/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
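
/*
 * Illustrative sketch (not part of ACLE): the SIMD32 types above are plain
 * 32-bit integers viewed as packed lanes. __sxtb16 sign-extends bytes 0
 * and 2 of an int8x4_t into two 16-bit lanes, e.g. to unpack interleaved
 * 8-bit samples for 16-bit processing. The helper name is hypothetical.
 *
 *   static int16x2_t __example_unpack_even(int8x4_t __packed) {
 *     return __sxtb16(__packed);   // lanes: {sx(byte0), sx(byte2)}
 *   }
 */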

/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif

/* 8.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif

/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
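
/*
 * Illustrative sketch (not part of ACLE): __usada8 accumulates the sum of
 * absolute differences across four byte lanes per step, a common kernel in
 * video motion estimation. Assuming the rows are stored as whole 32-bit
 * words; the helper name is hypothetical.
 *
 *   static uint32_t __example_row_sad(const uint8x4_t *__a,
 *                                     const uint8x4_t *__b,
 *                                     unsigned __n_words) {
 *     uint32_t __acc = 0;
 *     for (unsigned __i = 0; __i != __n_words; ++__i)
 *       __acc = __usada8(__a[__i], __b[__i], __acc);  // 4 SADs per call
 *     return __acc;
 *   }
 */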

/* 8.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
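
/*
 * Illustrative sketch (not part of ACLE): the "h" (halving) forms average
 * lanes without intermediate overflow. __uhadd16 computes (a + b) >> 1
 * independently in each 16-bit lane, e.g. to blend two packed pixel pairs.
 * The helper name is hypothetical.
 *
 *   static uint16x2_t __example_average(uint16x2_t __a, uint16x2_t __b) {
 *     return __uhadd16(__a, __b);   // per-lane unsigned halving add
 *   }
 */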

/* 8.5.10 Parallel 16-bit multiplications */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif

/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) &&       \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif

/* 8.8 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
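
/*
 * Illustrative sketch (not part of ACLE): each CRC32 intrinsic consumes an
 * accumulator and a data chunk and returns the updated accumulator. A
 * byte-at-a-time CRC32 over a buffer, with the conventional initial and
 * final inversion, could look like the following (helper name
 * hypothetical; size_t assumes <stddef.h>):
 *
 *   static uint32_t __example_crc32(const uint8_t *__p, size_t __len) {
 *     uint32_t __crc = 0xFFFFFFFFu;
 *     while (__len--)
 *       __crc = __crc32b(__crc, *__p++);  // fold in one byte per step
 *     return ~__crc;
 *   }
 */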

/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif

/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif

/* 11.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
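
/*
 * Illustrative sketch (not part of ACLE): the special register intrinsics
 * take the register name as a string literal, which the compiler lowers to
 * an MRS/MSR access. For example, reading the AArch64 virtual counter
 * (helper name hypothetical):
 *
 *   static uint64_t __example_read_cntvct(void) {
 *     return __arm_rsr64("cntvct_el0");   // MRS <Xt>, CNTVCT_EL0
 *   }
 */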

/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 Memory Operations Intrinsics */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif

/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

#if (__ARM_FEATURE_COPROC & 0x1)

#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC

/* 17 Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */
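
/*
 * Illustrative sketch (not part of ACLE): __tstart() returns 0 when the
 * transaction starts successfully, otherwise a failure status. A typical
 * pattern tests _TMFAILURE_RTRY to decide between retrying and a
 * non-transactional fallback:
 *
 *   uint64_t __status = __tstart();
 *   if (__status == 0) {
 *     // ...transactional accesses...
 *     __tcommit();
 *   } else if (!(__status & _TMFAILURE_RTRY)) {
 *     // fall back (e.g. take a lock); retrying is not expected to succeed
 *   }
 */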

/* 8.7 Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
#endif

#if defined(__cplusplus)
}
#endif

#endif /* __ARM_ACLE_H */