10b57cec5SDimitry Andric /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
20b57cec5SDimitry Andric *
30b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric *
70b57cec5SDimitry Andric *===-----------------------------------------------------------------------===
80b57cec5SDimitry Andric */
90b57cec5SDimitry Andric #ifndef __CLANG_CUDA_INTRINSICS_H__
100b57cec5SDimitry Andric #define __CLANG_CUDA_INTRINSICS_H__
110b57cec5SDimitry Andric #ifndef __CUDA__
120b57cec5SDimitry Andric #error "This file is for CUDA compilation only."
130b57cec5SDimitry Andric #endif
140b57cec5SDimitry Andric
150b57cec5SDimitry Andric // sm_30 intrinsics: __shfl_{up,down,xor}.
160b57cec5SDimitry Andric
170b57cec5SDimitry Andric #define __SM_30_INTRINSICS_H__
180b57cec5SDimitry Andric #define __SM_30_INTRINSICS_HPP__
190b57cec5SDimitry Andric
200b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
210b57cec5SDimitry Andric
220b57cec5SDimitry Andric #pragma push_macro("__MAKE_SHUFFLES")
230b57cec5SDimitry Andric #define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
240b57cec5SDimitry Andric __Type) \
250b57cec5SDimitry Andric inline __device__ int __FnName(int __val, __Type __offset, \
260b57cec5SDimitry Andric int __width = warpSize) { \
270b57cec5SDimitry Andric return __IntIntrinsic(__val, __offset, \
280b57cec5SDimitry Andric ((warpSize - __width) << 8) | (__Mask)); \
290b57cec5SDimitry Andric } \
300b57cec5SDimitry Andric inline __device__ float __FnName(float __val, __Type __offset, \
310b57cec5SDimitry Andric int __width = warpSize) { \
320b57cec5SDimitry Andric return __FloatIntrinsic(__val, __offset, \
330b57cec5SDimitry Andric ((warpSize - __width) << 8) | (__Mask)); \
340b57cec5SDimitry Andric } \
350b57cec5SDimitry Andric inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
360b57cec5SDimitry Andric int __width = warpSize) { \
370b57cec5SDimitry Andric return static_cast<unsigned int>( \
380b57cec5SDimitry Andric ::__FnName(static_cast<int>(__val), __offset, __width)); \
390b57cec5SDimitry Andric } \
400b57cec5SDimitry Andric inline __device__ long long __FnName(long long __val, __Type __offset, \
410b57cec5SDimitry Andric int __width = warpSize) { \
420b57cec5SDimitry Andric struct __Bits { \
430b57cec5SDimitry Andric int __a, __b; \
440b57cec5SDimitry Andric }; \
450b57cec5SDimitry Andric _Static_assert(sizeof(__val) == sizeof(__Bits)); \
460b57cec5SDimitry Andric _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
470b57cec5SDimitry Andric __Bits __tmp; \
4813138422SDimitry Andric memcpy(&__tmp, &__val, sizeof(__val)); \
490b57cec5SDimitry Andric __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
500b57cec5SDimitry Andric __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
510b57cec5SDimitry Andric long long __ret; \
520b57cec5SDimitry Andric memcpy(&__ret, &__tmp, sizeof(__tmp)); \
530b57cec5SDimitry Andric return __ret; \
540b57cec5SDimitry Andric } \
550b57cec5SDimitry Andric inline __device__ long __FnName(long __val, __Type __offset, \
560b57cec5SDimitry Andric int __width = warpSize) { \
570b57cec5SDimitry Andric _Static_assert(sizeof(long) == sizeof(long long) || \
580b57cec5SDimitry Andric sizeof(long) == sizeof(int)); \
590b57cec5SDimitry Andric if (sizeof(long) == sizeof(long long)) { \
600b57cec5SDimitry Andric return static_cast<long>( \
610b57cec5SDimitry Andric ::__FnName(static_cast<long long>(__val), __offset, __width)); \
620b57cec5SDimitry Andric } else if (sizeof(long) == sizeof(int)) { \
630b57cec5SDimitry Andric return static_cast<long>( \
640b57cec5SDimitry Andric ::__FnName(static_cast<int>(__val), __offset, __width)); \
650b57cec5SDimitry Andric } \
660b57cec5SDimitry Andric } \
670b57cec5SDimitry Andric inline __device__ unsigned long __FnName( \
680b57cec5SDimitry Andric unsigned long __val, __Type __offset, int __width = warpSize) { \
690b57cec5SDimitry Andric return static_cast<unsigned long>( \
700b57cec5SDimitry Andric ::__FnName(static_cast<long>(__val), __offset, __width)); \
710b57cec5SDimitry Andric } \
720b57cec5SDimitry Andric inline __device__ unsigned long long __FnName( \
730b57cec5SDimitry Andric unsigned long long __val, __Type __offset, int __width = warpSize) { \
74fcaf7f86SDimitry Andric return static_cast<unsigned long long>( \
75fcaf7f86SDimitry Andric ::__FnName(static_cast<long long>(__val), __offset, __width)); \
760b57cec5SDimitry Andric } \
770b57cec5SDimitry Andric inline __device__ double __FnName(double __val, __Type __offset, \
780b57cec5SDimitry Andric int __width = warpSize) { \
790b57cec5SDimitry Andric long long __tmp; \
800b57cec5SDimitry Andric _Static_assert(sizeof(__tmp) == sizeof(__val)); \
810b57cec5SDimitry Andric memcpy(&__tmp, &__val, sizeof(__val)); \
820b57cec5SDimitry Andric __tmp = ::__FnName(__tmp, __offset, __width); \
830b57cec5SDimitry Andric double __ret; \
840b57cec5SDimitry Andric memcpy(&__ret, &__tmp, sizeof(__ret)); \
850b57cec5SDimitry Andric return __ret; \
860b57cec5SDimitry Andric }
870b57cec5SDimitry Andric
880b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
890b57cec5SDimitry Andric // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
900b57cec5SDimitry Andric // maxLane.
910b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
920b57cec5SDimitry Andric unsigned int);
930b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
940b57cec5SDimitry Andric unsigned int);
950b57cec5SDimitry Andric __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
960b57cec5SDimitry Andric int);
970b57cec5SDimitry Andric #pragma pop_macro("__MAKE_SHUFFLES")
980b57cec5SDimitry Andric
990b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
1000b57cec5SDimitry Andric
1010b57cec5SDimitry Andric #if CUDA_VERSION >= 9000
1020b57cec5SDimitry Andric #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
1030b57cec5SDimitry Andric // __shfl_sync_* variants available in CUDA-9
1040b57cec5SDimitry Andric #pragma push_macro("__MAKE_SYNC_SHUFFLES")
1050b57cec5SDimitry Andric #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
1060b57cec5SDimitry Andric __Mask, __Type) \
1070b57cec5SDimitry Andric inline __device__ int __FnName(unsigned int __mask, int __val, \
1080b57cec5SDimitry Andric __Type __offset, int __width = warpSize) { \
1090b57cec5SDimitry Andric return __IntIntrinsic(__mask, __val, __offset, \
1100b57cec5SDimitry Andric ((warpSize - __width) << 8) | (__Mask)); \
1110b57cec5SDimitry Andric } \
1120b57cec5SDimitry Andric inline __device__ float __FnName(unsigned int __mask, float __val, \
1130b57cec5SDimitry Andric __Type __offset, int __width = warpSize) { \
1140b57cec5SDimitry Andric return __FloatIntrinsic(__mask, __val, __offset, \
1150b57cec5SDimitry Andric ((warpSize - __width) << 8) | (__Mask)); \
1160b57cec5SDimitry Andric } \
1170b57cec5SDimitry Andric inline __device__ unsigned int __FnName(unsigned int __mask, \
1180b57cec5SDimitry Andric unsigned int __val, __Type __offset, \
1190b57cec5SDimitry Andric int __width = warpSize) { \
1200b57cec5SDimitry Andric return static_cast<unsigned int>( \
1210b57cec5SDimitry Andric ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
1220b57cec5SDimitry Andric } \
1230b57cec5SDimitry Andric inline __device__ long long __FnName(unsigned int __mask, long long __val, \
1240b57cec5SDimitry Andric __Type __offset, \
1250b57cec5SDimitry Andric int __width = warpSize) { \
1260b57cec5SDimitry Andric struct __Bits { \
1270b57cec5SDimitry Andric int __a, __b; \
1280b57cec5SDimitry Andric }; \
1290b57cec5SDimitry Andric _Static_assert(sizeof(__val) == sizeof(__Bits)); \
1300b57cec5SDimitry Andric _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
1310b57cec5SDimitry Andric __Bits __tmp; \
13213138422SDimitry Andric memcpy(&__tmp, &__val, sizeof(__val)); \
1330b57cec5SDimitry Andric __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
1340b57cec5SDimitry Andric __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
1350b57cec5SDimitry Andric long long __ret; \
1360b57cec5SDimitry Andric memcpy(&__ret, &__tmp, sizeof(__tmp)); \
1370b57cec5SDimitry Andric return __ret; \
1380b57cec5SDimitry Andric } \
1390b57cec5SDimitry Andric inline __device__ unsigned long long __FnName( \
1400b57cec5SDimitry Andric unsigned int __mask, unsigned long long __val, __Type __offset, \
1410b57cec5SDimitry Andric int __width = warpSize) { \
142fcaf7f86SDimitry Andric return static_cast<unsigned long long>( \
143fcaf7f86SDimitry Andric ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
1440b57cec5SDimitry Andric } \
1450b57cec5SDimitry Andric inline __device__ long __FnName(unsigned int __mask, long __val, \
1460b57cec5SDimitry Andric __Type __offset, int __width = warpSize) { \
1470b57cec5SDimitry Andric _Static_assert(sizeof(long) == sizeof(long long) || \
1480b57cec5SDimitry Andric sizeof(long) == sizeof(int)); \
1490b57cec5SDimitry Andric if (sizeof(long) == sizeof(long long)) { \
1500b57cec5SDimitry Andric return static_cast<long>(::__FnName( \
1510b57cec5SDimitry Andric __mask, static_cast<long long>(__val), __offset, __width)); \
1520b57cec5SDimitry Andric } else if (sizeof(long) == sizeof(int)) { \
1530b57cec5SDimitry Andric return static_cast<long>( \
1540b57cec5SDimitry Andric ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
1550b57cec5SDimitry Andric } \
1560b57cec5SDimitry Andric } \
1570b57cec5SDimitry Andric inline __device__ unsigned long __FnName( \
1580b57cec5SDimitry Andric unsigned int __mask, unsigned long __val, __Type __offset, \
1590b57cec5SDimitry Andric int __width = warpSize) { \
1600b57cec5SDimitry Andric return static_cast<unsigned long>( \
1610b57cec5SDimitry Andric ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
1620b57cec5SDimitry Andric } \
1630b57cec5SDimitry Andric inline __device__ double __FnName(unsigned int __mask, double __val, \
1640b57cec5SDimitry Andric __Type __offset, int __width = warpSize) { \
1650b57cec5SDimitry Andric long long __tmp; \
1660b57cec5SDimitry Andric _Static_assert(sizeof(__tmp) == sizeof(__val)); \
1670b57cec5SDimitry Andric memcpy(&__tmp, &__val, sizeof(__val)); \
1680b57cec5SDimitry Andric __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
1690b57cec5SDimitry Andric double __ret; \
1700b57cec5SDimitry Andric memcpy(&__ret, &__tmp, sizeof(__ret)); \
1710b57cec5SDimitry Andric return __ret; \
1720b57cec5SDimitry Andric }
1730b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
1740b57cec5SDimitry Andric __nvvm_shfl_sync_idx_f32, 0x1f, int);
1750b57cec5SDimitry Andric // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
1760b57cec5SDimitry Andric // maxLane.
1770b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
1780b57cec5SDimitry Andric __nvvm_shfl_sync_up_f32, 0, unsigned int);
1790b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
1800b57cec5SDimitry Andric __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
1810b57cec5SDimitry Andric __MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
1820b57cec5SDimitry Andric __nvvm_shfl_sync_bfly_f32, 0x1f, int);
1830b57cec5SDimitry Andric #pragma pop_macro("__MAKE_SYNC_SHUFFLES")
1840b57cec5SDimitry Andric
1850b57cec5SDimitry Andric inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
1860b57cec5SDimitry Andric return __nvvm_bar_warp_sync(mask);
1870b57cec5SDimitry Andric }
1880b57cec5SDimitry Andric
__barrier_sync(unsigned int id)1890b57cec5SDimitry Andric inline __device__ void __barrier_sync(unsigned int id) {
1900b57cec5SDimitry Andric __nvvm_barrier_sync(id);
1910b57cec5SDimitry Andric }
1920b57cec5SDimitry Andric
__barrier_sync_count(unsigned int id,unsigned int count)1930b57cec5SDimitry Andric inline __device__ void __barrier_sync_count(unsigned int id,
1940b57cec5SDimitry Andric unsigned int count) {
1950b57cec5SDimitry Andric __nvvm_barrier_sync_cnt(id, count);
1960b57cec5SDimitry Andric }
1970b57cec5SDimitry Andric
__all_sync(unsigned int mask,int pred)1980b57cec5SDimitry Andric inline __device__ int __all_sync(unsigned int mask, int pred) {
1990b57cec5SDimitry Andric return __nvvm_vote_all_sync(mask, pred);
2000b57cec5SDimitry Andric }
2010b57cec5SDimitry Andric
__any_sync(unsigned int mask,int pred)2020b57cec5SDimitry Andric inline __device__ int __any_sync(unsigned int mask, int pred) {
2030b57cec5SDimitry Andric return __nvvm_vote_any_sync(mask, pred);
2040b57cec5SDimitry Andric }
2050b57cec5SDimitry Andric
__uni_sync(unsigned int mask,int pred)2060b57cec5SDimitry Andric inline __device__ int __uni_sync(unsigned int mask, int pred) {
2070b57cec5SDimitry Andric return __nvvm_vote_uni_sync(mask, pred);
2080b57cec5SDimitry Andric }
2090b57cec5SDimitry Andric
__ballot_sync(unsigned int mask,int pred)2100b57cec5SDimitry Andric inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
2110b57cec5SDimitry Andric return __nvvm_vote_ballot_sync(mask, pred);
2120b57cec5SDimitry Andric }
2130b57cec5SDimitry Andric
__activemask()214a7dea167SDimitry Andric inline __device__ unsigned int __activemask() {
215a7dea167SDimitry Andric #if CUDA_VERSION < 9020
216a7dea167SDimitry Andric return __nvvm_vote_ballot(1);
217a7dea167SDimitry Andric #else
218*0fca6ea1SDimitry Andric return __nvvm_activemask();
219a7dea167SDimitry Andric #endif
220a7dea167SDimitry Andric }
2210b57cec5SDimitry Andric
__fns(unsigned mask,unsigned base,int offset)2220b57cec5SDimitry Andric inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
2230b57cec5SDimitry Andric return __nvvm_fns(mask, base, offset);
2240b57cec5SDimitry Andric }
2250b57cec5SDimitry Andric
2260b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
2270b57cec5SDimitry Andric
2280b57cec5SDimitry Andric // Define __match* builtins CUDA-9 headers expect to see.
2290b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
__match32_any_sync(unsigned int mask,unsigned int value)2300b57cec5SDimitry Andric inline __device__ unsigned int __match32_any_sync(unsigned int mask,
2310b57cec5SDimitry Andric unsigned int value) {
2320b57cec5SDimitry Andric return __nvvm_match_any_sync_i32(mask, value);
2330b57cec5SDimitry Andric }
2340b57cec5SDimitry Andric
23581ad6265SDimitry Andric inline __device__ unsigned int
__match64_any_sync(unsigned int mask,unsigned long long value)2360b57cec5SDimitry Andric __match64_any_sync(unsigned int mask, unsigned long long value) {
2370b57cec5SDimitry Andric return __nvvm_match_any_sync_i64(mask, value);
2380b57cec5SDimitry Andric }
2390b57cec5SDimitry Andric
2400b57cec5SDimitry Andric inline __device__ unsigned int
__match32_all_sync(unsigned int mask,unsigned int value,int * pred)2410b57cec5SDimitry Andric __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
2420b57cec5SDimitry Andric return __nvvm_match_all_sync_i32p(mask, value, pred);
2430b57cec5SDimitry Andric }
2440b57cec5SDimitry Andric
24581ad6265SDimitry Andric inline __device__ unsigned int
__match64_all_sync(unsigned int mask,unsigned long long value,int * pred)2460b57cec5SDimitry Andric __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
2470b57cec5SDimitry Andric return __nvvm_match_all_sync_i64p(mask, value, pred);
2480b57cec5SDimitry Andric }
2490b57cec5SDimitry Andric #include "crt/sm_70_rt.hpp"
2500b57cec5SDimitry Andric
2510b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2520b57cec5SDimitry Andric #endif // __CUDA_VERSION >= 9000
2530b57cec5SDimitry Andric
2540b57cec5SDimitry Andric // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
2550b57cec5SDimitry Andric
2560b57cec5SDimitry Andric // Prevent the vanilla sm_32 intrinsics header from being included.
2570b57cec5SDimitry Andric #define __SM_32_INTRINSICS_H__
2580b57cec5SDimitry Andric #define __SM_32_INTRINSICS_HPP__
2590b57cec5SDimitry Andric
2600b57cec5SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
2610b57cec5SDimitry Andric
__ldg(const char * ptr)2620b57cec5SDimitry Andric inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
__ldg(const short * ptr)2630b57cec5SDimitry Andric inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
__ldg(const int * ptr)2640b57cec5SDimitry Andric inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
__ldg(const long * ptr)2650b57cec5SDimitry Andric inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
__ldg(const long long * ptr)2660b57cec5SDimitry Andric inline __device__ long long __ldg(const long long *ptr) {
2670b57cec5SDimitry Andric return __nvvm_ldg_ll(ptr);
2680b57cec5SDimitry Andric }
__ldg(const unsigned char * ptr)2690b57cec5SDimitry Andric inline __device__ unsigned char __ldg(const unsigned char *ptr) {
2700b57cec5SDimitry Andric return __nvvm_ldg_uc(ptr);
2710b57cec5SDimitry Andric }
__ldg(const signed char * ptr)2720b57cec5SDimitry Andric inline __device__ signed char __ldg(const signed char *ptr) {
2730b57cec5SDimitry Andric return __nvvm_ldg_uc((const unsigned char *)ptr);
2740b57cec5SDimitry Andric }
__ldg(const unsigned short * ptr)2750b57cec5SDimitry Andric inline __device__ unsigned short __ldg(const unsigned short *ptr) {
2760b57cec5SDimitry Andric return __nvvm_ldg_us(ptr);
2770b57cec5SDimitry Andric }
__ldg(const unsigned int * ptr)2780b57cec5SDimitry Andric inline __device__ unsigned int __ldg(const unsigned int *ptr) {
2790b57cec5SDimitry Andric return __nvvm_ldg_ui(ptr);
2800b57cec5SDimitry Andric }
__ldg(const unsigned long * ptr)2810b57cec5SDimitry Andric inline __device__ unsigned long __ldg(const unsigned long *ptr) {
2820b57cec5SDimitry Andric return __nvvm_ldg_ul(ptr);
2830b57cec5SDimitry Andric }
__ldg(const unsigned long long * ptr)2840b57cec5SDimitry Andric inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
2850b57cec5SDimitry Andric return __nvvm_ldg_ull(ptr);
2860b57cec5SDimitry Andric }
__ldg(const float * ptr)2870b57cec5SDimitry Andric inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
__ldg(const double * ptr)2880b57cec5SDimitry Andric inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
2890b57cec5SDimitry Andric
__ldg(const char2 * ptr)2900b57cec5SDimitry Andric inline __device__ char2 __ldg(const char2 *ptr) {
2910b57cec5SDimitry Andric typedef char c2 __attribute__((ext_vector_type(2)));
2920b57cec5SDimitry Andric // We can assume that ptr is aligned at least to char2's alignment, but the
2930b57cec5SDimitry Andric // load will assume that ptr is aligned to char2's alignment. This is only
2940b57cec5SDimitry Andric // safe if alignof(c2) <= alignof(char2).
2950b57cec5SDimitry Andric c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
2960b57cec5SDimitry Andric char2 ret;
2970b57cec5SDimitry Andric ret.x = rv[0];
2980b57cec5SDimitry Andric ret.y = rv[1];
2990b57cec5SDimitry Andric return ret;
3000b57cec5SDimitry Andric }
__ldg(const char4 * ptr)3010b57cec5SDimitry Andric inline __device__ char4 __ldg(const char4 *ptr) {
3020b57cec5SDimitry Andric typedef char c4 __attribute__((ext_vector_type(4)));
3030b57cec5SDimitry Andric c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
3040b57cec5SDimitry Andric char4 ret;
3050b57cec5SDimitry Andric ret.x = rv[0];
3060b57cec5SDimitry Andric ret.y = rv[1];
3070b57cec5SDimitry Andric ret.z = rv[2];
3080b57cec5SDimitry Andric ret.w = rv[3];
3090b57cec5SDimitry Andric return ret;
3100b57cec5SDimitry Andric }
__ldg(const short2 * ptr)3110b57cec5SDimitry Andric inline __device__ short2 __ldg(const short2 *ptr) {
3120b57cec5SDimitry Andric typedef short s2 __attribute__((ext_vector_type(2)));
3130b57cec5SDimitry Andric s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
3140b57cec5SDimitry Andric short2 ret;
3150b57cec5SDimitry Andric ret.x = rv[0];
3160b57cec5SDimitry Andric ret.y = rv[1];
3170b57cec5SDimitry Andric return ret;
3180b57cec5SDimitry Andric }
__ldg(const short4 * ptr)3190b57cec5SDimitry Andric inline __device__ short4 __ldg(const short4 *ptr) {
3200b57cec5SDimitry Andric typedef short s4 __attribute__((ext_vector_type(4)));
3210b57cec5SDimitry Andric s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
3220b57cec5SDimitry Andric short4 ret;
3230b57cec5SDimitry Andric ret.x = rv[0];
3240b57cec5SDimitry Andric ret.y = rv[1];
3250b57cec5SDimitry Andric ret.z = rv[2];
3260b57cec5SDimitry Andric ret.w = rv[3];
3270b57cec5SDimitry Andric return ret;
3280b57cec5SDimitry Andric }
__ldg(const int2 * ptr)3290b57cec5SDimitry Andric inline __device__ int2 __ldg(const int2 *ptr) {
3300b57cec5SDimitry Andric typedef int i2 __attribute__((ext_vector_type(2)));
3310b57cec5SDimitry Andric i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
3320b57cec5SDimitry Andric int2 ret;
3330b57cec5SDimitry Andric ret.x = rv[0];
3340b57cec5SDimitry Andric ret.y = rv[1];
3350b57cec5SDimitry Andric return ret;
3360b57cec5SDimitry Andric }
__ldg(const int4 * ptr)3370b57cec5SDimitry Andric inline __device__ int4 __ldg(const int4 *ptr) {
3380b57cec5SDimitry Andric typedef int i4 __attribute__((ext_vector_type(4)));
3390b57cec5SDimitry Andric i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
3400b57cec5SDimitry Andric int4 ret;
3410b57cec5SDimitry Andric ret.x = rv[0];
3420b57cec5SDimitry Andric ret.y = rv[1];
3430b57cec5SDimitry Andric ret.z = rv[2];
3440b57cec5SDimitry Andric ret.w = rv[3];
3450b57cec5SDimitry Andric return ret;
3460b57cec5SDimitry Andric }
__ldg(const longlong2 * ptr)3470b57cec5SDimitry Andric inline __device__ longlong2 __ldg(const longlong2 *ptr) {
3480b57cec5SDimitry Andric typedef long long ll2 __attribute__((ext_vector_type(2)));
3490b57cec5SDimitry Andric ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
3500b57cec5SDimitry Andric longlong2 ret;
3510b57cec5SDimitry Andric ret.x = rv[0];
3520b57cec5SDimitry Andric ret.y = rv[1];
3530b57cec5SDimitry Andric return ret;
3540b57cec5SDimitry Andric }
3550b57cec5SDimitry Andric
__ldg(const uchar2 * ptr)3560b57cec5SDimitry Andric inline __device__ uchar2 __ldg(const uchar2 *ptr) {
3570b57cec5SDimitry Andric typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
3580b57cec5SDimitry Andric uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
3590b57cec5SDimitry Andric uchar2 ret;
3600b57cec5SDimitry Andric ret.x = rv[0];
3610b57cec5SDimitry Andric ret.y = rv[1];
3620b57cec5SDimitry Andric return ret;
3630b57cec5SDimitry Andric }
__ldg(const uchar4 * ptr)3640b57cec5SDimitry Andric inline __device__ uchar4 __ldg(const uchar4 *ptr) {
3650b57cec5SDimitry Andric typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
3660b57cec5SDimitry Andric uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
3670b57cec5SDimitry Andric uchar4 ret;
3680b57cec5SDimitry Andric ret.x = rv[0];
3690b57cec5SDimitry Andric ret.y = rv[1];
3700b57cec5SDimitry Andric ret.z = rv[2];
3710b57cec5SDimitry Andric ret.w = rv[3];
3720b57cec5SDimitry Andric return ret;
3730b57cec5SDimitry Andric }
__ldg(const ushort2 * ptr)3740b57cec5SDimitry Andric inline __device__ ushort2 __ldg(const ushort2 *ptr) {
3750b57cec5SDimitry Andric typedef unsigned short us2 __attribute__((ext_vector_type(2)));
3760b57cec5SDimitry Andric us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
3770b57cec5SDimitry Andric ushort2 ret;
3780b57cec5SDimitry Andric ret.x = rv[0];
3790b57cec5SDimitry Andric ret.y = rv[1];
3800b57cec5SDimitry Andric return ret;
3810b57cec5SDimitry Andric }
__ldg(const ushort4 * ptr)3820b57cec5SDimitry Andric inline __device__ ushort4 __ldg(const ushort4 *ptr) {
3830b57cec5SDimitry Andric typedef unsigned short us4 __attribute__((ext_vector_type(4)));
3840b57cec5SDimitry Andric us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
3850b57cec5SDimitry Andric ushort4 ret;
3860b57cec5SDimitry Andric ret.x = rv[0];
3870b57cec5SDimitry Andric ret.y = rv[1];
3880b57cec5SDimitry Andric ret.z = rv[2];
3890b57cec5SDimitry Andric ret.w = rv[3];
3900b57cec5SDimitry Andric return ret;
3910b57cec5SDimitry Andric }
__ldg(const uint2 * ptr)3920b57cec5SDimitry Andric inline __device__ uint2 __ldg(const uint2 *ptr) {
3930b57cec5SDimitry Andric typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
3940b57cec5SDimitry Andric ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
3950b57cec5SDimitry Andric uint2 ret;
3960b57cec5SDimitry Andric ret.x = rv[0];
3970b57cec5SDimitry Andric ret.y = rv[1];
3980b57cec5SDimitry Andric return ret;
3990b57cec5SDimitry Andric }
__ldg(const uint4 * ptr)4000b57cec5SDimitry Andric inline __device__ uint4 __ldg(const uint4 *ptr) {
4010b57cec5SDimitry Andric typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
4020b57cec5SDimitry Andric ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
4030b57cec5SDimitry Andric uint4 ret;
4040b57cec5SDimitry Andric ret.x = rv[0];
4050b57cec5SDimitry Andric ret.y = rv[1];
4060b57cec5SDimitry Andric ret.z = rv[2];
4070b57cec5SDimitry Andric ret.w = rv[3];
4080b57cec5SDimitry Andric return ret;
4090b57cec5SDimitry Andric }
__ldg(const ulonglong2 * ptr)4100b57cec5SDimitry Andric inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
4110b57cec5SDimitry Andric typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
4120b57cec5SDimitry Andric ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
4130b57cec5SDimitry Andric ulonglong2 ret;
4140b57cec5SDimitry Andric ret.x = rv[0];
4150b57cec5SDimitry Andric ret.y = rv[1];
4160b57cec5SDimitry Andric return ret;
4170b57cec5SDimitry Andric }
4180b57cec5SDimitry Andric
__ldg(const float2 * ptr)4190b57cec5SDimitry Andric inline __device__ float2 __ldg(const float2 *ptr) {
4200b57cec5SDimitry Andric typedef float f2 __attribute__((ext_vector_type(2)));
4210b57cec5SDimitry Andric f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
4220b57cec5SDimitry Andric float2 ret;
4230b57cec5SDimitry Andric ret.x = rv[0];
4240b57cec5SDimitry Andric ret.y = rv[1];
4250b57cec5SDimitry Andric return ret;
4260b57cec5SDimitry Andric }
__ldg(const float4 * ptr)4270b57cec5SDimitry Andric inline __device__ float4 __ldg(const float4 *ptr) {
4280b57cec5SDimitry Andric typedef float f4 __attribute__((ext_vector_type(4)));
4290b57cec5SDimitry Andric f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
4300b57cec5SDimitry Andric float4 ret;
4310b57cec5SDimitry Andric ret.x = rv[0];
4320b57cec5SDimitry Andric ret.y = rv[1];
4330b57cec5SDimitry Andric ret.z = rv[2];
4340b57cec5SDimitry Andric ret.w = rv[3];
4350b57cec5SDimitry Andric return ret;
4360b57cec5SDimitry Andric }
__ldg(const double2 * ptr)4370b57cec5SDimitry Andric inline __device__ double2 __ldg(const double2 *ptr) {
4380b57cec5SDimitry Andric typedef double d2 __attribute__((ext_vector_type(2)));
4390b57cec5SDimitry Andric d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
4400b57cec5SDimitry Andric double2 ret;
4410b57cec5SDimitry Andric ret.x = rv[0];
4420b57cec5SDimitry Andric ret.y = rv[1];
4430b57cec5SDimitry Andric return ret;
4440b57cec5SDimitry Andric }
4450b57cec5SDimitry Andric
4460b57cec5SDimitry Andric // TODO: Implement these as intrinsics, so the backend can work its magic on
4470b57cec5SDimitry Andric // these. Alternatively, we could implement these as plain C and try to get
4480b57cec5SDimitry Andric // llvm to recognize the relevant patterns.
__funnelshift_l(unsigned low32,unsigned high32,unsigned shiftWidth)4490b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
4500b57cec5SDimitry Andric unsigned shiftWidth) {
4510b57cec5SDimitry Andric unsigned result;
4520b57cec5SDimitry Andric asm("shf.l.wrap.b32 %0, %1, %2, %3;"
4530b57cec5SDimitry Andric : "=r"(result)
4540b57cec5SDimitry Andric : "r"(low32), "r"(high32), "r"(shiftWidth));
4550b57cec5SDimitry Andric return result;
4560b57cec5SDimitry Andric }
__funnelshift_lc(unsigned low32,unsigned high32,unsigned shiftWidth)4570b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
4580b57cec5SDimitry Andric unsigned shiftWidth) {
4590b57cec5SDimitry Andric unsigned result;
4600b57cec5SDimitry Andric asm("shf.l.clamp.b32 %0, %1, %2, %3;"
4610b57cec5SDimitry Andric : "=r"(result)
4620b57cec5SDimitry Andric : "r"(low32), "r"(high32), "r"(shiftWidth));
4630b57cec5SDimitry Andric return result;
4640b57cec5SDimitry Andric }
__funnelshift_r(unsigned low32,unsigned high32,unsigned shiftWidth)4650b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
4660b57cec5SDimitry Andric unsigned shiftWidth) {
4670b57cec5SDimitry Andric unsigned result;
4680b57cec5SDimitry Andric asm("shf.r.wrap.b32 %0, %1, %2, %3;"
4690b57cec5SDimitry Andric : "=r"(result)
4700b57cec5SDimitry Andric : "r"(low32), "r"(high32), "r"(shiftWidth));
4710b57cec5SDimitry Andric return result;
4720b57cec5SDimitry Andric }
__funnelshift_rc(unsigned low32,unsigned high32,unsigned shiftWidth)4730b57cec5SDimitry Andric inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
4740b57cec5SDimitry Andric unsigned shiftWidth) {
4750b57cec5SDimitry Andric unsigned ret;
4760b57cec5SDimitry Andric asm("shf.r.clamp.b32 %0, %1, %2, %3;"
4770b57cec5SDimitry Andric : "=r"(ret)
4780b57cec5SDimitry Andric : "r"(low32), "r"(high32), "r"(shiftWidth));
4790b57cec5SDimitry Andric return ret;
4800b57cec5SDimitry Andric }
4810b57cec5SDimitry Andric
4820b57cec5SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
4830b57cec5SDimitry Andric
484349cc55cSDimitry Andric #if CUDA_VERSION >= 11000
485349cc55cSDimitry Andric extern "C" {
__nv_cvta_generic_to_global_impl(const void * __ptr)486349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
487349cc55cSDimitry Andric return (size_t)(void __attribute__((address_space(1))) *)__ptr;
488349cc55cSDimitry Andric }
__nv_cvta_generic_to_shared_impl(const void * __ptr)489349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
490349cc55cSDimitry Andric return (size_t)(void __attribute__((address_space(3))) *)__ptr;
491349cc55cSDimitry Andric }
__nv_cvta_generic_to_constant_impl(const void * __ptr)492349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
493349cc55cSDimitry Andric return (size_t)(void __attribute__((address_space(4))) *)__ptr;
494349cc55cSDimitry Andric }
__nv_cvta_generic_to_local_impl(const void * __ptr)495349cc55cSDimitry Andric __device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
496349cc55cSDimitry Andric return (size_t)(void __attribute__((address_space(5))) *)__ptr;
497349cc55cSDimitry Andric }
__nv_cvta_global_to_generic_impl(size_t __ptr)498349cc55cSDimitry Andric __device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
499349cc55cSDimitry Andric return (void *)(void __attribute__((address_space(1))) *)__ptr;
500349cc55cSDimitry Andric }
__nv_cvta_shared_to_generic_impl(size_t __ptr)501349cc55cSDimitry Andric __device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
502349cc55cSDimitry Andric return (void *)(void __attribute__((address_space(3))) *)__ptr;
503349cc55cSDimitry Andric }
__nv_cvta_constant_to_generic_impl(size_t __ptr)504349cc55cSDimitry Andric __device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
505349cc55cSDimitry Andric return (void *)(void __attribute__((address_space(4))) *)__ptr;
506349cc55cSDimitry Andric }
__nv_cvta_local_to_generic_impl(size_t __ptr)507349cc55cSDimitry Andric __device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
508349cc55cSDimitry Andric return (void *)(void __attribute__((address_space(5))) *)__ptr;
509349cc55cSDimitry Andric }
__nvvm_get_smem_pointer(void * __ptr)51056f451bbSDimitry Andric __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
511349cc55cSDimitry Andric return __nv_cvta_generic_to_shared_impl(__ptr);
512349cc55cSDimitry Andric }
513349cc55cSDimitry Andric } // extern "C"
51406c3fb27SDimitry Andric
51506c3fb27SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__reduce_add_sync(unsigned __mask,unsigned __value)51606c3fb27SDimitry Andric __device__ inline unsigned __reduce_add_sync(unsigned __mask,
51706c3fb27SDimitry Andric unsigned __value) {
51806c3fb27SDimitry Andric return __nvvm_redux_sync_add(__mask, __value);
51906c3fb27SDimitry Andric }
__reduce_min_sync(unsigned __mask,unsigned __value)52006c3fb27SDimitry Andric __device__ inline unsigned __reduce_min_sync(unsigned __mask,
52106c3fb27SDimitry Andric unsigned __value) {
52206c3fb27SDimitry Andric return __nvvm_redux_sync_umin(__mask, __value);
52306c3fb27SDimitry Andric }
__reduce_max_sync(unsigned __mask,unsigned __value)52406c3fb27SDimitry Andric __device__ inline unsigned __reduce_max_sync(unsigned __mask,
52506c3fb27SDimitry Andric unsigned __value) {
52606c3fb27SDimitry Andric return __nvvm_redux_sync_umax(__mask, __value);
52706c3fb27SDimitry Andric }
__reduce_min_sync(unsigned __mask,int __value)52806c3fb27SDimitry Andric __device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
52906c3fb27SDimitry Andric return __nvvm_redux_sync_min(__mask, __value);
53006c3fb27SDimitry Andric }
__reduce_max_sync(unsigned __mask,int __value)53106c3fb27SDimitry Andric __device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
53206c3fb27SDimitry Andric return __nvvm_redux_sync_max(__mask, __value);
53306c3fb27SDimitry Andric }
__reduce_or_sync(unsigned __mask,unsigned __value)53406c3fb27SDimitry Andric __device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
53506c3fb27SDimitry Andric return __nvvm_redux_sync_or(__mask, __value);
53606c3fb27SDimitry Andric }
__reduce_and_sync(unsigned __mask,unsigned __value)53706c3fb27SDimitry Andric __device__ inline unsigned __reduce_and_sync(unsigned __mask,
53806c3fb27SDimitry Andric unsigned __value) {
53906c3fb27SDimitry Andric return __nvvm_redux_sync_and(__mask, __value);
54006c3fb27SDimitry Andric }
__reduce_xor_sync(unsigned __mask,unsigned __value)54106c3fb27SDimitry Andric __device__ inline unsigned __reduce_xor_sync(unsigned __mask,
54206c3fb27SDimitry Andric unsigned __value) {
54306c3fb27SDimitry Andric return __nvvm_redux_sync_xor(__mask, __value);
54406c3fb27SDimitry Andric }
54506c3fb27SDimitry Andric
__nv_memcpy_async_shared_global_4(void * __dst,const void * __src,unsigned __src_size)54606c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
54706c3fb27SDimitry Andric const void *__src,
54806c3fb27SDimitry Andric unsigned __src_size) {
54906c3fb27SDimitry Andric __nvvm_cp_async_ca_shared_global_4(
55006c3fb27SDimitry Andric (void __attribute__((address_space(3))) *)__dst,
55106c3fb27SDimitry Andric (const void __attribute__((address_space(1))) *)__src, __src_size);
55206c3fb27SDimitry Andric }
__nv_memcpy_async_shared_global_8(void * __dst,const void * __src,unsigned __src_size)55306c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
55406c3fb27SDimitry Andric const void *__src,
55506c3fb27SDimitry Andric unsigned __src_size) {
55606c3fb27SDimitry Andric __nvvm_cp_async_ca_shared_global_8(
55706c3fb27SDimitry Andric (void __attribute__((address_space(3))) *)__dst,
55806c3fb27SDimitry Andric (const void __attribute__((address_space(1))) *)__src, __src_size);
55906c3fb27SDimitry Andric }
__nv_memcpy_async_shared_global_16(void * __dst,const void * __src,unsigned __src_size)56006c3fb27SDimitry Andric __device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
56106c3fb27SDimitry Andric const void *__src,
56206c3fb27SDimitry Andric unsigned __src_size) {
56306c3fb27SDimitry Andric __nvvm_cp_async_ca_shared_global_16(
56406c3fb27SDimitry Andric (void __attribute__((address_space(3))) *)__dst,
56506c3fb27SDimitry Andric (const void __attribute__((address_space(1))) *)__src, __src_size);
56606c3fb27SDimitry Andric }
56706c3fb27SDimitry Andric
56806c3fb27SDimitry Andric __device__ inline void *
__nv_associate_access_property(const void * __ptr,unsigned long long __prop)56906c3fb27SDimitry Andric __nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
57006c3fb27SDimitry Andric // TODO: it appears to provide compiler with some sort of a hint. We do not
57106c3fb27SDimitry Andric // know what exactly it is supposed to do. However, CUDA headers suggest that
57206c3fb27SDimitry Andric // just passing through __ptr should not affect correctness. They do so on
57306c3fb27SDimitry Andric // pre-sm80 GPUs where this builtin is not available.
57406c3fb27SDimitry Andric return (void*)__ptr;
57506c3fb27SDimitry Andric }
57606c3fb27SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
57706c3fb27SDimitry Andric
57806c3fb27SDimitry Andric #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__isCtaShared(const void * ptr)57906c3fb27SDimitry Andric __device__ inline unsigned __isCtaShared(const void *ptr) {
58006c3fb27SDimitry Andric return __isShared(ptr);
58106c3fb27SDimitry Andric }
58206c3fb27SDimitry Andric
__isClusterShared(const void * __ptr)58306c3fb27SDimitry Andric __device__ inline unsigned __isClusterShared(const void *__ptr) {
58406c3fb27SDimitry Andric return __nvvm_isspacep_shared_cluster(__ptr);
58506c3fb27SDimitry Andric }
58606c3fb27SDimitry Andric
__cluster_map_shared_rank(const void * __ptr,unsigned __rank)58706c3fb27SDimitry Andric __device__ inline void *__cluster_map_shared_rank(const void *__ptr,
58806c3fb27SDimitry Andric unsigned __rank) {
58906c3fb27SDimitry Andric return __nvvm_mapa((void *)__ptr, __rank);
59006c3fb27SDimitry Andric }
59106c3fb27SDimitry Andric
__cluster_query_shared_rank(const void * __ptr)59206c3fb27SDimitry Andric __device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
59306c3fb27SDimitry Andric return __nvvm_getctarank((void *)__ptr);
59406c3fb27SDimitry Andric }
59506c3fb27SDimitry Andric
59606c3fb27SDimitry Andric __device__ inline uint2
__cluster_map_shared_multicast(const void * __ptr,unsigned int __cluster_cta_mask)59706c3fb27SDimitry Andric __cluster_map_shared_multicast(const void *__ptr,
59806c3fb27SDimitry Andric unsigned int __cluster_cta_mask) {
59906c3fb27SDimitry Andric return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
60006c3fb27SDimitry Andric __cluster_cta_mask);
60106c3fb27SDimitry Andric }
60206c3fb27SDimitry Andric
__clusterDimIsSpecified()60306c3fb27SDimitry Andric __device__ inline unsigned __clusterDimIsSpecified() {
60406c3fb27SDimitry Andric return __nvvm_is_explicit_cluster();
60506c3fb27SDimitry Andric }
60606c3fb27SDimitry Andric
__clusterDim()60706c3fb27SDimitry Andric __device__ inline dim3 __clusterDim() {
60806c3fb27SDimitry Andric return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
60906c3fb27SDimitry Andric __nvvm_read_ptx_sreg_cluster_nctaid_y(),
61006c3fb27SDimitry Andric __nvvm_read_ptx_sreg_cluster_nctaid_z());
61106c3fb27SDimitry Andric }
61206c3fb27SDimitry Andric
__clusterRelativeBlockIdx()61306c3fb27SDimitry Andric __device__ inline dim3 __clusterRelativeBlockIdx() {
61406c3fb27SDimitry Andric return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
61506c3fb27SDimitry Andric __nvvm_read_ptx_sreg_cluster_ctaid_y(),
61606c3fb27SDimitry Andric __nvvm_read_ptx_sreg_cluster_ctaid_z());
61706c3fb27SDimitry Andric }
61806c3fb27SDimitry Andric
__clusterGridDimInClusters()61906c3fb27SDimitry Andric __device__ inline dim3 __clusterGridDimInClusters() {
62006c3fb27SDimitry Andric return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
62106c3fb27SDimitry Andric __nvvm_read_ptx_sreg_nclusterid_y(),
62206c3fb27SDimitry Andric __nvvm_read_ptx_sreg_nclusterid_z());
62306c3fb27SDimitry Andric }
62406c3fb27SDimitry Andric
__clusterIdx()62506c3fb27SDimitry Andric __device__ inline dim3 __clusterIdx() {
62606c3fb27SDimitry Andric return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
62706c3fb27SDimitry Andric __nvvm_read_ptx_sreg_clusterid_y(),
62806c3fb27SDimitry Andric __nvvm_read_ptx_sreg_clusterid_z());
62906c3fb27SDimitry Andric }
63006c3fb27SDimitry Andric
__clusterRelativeBlockRank()63106c3fb27SDimitry Andric __device__ inline unsigned __clusterRelativeBlockRank() {
63206c3fb27SDimitry Andric return __nvvm_read_ptx_sreg_cluster_ctarank();
63306c3fb27SDimitry Andric }
63406c3fb27SDimitry Andric
__clusterSizeInBlocks()63506c3fb27SDimitry Andric __device__ inline unsigned __clusterSizeInBlocks() {
63606c3fb27SDimitry Andric return __nvvm_read_ptx_sreg_cluster_nctarank();
63706c3fb27SDimitry Andric }
63806c3fb27SDimitry Andric
__cluster_barrier_arrive()63906c3fb27SDimitry Andric __device__ inline void __cluster_barrier_arrive() {
64006c3fb27SDimitry Andric __nvvm_barrier_cluster_arrive();
64106c3fb27SDimitry Andric }
64206c3fb27SDimitry Andric
__cluster_barrier_arrive_relaxed()64306c3fb27SDimitry Andric __device__ inline void __cluster_barrier_arrive_relaxed() {
64406c3fb27SDimitry Andric __nvvm_barrier_cluster_arrive_relaxed();
64506c3fb27SDimitry Andric }
64606c3fb27SDimitry Andric
__cluster_barrier_wait()64706c3fb27SDimitry Andric __device__ inline void __cluster_barrier_wait() {
64806c3fb27SDimitry Andric __nvvm_barrier_cluster_wait();
64906c3fb27SDimitry Andric }
65006c3fb27SDimitry Andric
__threadfence_cluster()65106c3fb27SDimitry Andric __device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
65206c3fb27SDimitry Andric
atomicAdd(float2 * __ptr,float2 __val)65306c3fb27SDimitry Andric __device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
65406c3fb27SDimitry Andric float2 __ret;
65506c3fb27SDimitry Andric __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
65606c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y)
65706c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y));
65806c3fb27SDimitry Andric return __ret;
65906c3fb27SDimitry Andric }
66006c3fb27SDimitry Andric
atomicAdd_block(float2 * __ptr,float2 __val)66106c3fb27SDimitry Andric __device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
66206c3fb27SDimitry Andric float2 __ret;
66306c3fb27SDimitry Andric __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
66406c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y)
66506c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y));
66606c3fb27SDimitry Andric return __ret;
66706c3fb27SDimitry Andric }
66806c3fb27SDimitry Andric
atomicAdd_system(float2 * __ptr,float2 __val)66906c3fb27SDimitry Andric __device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
67006c3fb27SDimitry Andric float2 __ret;
67106c3fb27SDimitry Andric __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
67206c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y)
67306c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y));
67406c3fb27SDimitry Andric return __ret;
67506c3fb27SDimitry Andric }
67606c3fb27SDimitry Andric
atomicAdd(float4 * __ptr,float4 __val)67706c3fb27SDimitry Andric __device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
67806c3fb27SDimitry Andric float4 __ret;
67906c3fb27SDimitry Andric __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
68006c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
68106c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
68206c3fb27SDimitry Andric return __ret;
68306c3fb27SDimitry Andric }
68406c3fb27SDimitry Andric
atomicAdd_block(float4 * __ptr,float4 __val)68506c3fb27SDimitry Andric __device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
68606c3fb27SDimitry Andric float4 __ret;
68706c3fb27SDimitry Andric __asm__(
68806c3fb27SDimitry Andric "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
68906c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
69006c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
69106c3fb27SDimitry Andric return __ret;
69206c3fb27SDimitry Andric }
69306c3fb27SDimitry Andric
atomicAdd_system(float4 * __ptr,float4 __val)69406c3fb27SDimitry Andric __device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
69506c3fb27SDimitry Andric float4 __ret;
69606c3fb27SDimitry Andric __asm__(
69706c3fb27SDimitry Andric "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
69806c3fb27SDimitry Andric : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
69906c3fb27SDimitry Andric : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
70006c3fb27SDimitry Andric :);
70106c3fb27SDimitry Andric return __ret;
70206c3fb27SDimitry Andric }
70306c3fb27SDimitry Andric
70406c3fb27SDimitry Andric #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
705349cc55cSDimitry Andric #endif // CUDA_VERSION >= 11000
706349cc55cSDimitry Andric
7070b57cec5SDimitry Andric #endif // defined(__CLANG_CUDA_INTRINSICS_H__)
708