//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Provides wrappers around the clang builtins for accessing GPU hardware
// features. The interface is intended to be portable between architectures, but
// some targets may provide different implementations. This header can be
// included for all the common GPU programming languages, namely OpenMP, HIP,
// CUDA, and OpenCL.
//
//===----------------------------------------------------------------------===//

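// A minimal usage sketch (illustrative only, not part of this header). The
// wrappers resolve to target builtins, so the same call works from CUDA, HIP,
// OpenCL, or OpenMP device code, e.g. inside an OpenMP target region:
//
//   #include <gpuintrin.h>
//
//   void example(void) {
//   #pragma omp target
//     if (__gpu_thread_id(__GPU_X_DIM) == 0)
//       ; // first thread of each block takes this path
//   }
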
#ifndef __GPUINTRIN_H
#define __GPUINTRIN_H

#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Forward declare a few functions for the implementation header.

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

#define __GPU_X_DIM 0
#define __GPU_Y_DIM 1
#define __GPU_Z_DIM 2

// Returns the number of blocks in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_blocks_x();
  case 1:
    return __gpu_num_blocks_y();
  case 2:
    return __gpu_num_blocks_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the block id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_block_id_x();
  case 1:
    return __gpu_block_id_y();
  case 2:
    return __gpu_block_id_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the number of threads in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_threads_x();
  case 1:
    return __gpu_num_threads_y();
  case 2:
    return __gpu_num_threads_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the thread id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_thread_id_x();
  case 1:
    return __gpu_thread_id_y();
  case 2:
    return __gpu_thread_id_z();
  default:
    __builtin_unreachable();
  }
}

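// Illustrative sketch (not part of the header): the queries above compose
// into a grid-stride loop that covers __n elements for any launch size;
// __n and __out are hypothetical:
//
//   uint32_t __stride = __gpu_num_blocks(__GPU_X_DIM) *
//                       __gpu_num_threads(__GPU_X_DIM);
//   uint32_t __start = __gpu_block_id(__GPU_X_DIM) *
//                          __gpu_num_threads(__GPU_X_DIM) +
//                      __gpu_thread_id(__GPU_X_DIM);
//   for (uint32_t __i = __start; __i < __n; __i += __stride)
//     __out[__i] = __i;
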
// Gets the id of the first active thread in the warp or wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  return __builtin_ffsll(__lane_mask) - 1;
}

// Conditional that is only true for the first active thread in the warp or
// wavefront.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_is_first_in_lane(uint64_t __lane_mask) {
  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}

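// Illustrative sketch: electing a single lane per warp/wavefront to perform a
// side effect such as an atomic update; __counter is a hypothetical
// uint32_t pointer:
//
//   uint64_t __mask = __gpu_lane_mask();
//   if (__gpu_is_first_in_lane(__mask))
//     __atomic_fetch_add(__counter, 1u, __ATOMIC_RELAXED); // one add per warp
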
// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
          0xFFFFFFFFull);
}

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
  return __builtin_bit_cast(
      float, __gpu_read_first_lane_u32(__lane_mask,
                                       __builtin_bit_cast(uint32_t, __x)));
}

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
  return __builtin_bit_cast(
      double, __gpu_read_first_lane_u64(__lane_mask,
                                        __builtin_bit_cast(uint64_t, __x)));
}

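// Illustrative sketch: broadcasting a 64-bit value (here a pointer computed by
// the first active lane) to the whole warp/wavefront; __alloc is hypothetical:
//
//   uint64_t __mask = __gpu_lane_mask();
//   void *__buf = 0;
//   if (__gpu_is_first_in_lane(__mask))
//     __buf = __alloc(256);
//   __buf = (void *)__gpu_read_first_lane_u64(__mask, (uint64_t)__buf);
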
// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  uint32_t __mask = (uint32_t)__lane_mask;
  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
          << 32ull) |
         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
                                   __builtin_bit_cast(uint32_t, __x), __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      double,
      __gpu_shuffle_idx_u64(__lane_mask, __idx,
                            __builtin_bit_cast(uint64_t, __x), __width));
}

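// Illustrative sketch: a butterfly reduction built on the index shuffle, in
// which lane __i reads from lane __i ^ __step each round (assumes a fully
// active warp/wavefront and a per-lane float __x):
//
//   for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2)
//     __x += __gpu_shuffle_idx_f32(__gpu_lane_mask(),
//                                  __gpu_lane_id() ^ __step, __x,
//                                  __gpu_num_lanes());
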
// Gets the inclusive prefix sum (scan) of the threads in the warp or
// wavefront. The scan must use __type throughout: a fixed uint32_t signature
// would truncate the u64, f32, and f64 variants documented below.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(       \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      __type __accum = 0;                                                      \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
        uint32_t __index = __builtin_ctzll(__mask);                            \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                                    __gpu_num_lanes());        \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
        __accum += __tmp;                                                      \
      }                                                                        \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __gpu_lane_id() - __step;                           \
        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
        __x += __builtin_bit_cast(                                             \
            __type,                                                            \
            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
                                          __gpu_shuffle_idx_##__suffix(        \
                                              __lane_mask, __index, __x,       \
                                              __gpu_num_lanes())));            \
      }                                                                        \
    }                                                                          \
    return __x;                                                                \
  }
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN

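// Illustrative sketch: the scan is inclusive, i.e. each lane receives the sum
// of its own value and all lower-numbered active lanes, which gives compact
// output slots; __keep, __out, and __value are hypothetical:
//
//   uint32_t __count = __gpu_lane_scan_u32(__gpu_lane_mask(), __keep ? 1 : 0);
//   if (__keep)
//     __out[__count - 1] = __value; // inclusive result, so subtract one
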
// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      return __gpu_shuffle_idx_##__suffix(                                     \
          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __step + __gpu_lane_id();                           \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
                                            __gpu_num_lanes());                \
      }                                                                        \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
    }                                                                          \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM

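// Illustrative sketch: a warp/wavefront-wide reduction in which every lane
// receives the total; __local_count is hypothetical:
//
//   uint32_t __total = __gpu_lane_sum_u32(__gpu_lane_mask(), __local_count);
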
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
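  // Each round broadcasts the value of the first still-unmatched lane; every
  // lane holding that value records the current convergent mask as its group
  // and drops out of the next ballot.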
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

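// Illustrative sketch: grouping lanes by key so that one lane per group does a
// single combined atomic; the target headers expose this operation as
// __gpu_match_any_u32, and __key/__hist are hypothetical:
//
//   uint64_t __peers = __gpu_match_any_u32(__gpu_lane_mask(), __key);
//   if (__gpu_is_first_in_lane(__peers))
//     __atomic_fetch_add(&__hist[__key], __builtin_popcountll(__peers),
//                        __ATOMIC_RELAXED);
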
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

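// Illustrative sketch: taking a fast path only when every active lane agrees
// on a value; the target headers expose this as __gpu_match_all_u32 (the
// nonzero mask is truthy), and __key is hypothetical:
//
//   if (__gpu_match_all_u32(__gpu_lane_mask(), __key))
//     ; // uniform fast path
//   else
//     ; // divergent slow path
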
_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#undef _DEFAULT_FN_ATTRS

#endif // __GPUINTRIN_H