xref: /freebsd/contrib/llvm-project/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp (revision e64bea71c21eb42e97aa615188ba91f6cce0d36d)
1 //===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This contains code to emit Builtin calls as LLVM code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "CGBuiltin.h"
14 #include "clang/Basic/TargetBuiltins.h"
15 #include "llvm/IR/IntrinsicsNVPTX.h"
16 
17 using namespace clang;
18 using namespace CodeGen;
19 using namespace llvm;
20 
21 namespace {
22 // Helper classes for mapping MMA builtins to particular LLVM intrinsic variant.
// Describes one MMA load/store builtin: how many elements it transfers and
// which intrinsic implements each memory layout.
struct NVPTXMmaLdstInfo {
  unsigned NumResults;  // Number of elements to load/store
  // Intrinsic IDs for the col (column-major) / row (row-major) layout
  // variants. 0 if particular layout is unsupported.
  unsigned IID_col;
  unsigned IID_row;
};
29 
// MMA_INTR(geom_op_type, layout) expands to the ID of the strided wmma
// load/store intrinsic for the given geometry/operation/type and layout.
#define MMA_INTR(geom_op_type, layout) \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
// MMA_LDST(n, geom_op_type) builds an NVPTXMmaLdstInfo that transfers n
// elements and supports both the col and row layouts.
#define MMA_LDST(n, geom_op_type)                                              \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }

// Maps an MMA load/store builtin ID to the number of elements it transfers
// and the intrinsic IDs of its col/row layout variants (0 when a layout is
// not supported by the fragment).
static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // Only row/col layout is supported by A/B fragments, so the unsupported
  // layout slot is 0.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR
187 
188 
189 struct NVPTXMmaInfo {
190   unsigned NumEltsA;
191   unsigned NumEltsB;
192   unsigned NumEltsC;
193   unsigned NumEltsD;
194 
195   // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
196   // over 'col' for layout. The index of non-satf variants is expected to match
197   // the undocumented layout constants used by CUDA's mma.hpp.
198   std::array<unsigned, 8> Variants;
199 
200   unsigned getMMAIntrinsic(int Layout, bool Satf) {
201     unsigned Index = Layout + 4 * Satf;
202     if (Index >= Variants.size())
203       return 0;
204     return Variants[Index];
205   }
206 };
207 
// Maps an MMA compute builtin ID to its fragment sizes (NumEltsA..NumEltsD)
// and the table of intrinsic variants. Call getMMAIntrinsic() on the result
// to obtain the intrinsic matching a given Layout/Satf combination; it
// returns 0 for combinations the builtin does not support.
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
#define MMA_VARIANTS(geom, type)                                    \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
#define MMA_SATF_VARIANTS(geom, type)                               \
      MMA_VARIANTS(geom, type),                                     \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout; all other slots are 0.
#define MMA_VARIANTS_I4(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      0, \
      0, \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0, \
      0
// b1 MMA does not support .satfinite, so the satf half of the table is 0.
#define MMA_VARIANTS_B1_XOR(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,             \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
#define MMA_VARIANTS_B1_AND(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,             \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
  // clang-format on
  switch (BuiltinID) {
  // FP MMA
  // Note that 'type' argument of MMA_SATF_VARIANTS uses D_C notation, while
  // NumEltsN of return value are ordered as A,B,C,D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}
328 
329 static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
330                       const CallExpr *E) {
331   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
332   QualType ArgType = E->getArg(0)->getType();
333   clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
334   llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
335   return CGF.Builder.CreateCall(
336       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
337       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
338 }
339 
340 static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
341   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
342   QualType ArgType = E->getArg(0)->getType();
343   clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
344   llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
345 
346   // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
347   auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
348   auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
349   MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
350   LD->setMetadata(LLVMContext::MD_invariant_load, MD);
351 
352   return LD;
353 }
354 
355 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
356                                const CallExpr *E) {
357   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
358   llvm::Type *ElemTy =
359       CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
360   return CGF.Builder.CreateCall(
361       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
362       {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
363 }
364 
365 static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
366                           CodeGenFunction &CGF, const CallExpr *E,
367                           int SrcSize) {
368   return E->getNumArgs() == 3
369              ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
370                                       {CGF.EmitScalarExpr(E->getArg(0)),
371                                        CGF.EmitScalarExpr(E->getArg(1)),
372                                        CGF.EmitScalarExpr(E->getArg(2))})
373              : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
374                                       {CGF.EmitScalarExpr(E->getArg(0)),
375                                        CGF.EmitScalarExpr(E->getArg(1))});
376 }
377 
378 static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
379                            const CallExpr *E, CodeGenFunction &CGF) {
380   auto &C = CGF.CGM.getContext();
381   if (!(C.getLangOpts().NativeHalfType ||
382         !C.getTargetInfo().useFP16ConversionIntrinsics())) {
383     CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
384                                        " requires native half type support.");
385     return nullptr;
386   }
387 
388   if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
389     return MakeLdg(CGF, E);
390 
391   if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
392     return MakeLdu(IntrinsicID, CGF, E);
393 
394   SmallVector<Value *, 16> Args;
395   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
396   auto *FTy = F->getFunctionType();
397   unsigned ICEArguments = 0;
398   ASTContext::GetBuiltinTypeError Error;
399   C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
400   assert(Error == ASTContext::GE_None && "Should not codegen an error");
401   for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
402     assert((ICEArguments & (1 << i)) == 0);
403     auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
404     auto *PTy = FTy->getParamType(i);
405     if (PTy != ArgValue->getType())
406       ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
407     Args.push_back(ArgValue);
408   }
409 
410   return CGF.Builder.CreateCall(F, Args);
411 }
412 } // namespace
413 
414 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
415                                              const CallExpr *E) {
416   switch (BuiltinID) {
417   case NVPTX::BI__nvvm_atom_add_gen_i:
418   case NVPTX::BI__nvvm_atom_add_gen_l:
419   case NVPTX::BI__nvvm_atom_add_gen_ll:
420     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
421 
422   case NVPTX::BI__nvvm_atom_sub_gen_i:
423   case NVPTX::BI__nvvm_atom_sub_gen_l:
424   case NVPTX::BI__nvvm_atom_sub_gen_ll:
425     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
426 
427   case NVPTX::BI__nvvm_atom_and_gen_i:
428   case NVPTX::BI__nvvm_atom_and_gen_l:
429   case NVPTX::BI__nvvm_atom_and_gen_ll:
430     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
431 
432   case NVPTX::BI__nvvm_atom_or_gen_i:
433   case NVPTX::BI__nvvm_atom_or_gen_l:
434   case NVPTX::BI__nvvm_atom_or_gen_ll:
435     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
436 
437   case NVPTX::BI__nvvm_atom_xor_gen_i:
438   case NVPTX::BI__nvvm_atom_xor_gen_l:
439   case NVPTX::BI__nvvm_atom_xor_gen_ll:
440     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
441 
442   case NVPTX::BI__nvvm_atom_xchg_gen_i:
443   case NVPTX::BI__nvvm_atom_xchg_gen_l:
444   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
445     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
446 
447   case NVPTX::BI__nvvm_atom_max_gen_i:
448   case NVPTX::BI__nvvm_atom_max_gen_l:
449   case NVPTX::BI__nvvm_atom_max_gen_ll:
450     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
451 
452   case NVPTX::BI__nvvm_atom_max_gen_ui:
453   case NVPTX::BI__nvvm_atom_max_gen_ul:
454   case NVPTX::BI__nvvm_atom_max_gen_ull:
455     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
456 
457   case NVPTX::BI__nvvm_atom_min_gen_i:
458   case NVPTX::BI__nvvm_atom_min_gen_l:
459   case NVPTX::BI__nvvm_atom_min_gen_ll:
460     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
461 
462   case NVPTX::BI__nvvm_atom_min_gen_ui:
463   case NVPTX::BI__nvvm_atom_min_gen_ul:
464   case NVPTX::BI__nvvm_atom_min_gen_ull:
465     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
466 
467   case NVPTX::BI__nvvm_atom_cas_gen_us:
468   case NVPTX::BI__nvvm_atom_cas_gen_i:
469   case NVPTX::BI__nvvm_atom_cas_gen_l:
470   case NVPTX::BI__nvvm_atom_cas_gen_ll:
471     // __nvvm_atom_cas_gen_* should return the old value rather than the
472     // success flag.
473     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
474 
475   case NVPTX::BI__nvvm_atom_add_gen_f:
476   case NVPTX::BI__nvvm_atom_add_gen_d: {
477     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
478     Value *Val = EmitScalarExpr(E->getArg(1));
479 
480     return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
481                                    AtomicOrdering::SequentiallyConsistent);
482   }
483 
484   case NVPTX::BI__nvvm_atom_inc_gen_ui:
485     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);
486 
487   case NVPTX::BI__nvvm_atom_dec_gen_ui:
488     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
489 
490   case NVPTX::BI__nvvm_ldg_c:
491   case NVPTX::BI__nvvm_ldg_sc:
492   case NVPTX::BI__nvvm_ldg_c2:
493   case NVPTX::BI__nvvm_ldg_sc2:
494   case NVPTX::BI__nvvm_ldg_c4:
495   case NVPTX::BI__nvvm_ldg_sc4:
496   case NVPTX::BI__nvvm_ldg_s:
497   case NVPTX::BI__nvvm_ldg_s2:
498   case NVPTX::BI__nvvm_ldg_s4:
499   case NVPTX::BI__nvvm_ldg_i:
500   case NVPTX::BI__nvvm_ldg_i2:
501   case NVPTX::BI__nvvm_ldg_i4:
502   case NVPTX::BI__nvvm_ldg_l:
503   case NVPTX::BI__nvvm_ldg_l2:
504   case NVPTX::BI__nvvm_ldg_ll:
505   case NVPTX::BI__nvvm_ldg_ll2:
506   case NVPTX::BI__nvvm_ldg_uc:
507   case NVPTX::BI__nvvm_ldg_uc2:
508   case NVPTX::BI__nvvm_ldg_uc4:
509   case NVPTX::BI__nvvm_ldg_us:
510   case NVPTX::BI__nvvm_ldg_us2:
511   case NVPTX::BI__nvvm_ldg_us4:
512   case NVPTX::BI__nvvm_ldg_ui:
513   case NVPTX::BI__nvvm_ldg_ui2:
514   case NVPTX::BI__nvvm_ldg_ui4:
515   case NVPTX::BI__nvvm_ldg_ul:
516   case NVPTX::BI__nvvm_ldg_ul2:
517   case NVPTX::BI__nvvm_ldg_ull:
518   case NVPTX::BI__nvvm_ldg_ull2:
519   case NVPTX::BI__nvvm_ldg_f:
520   case NVPTX::BI__nvvm_ldg_f2:
521   case NVPTX::BI__nvvm_ldg_f4:
522   case NVPTX::BI__nvvm_ldg_d:
523   case NVPTX::BI__nvvm_ldg_d2:
524     // PTX Interoperability section 2.2: "For a vector with an even number of
525     // elements, its alignment is set to number of elements times the alignment
526     // of its member: n*alignof(t)."
527     return MakeLdg(*this, E);
528 
529   case NVPTX::BI__nvvm_ldu_c:
530   case NVPTX::BI__nvvm_ldu_sc:
531   case NVPTX::BI__nvvm_ldu_c2:
532   case NVPTX::BI__nvvm_ldu_sc2:
533   case NVPTX::BI__nvvm_ldu_c4:
534   case NVPTX::BI__nvvm_ldu_sc4:
535   case NVPTX::BI__nvvm_ldu_s:
536   case NVPTX::BI__nvvm_ldu_s2:
537   case NVPTX::BI__nvvm_ldu_s4:
538   case NVPTX::BI__nvvm_ldu_i:
539   case NVPTX::BI__nvvm_ldu_i2:
540   case NVPTX::BI__nvvm_ldu_i4:
541   case NVPTX::BI__nvvm_ldu_l:
542   case NVPTX::BI__nvvm_ldu_l2:
543   case NVPTX::BI__nvvm_ldu_ll:
544   case NVPTX::BI__nvvm_ldu_ll2:
545   case NVPTX::BI__nvvm_ldu_uc:
546   case NVPTX::BI__nvvm_ldu_uc2:
547   case NVPTX::BI__nvvm_ldu_uc4:
548   case NVPTX::BI__nvvm_ldu_us:
549   case NVPTX::BI__nvvm_ldu_us2:
550   case NVPTX::BI__nvvm_ldu_us4:
551   case NVPTX::BI__nvvm_ldu_ui:
552   case NVPTX::BI__nvvm_ldu_ui2:
553   case NVPTX::BI__nvvm_ldu_ui4:
554   case NVPTX::BI__nvvm_ldu_ul:
555   case NVPTX::BI__nvvm_ldu_ul2:
556   case NVPTX::BI__nvvm_ldu_ull:
557   case NVPTX::BI__nvvm_ldu_ull2:
558     return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
559   case NVPTX::BI__nvvm_ldu_f:
560   case NVPTX::BI__nvvm_ldu_f2:
561   case NVPTX::BI__nvvm_ldu_f4:
562   case NVPTX::BI__nvvm_ldu_d:
563   case NVPTX::BI__nvvm_ldu_d2:
564     return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
565 
566   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
567   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
568   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
569     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
570   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
571   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
572   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
573     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
574   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
575   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
576     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
577   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
578   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
579     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
580   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
581   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
582   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
583     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
584   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
585   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
586   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
587     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
588   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
589   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
590   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
591   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
592   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
593   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
594     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
595   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
596   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
597   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
598   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
599   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
600   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
601     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
602   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
603   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
604   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
605   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
606   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
607   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
608     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
609   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
610   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
611   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
612   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
613   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
614   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
615     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
616   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
617     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
618   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
619     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
620   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
621     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
622   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
623     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
624   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
625   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
626   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
627     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
628   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
629   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
630   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
631     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
632   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
633   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
634   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
635     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
636   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
637   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
638   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
639     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
640   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
641   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
642   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
643     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
644   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
645   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
646   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
647     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
648   case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
649   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
650   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
651   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
652     Value *Ptr = EmitScalarExpr(E->getArg(0));
653     llvm::Type *ElemTy =
654         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
655     return Builder.CreateCall(
656         CGM.getIntrinsic(
657             Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
658         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
659   }
660   case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
661   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
662   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
663   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
664     Value *Ptr = EmitScalarExpr(E->getArg(0));
665     llvm::Type *ElemTy =
666         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
667     return Builder.CreateCall(
668         CGM.getIntrinsic(
669             Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
670         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
671   }
672   case NVPTX::BI__nvvm_match_all_sync_i32p:
673   case NVPTX::BI__nvvm_match_all_sync_i64p: {
674     Value *Mask = EmitScalarExpr(E->getArg(0));
675     Value *Val = EmitScalarExpr(E->getArg(1));
676     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
677     Value *ResultPair = Builder.CreateCall(
678         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
679                              ? Intrinsic::nvvm_match_all_sync_i32p
680                              : Intrinsic::nvvm_match_all_sync_i64p),
681         {Mask, Val});
682     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
683                                      PredOutPtr.getElementType());
684     Builder.CreateStore(Pred, PredOutPtr);
685     return Builder.CreateExtractValue(ResultPair, 0);
686   }
687 
  // WMMA/MMA fragment loads: __{h,i,b,d}mma_*_ld_* and __mma_{bf16,tf32}_*_ld_*.
  // Common builtin signature: (dst, src, ldm, is_col_major).  The layout flag
  // must be an integer constant expression; it selects between the _col_ and
  // _row_ stride intrinsic variants (see getNVPTXMmaLdstInfo above).
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
  case NVPTX::BI__hmma_m16n16k16_ld_b:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
  case NVPTX::BI__hmma_m32n8k16_ld_a:
  case NVPTX::BI__hmma_m32n8k16_ld_b:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
  case NVPTX::BI__hmma_m8n32k16_ld_a:
  case NVPTX::BI__hmma_m8n32k16_ld_b:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
  // Integer MMA loads.
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
  case NVPTX::BI__imma_m16n16k16_ld_c:
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
  case NVPTX::BI__imma_m32n8k16_ld_c:
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
  case NVPTX::BI__imma_m8n32k16_ld_c:
  // Sub-integer MMA loads.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
  case NVPTX::BI__imma_m8n8k32_ld_c:
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_c:
  // Double MMA loads.
  case NVPTX::BI__dmma_m8n8k4_ld_a:
  case NVPTX::BI__dmma_m8n8k4_ld_b:
  case NVPTX::BI__dmma_m8n8k4_ld_c:
  // Alternate float MMA loads.
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    // The layout flag must fold to a constant; give up (nullptr) otherwise.
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    // IID == 0 means this builtin has no intrinsic for the requested layout.
    if (IID == 0)
      return nullptr;

    // The intrinsic is overloaded on the source pointer type.
    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      // Single-element fragment: the intrinsic returns a scalar, not an
      // aggregate.
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      // Multi-element fragment: extract each aggregate element, bitcast it to
      // the destination element type, and store it at consecutive indices.
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
  }
773 
  // WMMA/MMA fragment stores: *_st_c_* builtins.
  // Common builtin signature: (dst, src, ldm, is_col_major).  Mirrors the load
  // path above, but reads the fragment elements from memory and passes them as
  // individual intrinsic operands: {dst, elt0..eltN-1, ldm}.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    // The layout flag must fold to a constant; give up (nullptr) otherwise.
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    // IID == 0 means this builtin has no intrinsic for the requested layout.
    if (IID == 0)
      return nullptr;
    // The intrinsic is overloaded on the destination pointer type.
    Function *Intrinsic =
        CGM.getIntrinsic(IID, Dst->getType());
    // Param 1 is the first fragment element; all elements share its type.
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    // Load each fragment element from Src and bitcast it to the type the
    // intrinsic expects.
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    return Result;
  }
815 
  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
  case NVPTX::BI__imma_m16n16k16_mma_s8:
  case NVPTX::BI__imma_m16n16k16_mma_u8:
  case NVPTX::BI__imma_m32n8k16_mma_s8:
  case NVPTX::BI__imma_m32n8k16_mma_u8:
  case NVPTX::BI__imma_m8n32k16_mma_s8:
  case NVPTX::BI__imma_m8n32k16_mma_u8:
  case NVPTX::BI__imma_m8n8k32_mma_s4:
  case NVPTX::BI__imma_m8n8k32_mma_u4:
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    // The layout argument (0..3) selects one of the four row/col combinations
    // for A and B; it must be a compile-time constant in range.
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0;  // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    // getMMAIntrinsic maps (Layout, Satf) to the concrete intrinsic variant.
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0)  // Unsupported combination of Layout/Satf.
      return nullptr;

    // The intrinsic takes all A, B, and C fragment elements as flat scalar
    // operands: NumEltsA + NumEltsB + NumEltsC parameters in that order.
    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    // The D fragment comes back as an aggregate; unpack it into Dst.
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
    return Result;
  }
  // The following builtins require half type support.  Each case forwards to
  // MakeHalfType (defined elsewhere in this file) with the matching NVVM
  // intrinsic; presumably MakeHalfType also validates that the target supports
  // fp16 -- confirm against its definition.
  case NVPTX::BI__nvvm_ex2_approx_f16:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  // fabs family: all non-ftz f32/f16/bf16 variants map to the single
  // overloaded nvvm.fabs intrinsic; the type is inferred from the operand.
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  // Flush-to-zero variants use the distinct nvvm.fabs.ftz intrinsic.
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  // Double fabs needs no NVVM-specific semantics; use the generic llvm.fabs.
  case NVPTX::BI__nvvm_fabs_d:
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  // Half-typed ldg is given not_intrinsic -- presumably MakeHalfType emits a
  // plain load (with !invariant metadata?) in that case; confirm against its
  // definition.
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
  // cp.async builtins: asynchronous global->shared copies of 4/8/16 bytes.
  // Each builtin supplies both the plain intrinsic and its _s (src-size)
  // variant; MakeCpAsync (defined elsewhere) presumably picks one based on
  // whether the optional size argument is present -- confirm against its
  // definition.
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
                       16);
  // Cluster special-register reads (sm_90+ thread-block clusters): each
  // builtin lowers to a zero-argument read of the corresponding PTX sreg.
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
  // Cluster queries, mapa address translation, cluster barriers/fences, and
  // classic CTA barriers.  Each maps 1:1 to the like-named NVVM intrinsic with
  // the builtin's arguments passed straight through.
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  // __nvvm_bar_sync(n) and __syncthreads() both lower to the aligned CTA
  // barrier; __syncthreads is the special case with barrier id 0.
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  // barrier.sync variants lower to the non-aligned CTA barrier intrinsics;
  // the _cnt form additionally takes the expected thread count.
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  // Builtins not handled here are left for generic/unsupported handling by
  // the caller.
  default:
    return nullptr;
  }
}
1183