xref: /freebsd/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp (revision 0d8fe2373503aeac48492f28073049a8bfa4feb5)
1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
10 // runtime library.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCUDARuntime.h"
15 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/ReplaceConstant.h"
25 #include "llvm/Support/Format.h"
26 
27 using namespace clang;
28 using namespace CodeGen;
29 
30 namespace {
31 constexpr unsigned CudaFatMagic = 0x466243b1;
32 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
33 
34 class CGNVCUDARuntime : public CGCUDARuntime {
35 
36 private:
37   llvm::IntegerType *IntTy, *SizeTy;
38   llvm::Type *VoidTy;
39   llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
40 
41   /// Convenience reference to LLVM Context
42   llvm::LLVMContext &Context;
43   /// Convenience reference to the current module
44   llvm::Module &TheModule;
45   /// Keeps track of kernel launch stubs emitted in this module
46   struct KernelInfo {
47     llvm::Function *Kernel;
48     const Decl *D;
49   };
50   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
51   struct VarInfo {
52     llvm::GlobalVariable *Var;
53     const VarDecl *D;
54     DeviceVarFlags Flags;
55   };
56   llvm::SmallVector<VarInfo, 16> DeviceVars;
57   /// Keeps track of variable containing handle of GPU binary. Populated by
58   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
59   /// ModuleDtorFunction()
60   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
61   /// Whether we generate relocatable device code.
62   bool RelocatableDeviceCode;
63   /// Mangle context for device.
64   std::unique_ptr<MangleContext> DeviceMC;
65 
66   llvm::FunctionCallee getSetupArgumentFn() const;
67   llvm::FunctionCallee getLaunchFn() const;
68 
69   llvm::FunctionType *getRegisterGlobalsFnTy() const;
70   llvm::FunctionType *getCallbackFnTy() const;
71   llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
72   std::string addPrefixToName(StringRef FuncName) const;
73   std::string addUnderscoredPrefixToName(StringRef FuncName) const;
74 
75   /// Creates a function to register all kernel stubs generated in this module.
76   llvm::Function *makeRegisterGlobalsFn();
77 
78   /// Helper function that generates a constant string and returns a pointer to
79   /// the start of the string.  The result of this function can be used anywhere
80   /// where the C code specifies const char*.
81   llvm::Constant *makeConstantString(const std::string &Str,
82                                      const std::string &Name = "",
83                                      const std::string &SectionName = "",
84                                      unsigned Alignment = 0) {
85     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
86                                llvm::ConstantInt::get(SizeTy, 0)};
87     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
88     llvm::GlobalVariable *GV =
89         cast<llvm::GlobalVariable>(ConstStr.getPointer());
90     if (!SectionName.empty()) {
91       GV->setSection(SectionName);
92       // Mark the address as used which make sure that this section isn't
93       // merged and we will really have it in the object file.
94       GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
95     }
96     if (Alignment)
97       GV->setAlignment(llvm::Align(Alignment));
98 
99     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
100                                                 ConstStr.getPointer(), Zeros);
101   }
102 
103   /// Helper function that generates an empty dummy function returning void.
104   llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
105     assert(FnTy->getReturnType()->isVoidTy() &&
106            "Can only generate dummy functions returning void!");
107     llvm::Function *DummyFunc = llvm::Function::Create(
108         FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
109 
110     llvm::BasicBlock *DummyBlock =
111         llvm::BasicBlock::Create(Context, "", DummyFunc);
112     CGBuilderTy FuncBuilder(CGM, Context);
113     FuncBuilder.SetInsertPoint(DummyBlock);
114     FuncBuilder.CreateRetVoid();
115 
116     return DummyFunc;
117   }
118 
119   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
120   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
121   std::string getDeviceSideName(const NamedDecl *ND) override;
122 
123 public:
124   CGNVCUDARuntime(CodeGenModule &CGM);
125 
126   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
127   void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
128                          bool Extern, bool Constant) override {
129     DeviceVars.push_back({&Var,
130                           VD,
131                           {DeviceVarFlags::Variable, Extern, Constant,
132                            VD->hasAttr<HIPManagedAttr>(),
133                            /*Normalized*/ false, 0}});
134   }
135   void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
136                           bool Extern, int Type) override {
137     DeviceVars.push_back({&Var,
138                           VD,
139                           {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
140                            /*Managed*/ false,
141                            /*Normalized*/ false, Type}});
142   }
143   void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
144                          bool Extern, int Type, bool Normalized) override {
145     DeviceVars.push_back({&Var,
146                           VD,
147                           {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
148                            /*Managed*/ false, Normalized, Type}});
149   }
150 
151   /// Creates module constructor function
152   llvm::Function *makeModuleCtorFunction() override;
153   /// Creates module destructor function
154   llvm::Function *makeModuleDtorFunction() override;
155 };
156 
157 }
158 
159 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
160   if (CGM.getLangOpts().HIP)
161     return ((Twine("hip") + Twine(FuncName)).str());
162   return ((Twine("cuda") + Twine(FuncName)).str());
163 }
164 std::string
165 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
166   if (CGM.getLangOpts().HIP)
167     return ((Twine("__hip") + Twine(FuncName)).str());
168   return ((Twine("__cuda") + Twine(FuncName)).str());
169 }
170 
171 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
172     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
173       TheModule(CGM.getModule()),
174       RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
175       DeviceMC(CGM.getContext().createMangleContext(
176           CGM.getContext().getAuxTargetInfo())) {
177   CodeGen::CodeGenTypes &Types = CGM.getTypes();
178   ASTContext &Ctx = CGM.getContext();
179 
180   IntTy = CGM.IntTy;
181   SizeTy = CGM.SizeTy;
182   VoidTy = CGM.VoidTy;
183 
184   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
185   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
186   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
187   if (CGM.getContext().getAuxTargetInfo()) {
188     // If the host and device have different C++ ABIs, mark it as the device
189     // mangle context so that the mangling needs to retrieve the additonal
190     // device lambda mangling number instead of the regular host one.
191     DeviceMC->setDeviceMangleContext(
192         CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
193         CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
194   }
195 }
196 
197 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
198   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
199   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
200   return CGM.CreateRuntimeFunction(
201       llvm::FunctionType::get(IntTy, Params, false),
202       addPrefixToName("SetupArgument"));
203 }
204 
205 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
206   if (CGM.getLangOpts().HIP) {
207     // hipError_t hipLaunchByPtr(char *);
208     return CGM.CreateRuntimeFunction(
209         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
210   } else {
211     // cudaError_t cudaLaunch(char *);
212     return CGM.CreateRuntimeFunction(
213         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
214   }
215 }
216 
217 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
218   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
219 }
220 
221 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
222   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
223 }
224 
225 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
226   auto CallbackFnTy = getCallbackFnTy();
227   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
228   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
229                           VoidPtrTy, CallbackFnTy->getPointerTo()};
230   return llvm::FunctionType::get(VoidTy, Params, false);
231 }
232 
233 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
234   GlobalDecl GD;
235   // D could be either a kernel or a variable.
236   if (auto *FD = dyn_cast<FunctionDecl>(ND))
237     GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
238   else
239     GD = GlobalDecl(ND);
240   std::string DeviceSideName;
241   if (DeviceMC->shouldMangleDeclName(ND)) {
242     SmallString<256> Buffer;
243     llvm::raw_svector_ostream Out(Buffer);
244     DeviceMC->mangleName(GD, Out);
245     DeviceSideName = std::string(Out.str());
246   } else
247     DeviceSideName = std::string(ND->getIdentifier()->getName());
248   return DeviceSideName;
249 }
250 
251 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
252                                      FunctionArgList &Args) {
253   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
254   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
255                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
256       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
257     emitDeviceStubBodyNew(CGF, Args);
258   else
259     emitDeviceStubBodyLegacy(CGF, Args);
260 }
261 
262 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
263 // array and kernels are launched using cudaLaunchKernel().
264 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
265                                             FunctionArgList &Args) {
266   // Build the shadow stack entry at the very start of the function.
267 
268   // Calculate amount of space we will need for all arguments.  If we have no
269   // args, allocate a single pointer so we still have a valid pointer to the
270   // argument array that we can pass to runtime, even if it will be unused.
271   Address KernelArgs = CGF.CreateTempAlloca(
272       VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
273       llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
274   // Store pointers to the arguments in a locally allocated launch_args.
275   for (unsigned i = 0; i < Args.size(); ++i) {
276     llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
277     llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
278     CGF.Builder.CreateDefaultAlignedStore(
279         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
280   }
281 
282   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
283 
284   // Lookup cudaLaunchKernel/hipLaunchKernel function.
285   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
286   //                              void **args, size_t sharedMem,
287   //                              cudaStream_t stream);
288   // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
289   //                            void **args, size_t sharedMem,
290   //                            hipStream_t stream);
291   TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
292   DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
293   auto LaunchKernelName = addPrefixToName("LaunchKernel");
294   IdentifierInfo &cudaLaunchKernelII =
295       CGM.getContext().Idents.get(LaunchKernelName);
296   FunctionDecl *cudaLaunchKernelFD = nullptr;
297   for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
298     if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
299       cudaLaunchKernelFD = FD;
300   }
301 
302   if (cudaLaunchKernelFD == nullptr) {
303     CGM.Error(CGF.CurFuncDecl->getLocation(),
304               "Can't find declaration for " + LaunchKernelName);
305     return;
306   }
307   // Create temporary dim3 grid_dim, block_dim.
308   ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
309   QualType Dim3Ty = GridDimParam->getType();
310   Address GridDim =
311       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
312   Address BlockDim =
313       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
314   Address ShmemSize =
315       CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
316   Address Stream =
317       CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
318   llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
319       llvm::FunctionType::get(IntTy,
320                               {/*gridDim=*/GridDim.getType(),
321                                /*blockDim=*/BlockDim.getType(),
322                                /*ShmemSize=*/ShmemSize.getType(),
323                                /*Stream=*/Stream.getType()},
324                               /*isVarArg=*/false),
325       addUnderscoredPrefixToName("PopCallConfiguration"));
326 
327   CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
328                               {GridDim.getPointer(), BlockDim.getPointer(),
329                                ShmemSize.getPointer(), Stream.getPointer()});
330 
331   // Emit the call to cudaLaunch
332   llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
333   CallArgList LaunchKernelArgs;
334   LaunchKernelArgs.add(RValue::get(Kernel),
335                        cudaLaunchKernelFD->getParamDecl(0)->getType());
336   LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
337   LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
338   LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
339                        cudaLaunchKernelFD->getParamDecl(3)->getType());
340   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
341                        cudaLaunchKernelFD->getParamDecl(4)->getType());
342   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
343                        cudaLaunchKernelFD->getParamDecl(5)->getType());
344 
345   QualType QT = cudaLaunchKernelFD->getType();
346   QualType CQT = QT.getCanonicalType();
347   llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
348   llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
349 
350   const CGFunctionInfo &FI =
351       CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
352   llvm::FunctionCallee cudaLaunchKernelFn =
353       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
354   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
355                LaunchKernelArgs);
356   CGF.EmitBranch(EndBlock);
357 
358   CGF.EmitBlock(EndBlock);
359 }
360 
361 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
362                                                FunctionArgList &Args) {
363   // Emit a call to cudaSetupArgument for each arg in Args.
364   llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
365   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
366   CharUnits Offset = CharUnits::Zero();
367   for (const VarDecl *A : Args) {
368     auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
369     Offset = Offset.alignTo(TInfo.Align);
370     llvm::Value *Args[] = {
371         CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
372                                       VoidPtrTy),
373         llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
374         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
375     };
376     llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
377     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
378     llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
379     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
380     CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
381     CGF.EmitBlock(NextBlock);
382     Offset += TInfo.Width;
383   }
384 
385   // Emit the call to cudaLaunch
386   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
387   llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
388   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
389   CGF.EmitBranch(EndBlock);
390 
391   CGF.EmitBlock(EndBlock);
392 }
393 
394 // Replace the original variable Var with the address loaded from variable
395 // ManagedVar populated by HIP runtime.
396 static void replaceManagedVar(llvm::GlobalVariable *Var,
397                               llvm::GlobalVariable *ManagedVar) {
398   SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
399   for (auto &&VarUse : Var->uses()) {
400     WorkList.push_back({VarUse.getUser()});
401   }
402   while (!WorkList.empty()) {
403     auto &&WorkItem = WorkList.pop_back_val();
404     auto *U = WorkItem.back();
405     if (isa<llvm::ConstantExpr>(U)) {
406       for (auto &&UU : U->uses()) {
407         WorkItem.push_back(UU.getUser());
408         WorkList.push_back(WorkItem);
409         WorkItem.pop_back();
410       }
411       continue;
412     }
413     if (auto *I = dyn_cast<llvm::Instruction>(U)) {
414       llvm::Value *OldV = Var;
415       llvm::Instruction *NewV =
416           new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
417                              llvm::Align(Var->getAlignment()), I);
418       WorkItem.pop_back();
419       // Replace constant expressions directly or indirectly using the managed
420       // variable with instructions.
421       for (auto &&Op : WorkItem) {
422         auto *CE = cast<llvm::ConstantExpr>(Op);
423         auto *NewInst = llvm::createReplacementInstr(CE, I);
424         NewInst->replaceUsesOfWith(OldV, NewV);
425         OldV = CE;
426         NewV = NewInst;
427       }
428       I->replaceUsesOfWith(OldV, NewV);
429     } else {
430       llvm_unreachable("Invalid use of managed variable");
431     }
432   }
433 }
434 
435 /// Creates a function that sets up state on the host side for CUDA objects that
436 /// have a presence on both the host and device sides. Specifically, registers
437 /// the host side of kernel functions and device global variables with the CUDA
438 /// runtime.
439 /// \code
440 /// void __cuda_register_globals(void** GpuBinaryHandle) {
441 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
442 ///    ...
443 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
444 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
445 ///    ...
446 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
447 /// }
448 /// \endcode
449 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
450   // No need to register anything
451   if (EmittedKernels.empty() && DeviceVars.empty())
452     return nullptr;
453 
454   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
455       getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
456       addUnderscoredPrefixToName("_register_globals"), &TheModule);
457   llvm::BasicBlock *EntryBB =
458       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
459   CGBuilderTy Builder(CGM, Context);
460   Builder.SetInsertPoint(EntryBB);
461 
462   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
463   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
464   llvm::Type *RegisterFuncParams[] = {
465       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
466       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
467   llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
468       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
469       addUnderscoredPrefixToName("RegisterFunction"));
470 
471   // Extract GpuBinaryHandle passed as the first argument passed to
472   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
473   // each emitted kernel.
474   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
475   for (auto &&I : EmittedKernels) {
476     llvm::Constant *KernelName =
477         makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
478     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
479     llvm::Value *Args[] = {
480         &GpuBinaryHandlePtr,
481         Builder.CreateBitCast(I.Kernel, VoidPtrTy),
482         KernelName,
483         KernelName,
484         llvm::ConstantInt::get(IntTy, -1),
485         NullPtr,
486         NullPtr,
487         NullPtr,
488         NullPtr,
489         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
490     Builder.CreateCall(RegisterFunc, Args);
491   }
492 
493   llvm::Type *VarSizeTy = IntTy;
494   // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
495   if (CGM.getLangOpts().HIP ||
496       ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
497     VarSizeTy = SizeTy;
498 
499   // void __cudaRegisterVar(void **, char *, char *, const char *,
500   //                        int, int, int, int)
501   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
502                                      CharPtrTy,    IntTy,     VarSizeTy,
503                                      IntTy,        IntTy};
504   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
505       llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
506       addUnderscoredPrefixToName("RegisterVar"));
507   // void __hipRegisterManagedVar(void **, char *, char *, const char *,
508   //                              size_t, unsigned)
509   llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
510                                             CharPtrTy,    VarSizeTy, IntTy};
511   llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
512       llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
513       addUnderscoredPrefixToName("RegisterManagedVar"));
514   // void __cudaRegisterSurface(void **, const struct surfaceReference *,
515   //                            const void **, const char *, int, int);
516   llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
517       llvm::FunctionType::get(
518           VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
519           false),
520       addUnderscoredPrefixToName("RegisterSurface"));
521   // void __cudaRegisterTexture(void **, const struct textureReference *,
522   //                            const void **, const char *, int, int, int)
523   llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
524       llvm::FunctionType::get(
525           VoidTy,
526           {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
527           false),
528       addUnderscoredPrefixToName("RegisterTexture"));
529   for (auto &&Info : DeviceVars) {
530     llvm::GlobalVariable *Var = Info.Var;
531     llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
532     switch (Info.Flags.getKind()) {
533     case DeviceVarFlags::Variable: {
534       uint64_t VarSize =
535           CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
536       if (Info.Flags.isManaged()) {
537         auto ManagedVar = new llvm::GlobalVariable(
538             CGM.getModule(), Var->getType(),
539             /*isConstant=*/false, Var->getLinkage(),
540             /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
541             Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
542             llvm::GlobalVariable::NotThreadLocal);
543         replaceManagedVar(Var, ManagedVar);
544         llvm::Value *Args[] = {
545             &GpuBinaryHandlePtr,
546             Builder.CreateBitCast(ManagedVar, VoidPtrTy),
547             Builder.CreateBitCast(Var, VoidPtrTy),
548             VarName,
549             llvm::ConstantInt::get(VarSizeTy, VarSize),
550             llvm::ConstantInt::get(IntTy, Var->getAlignment())};
551         Builder.CreateCall(RegisterManagedVar, Args);
552       } else {
553         llvm::Value *Args[] = {
554             &GpuBinaryHandlePtr,
555             Builder.CreateBitCast(Var, VoidPtrTy),
556             VarName,
557             VarName,
558             llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
559             llvm::ConstantInt::get(VarSizeTy, VarSize),
560             llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
561             llvm::ConstantInt::get(IntTy, 0)};
562         Builder.CreateCall(RegisterVar, Args);
563       }
564       break;
565     }
566     case DeviceVarFlags::Surface:
567       Builder.CreateCall(
568           RegisterSurf,
569           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
570            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
571            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
572       break;
573     case DeviceVarFlags::Texture:
574       Builder.CreateCall(
575           RegisterTex,
576           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
577            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
578            llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
579            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
580       break;
581     }
582   }
583 
584   Builder.CreateRetVoid();
585   return RegisterKernelsFunc;
586 }
587 
588 /// Creates a global constructor function for the module:
589 ///
590 /// For CUDA:
591 /// \code
592 /// void __cuda_module_ctor(void*) {
593 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
594 ///     __cuda_register_globals(Handle);
595 /// }
596 /// \endcode
597 ///
598 /// For HIP:
599 /// \code
600 /// void __hip_module_ctor(void*) {
601 ///     if (__hip_gpubin_handle == 0) {
602 ///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
603 ///         __hip_register_globals(__hip_gpubin_handle);
604 ///     }
605 /// }
606 /// \endcode
607 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
608   bool IsHIP = CGM.getLangOpts().HIP;
609   bool IsCUDA = CGM.getLangOpts().CUDA;
610   // No need to generate ctors/dtors if there is no GPU binary.
611   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
612   if (CudaGpuBinaryFileName.empty() && !IsHIP)
613     return nullptr;
614   if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
615       DeviceVars.empty())
616     return nullptr;
617 
618   // void __{cuda|hip}_register_globals(void* handle);
619   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
620   // We always need a function to pass in as callback. Create a dummy
621   // implementation if we don't need to register anything.
622   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
623     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
624 
625   // void ** __{cuda|hip}RegisterFatBinary(void *);
626   llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
627       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
628       addUnderscoredPrefixToName("RegisterFatBinary"));
629   // struct { int magic, int version, void * gpu_binary, void * dont_care };
630   llvm::StructType *FatbinWrapperTy =
631       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
632 
633   // Register GPU binary with the CUDA runtime, store returned handle in a
634   // global variable and save a reference in GpuBinaryHandle to be cleaned up
635   // in destructor on exit. Then associate all known kernels with the GPU binary
636   // handle so CUDA runtime can figure out what to call on the GPU side.
637   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
638   if (!CudaGpuBinaryFileName.empty()) {
639     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
640         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
641     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
642       CGM.getDiags().Report(diag::err_cannot_open_file)
643           << CudaGpuBinaryFileName << EC.message();
644       return nullptr;
645     }
646     CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
647   }
648 
649   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
650       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
651       llvm::GlobalValue::InternalLinkage,
652       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
653   llvm::BasicBlock *CtorEntryBB =
654       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
655   CGBuilderTy CtorBuilder(CGM, Context);
656 
657   CtorBuilder.SetInsertPoint(CtorEntryBB);
658 
659   const char *FatbinConstantName;
660   const char *FatbinSectionName;
661   const char *ModuleIDSectionName;
662   StringRef ModuleIDPrefix;
663   llvm::Constant *FatBinStr;
664   unsigned FatMagic;
665   if (IsHIP) {
666     FatbinConstantName = ".hip_fatbin";
667     FatbinSectionName = ".hipFatBinSegment";
668 
669     ModuleIDSectionName = "__hip_module_id";
670     ModuleIDPrefix = "__hip_";
671 
672     if (CudaGpuBinary) {
673       // If fatbin is available from early finalization, create a string
674       // literal containing the fat binary loaded from the given file.
675       const unsigned HIPCodeObjectAlign = 4096;
676       FatBinStr =
677           makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
678                              FatbinConstantName, HIPCodeObjectAlign);
679     } else {
680       // If fatbin is not available, create an external symbol
681       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
682       // to contain the fat binary but will be populated somewhere else,
683       // e.g. by lld through link script.
684       FatBinStr = new llvm::GlobalVariable(
685         CGM.getModule(), CGM.Int8Ty,
686         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
687         "__hip_fatbin", nullptr,
688         llvm::GlobalVariable::NotThreadLocal);
689       cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
690     }
691 
692     FatMagic = HIPFatMagic;
693   } else {
694     if (RelocatableDeviceCode)
695       FatbinConstantName = CGM.getTriple().isMacOSX()
696                                ? "__NV_CUDA,__nv_relfatbin"
697                                : "__nv_relfatbin";
698     else
699       FatbinConstantName =
700           CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
701     // NVIDIA's cuobjdump looks for fatbins in this section.
702     FatbinSectionName =
703         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
704 
705     ModuleIDSectionName = CGM.getTriple().isMacOSX()
706                               ? "__NV_CUDA,__nv_module_id"
707                               : "__nv_module_id";
708     ModuleIDPrefix = "__nv_";
709 
710     // For CUDA, create a string literal containing the fat binary loaded from
711     // the given file.
712     FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
713                                    FatbinConstantName, 8);
714     FatMagic = CudaFatMagic;
715   }
716 
717   // Create initialized wrapper structure that points to the loaded GPU binary
718   ConstantInitBuilder Builder(CGM);
719   auto Values = Builder.beginStruct(FatbinWrapperTy);
720   // Fatbin wrapper magic.
721   Values.addInt(IntTy, FatMagic);
722   // Fatbin version.
723   Values.addInt(IntTy, 1);
724   // Data.
725   Values.add(FatBinStr);
726   // Unused in fatbin v1.
727   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
728   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
729       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
730       /*constant*/ true);
731   FatbinWrapper->setSection(FatbinSectionName);
732 
733   // There is only one HIP fat binary per linked module, however there are
734   // multiple constructor functions. Make sure the fat binary is registered
735   // only once. The constructor functions are executed by the dynamic loader
736   // before the program gains control. The dynamic loader cannot execute the
737   // constructor functions concurrently since doing that would not guarantee
738   // thread safety of the loaded program. Therefore we can assume sequential
739   // execution of constructor functions here.
740   if (IsHIP) {
741     auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
742         llvm::GlobalValue::LinkOnceAnyLinkage;
743     llvm::BasicBlock *IfBlock =
744         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
745     llvm::BasicBlock *ExitBlock =
746         llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
747     // The name, size, and initialization pattern of this variable is part
748     // of HIP ABI.
749     GpuBinaryHandle = new llvm::GlobalVariable(
750         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
751         Linkage,
752         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
753         "__hip_gpubin_handle");
754     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
755     // Prevent the weak symbol in different shared libraries being merged.
756     if (Linkage != llvm::GlobalValue::InternalLinkage)
757       GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
758     Address GpuBinaryAddr(
759         GpuBinaryHandle,
760         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
761     {
762       auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
763       llvm::Constant *Zero =
764           llvm::Constant::getNullValue(HandleValue->getType());
765       llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
766       CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
767     }
768     {
769       CtorBuilder.SetInsertPoint(IfBlock);
770       // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
771       llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
772           RegisterFatbinFunc,
773           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
774       CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
775       CtorBuilder.CreateBr(ExitBlock);
776     }
777     {
778       CtorBuilder.SetInsertPoint(ExitBlock);
779       // Call __hip_register_globals(GpuBinaryHandle);
780       if (RegisterGlobalsFunc) {
781         auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
782         CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
783       }
784     }
785   } else if (!RelocatableDeviceCode) {
786     // Register binary with CUDA runtime. This is substantially different in
787     // default mode vs. separate compilation!
788     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
789     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
790         RegisterFatbinFunc,
791         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
792     GpuBinaryHandle = new llvm::GlobalVariable(
793         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
794         llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
795     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
796     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
797                                    CGM.getPointerAlign());
798 
799     // Call __cuda_register_globals(GpuBinaryHandle);
800     if (RegisterGlobalsFunc)
801       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
802 
803     // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
804     if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
805                            CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
806       // void __cudaRegisterFatBinaryEnd(void **);
807       llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
808           llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
809           "__cudaRegisterFatBinaryEnd");
810       CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
811     }
812   } else {
813     // Generate a unique module ID.
814     SmallString<64> ModuleID;
815     llvm::raw_svector_ostream OS(ModuleID);
816     OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
817     llvm::Constant *ModuleIDConstant = makeConstantString(
818         std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
819 
820     // Create an alias for the FatbinWrapper that nvcc will look for.
821     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
822                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
823 
824     // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
825     // void *, void (*)(void **))
826     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
827     RegisterLinkedBinaryName += ModuleID;
828     llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
829         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
830 
831     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
832     llvm::Value *Args[] = {RegisterGlobalsFunc,
833                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
834                            ModuleIDConstant,
835                            makeDummyFunction(getCallbackFnTy())};
836     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
837   }
838 
839   // Create destructor and register it with atexit() the way NVCC does it. Doing
840   // it during regular destructor phase worked in CUDA before 9.2 but results in
841   // double-free in 9.2.
842   if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
843     // extern "C" int atexit(void (*f)(void));
844     llvm::FunctionType *AtExitTy =
845         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
846     llvm::FunctionCallee AtExitFunc =
847         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
848                                   /*Local=*/true);
849     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
850   }
851 
852   CtorBuilder.CreateRetVoid();
853   return ModuleCtorFunc;
854 }
855 
856 /// Creates a global destructor function that unregisters the GPU code blob
857 /// registered by constructor.
858 ///
859 /// For CUDA:
860 /// \code
861 /// void __cuda_module_dtor(void*) {
862 ///     __cudaUnregisterFatBinary(Handle);
863 /// }
864 /// \endcode
865 ///
866 /// For HIP:
867 /// \code
868 /// void __hip_module_dtor(void*) {
869 ///     if (__hip_gpubin_handle) {
870 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
871 ///         __hip_gpubin_handle = 0;
872 ///     }
873 /// }
874 /// \endcode
875 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
876   // No need for destructor if we don't have a handle to unregister.
877   if (!GpuBinaryHandle)
878     return nullptr;
879 
880   // void __cudaUnregisterFatBinary(void ** handle);
881   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
882       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
883       addUnderscoredPrefixToName("UnregisterFatBinary"));
884 
885   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
886       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
887       llvm::GlobalValue::InternalLinkage,
888       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
889 
890   llvm::BasicBlock *DtorEntryBB =
891       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
892   CGBuilderTy DtorBuilder(CGM, Context);
893   DtorBuilder.SetInsertPoint(DtorEntryBB);
894 
895   Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
896                                              GpuBinaryHandle->getAlignment()));
897   auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
898   // There is only one HIP fat binary per linked module, however there are
899   // multiple destructor functions. Make sure the fat binary is unregistered
900   // only once.
901   if (CGM.getLangOpts().HIP) {
902     llvm::BasicBlock *IfBlock =
903         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
904     llvm::BasicBlock *ExitBlock =
905         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
906     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
907     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
908     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
909 
910     DtorBuilder.SetInsertPoint(IfBlock);
911     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
912     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
913     DtorBuilder.CreateBr(ExitBlock);
914 
915     DtorBuilder.SetInsertPoint(ExitBlock);
916   } else {
917     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
918   }
919   DtorBuilder.CreateRetVoid();
920   return ModuleDtorFunc;
921 }
922 
923 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
924   return new CGNVCUDARuntime(CGM);
925 }
926