xref: /freebsd/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel's mangled name to the symbol used to identify the kernel in
  // host code. For CUDA, the symbol for identifying the kernel is the same as
  // the device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
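  // For example (illustrative, assuming a kernel `__global__ void foo()`): in
  // CUDA the handle for "foo" is the host-side stub function itself, while in
  // HIP the handle is a separate global variable emitted under the kernel's
  // mangled name and initialized with the stub's address (see getKernelHandle
  // and emitDeviceStub below).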
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

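// For example (illustrative): addPrefixToName("LaunchKernel") yields
// "cudaLaunchKernel" for CUDA and "hipLaunchKernel" for HIP, while
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__cudaRegisterFatBinary" or "__hipRegisterFatBinary" respectively.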
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
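//
// Roughly, the emitted stub body is equivalent to the following sketch
// (illustrative only; `argN` and the stub name stand in for the actual
// parameters and mangled names):
//
//   void __device_stub__kernel(T0 arg0, ..., TN argN) {
//     void *kernel_args[] = {&arg0, ..., &argN};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)handle, grid_dim, block_dim, kernel_args,
//                      shmem_size, stream);
//   }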
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments.  If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup the cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));
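
  // The caller-side lowering of `kernel<<<grid, block, shmem, stream>>>(...)`
  // pushes the launch configuration via __{cuda|hip}PushCallConfiguration and
  // then calls this stub; pop that configuration back out here so it can be
  // forwarded to the launch API below.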

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunchKernel
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

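// Under the legacy (pre-CUDA-9.0) launch ABI, each argument is copied into the
// pending launch configuration with cudaSetupArgument() at its aligned offset,
// and cudaLaunch() then fires the kernel identified by the stub's own address.
// An illustrative sketch (names and offsets are for exposition only):
//
//   void __device_stub__kernel(int a, float b) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) != 0) goto end;
//     if (cudaSetupArgument(&b, sizeof(b), /*offset=*/offset_b) != 0) goto end;
//     cudaLaunch((char *)&__device_stub__kernel);
//   end:;
//   }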
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar, which is populated by the HIP runtime.
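//
// For example (illustrative IR): a direct use such as
//   %v = load i32, ptr @var
// becomes
//   %ld.managed = load ptr, ptr @var.ptr   ; address provided by the runtime
//   %v = load i32, ptr %ld.managed
// where @var.ptr stands for ManagedVar; constant-expression users are first
// expanded into instructions so they can be rewritten the same way.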
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy,     PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy,     PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
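  // For a CUDA compile, the wrapper typically ends up as something like this
  // illustrative IR (the exact fatbin symbol name and section vary):
  //   @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr }
  //     { i32 1180844977, i32 1, ptr @fatbin, ptr null },
  //     section ".nvFatBinSegment"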

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
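//
// For example (illustrative IR, address-space qualifiers omitted): given
// `__managed__ int x = 0;`, after the transform the module contains
//   @x = externally_initialized global ptr null  ; filled in by the runtime
//   @x.managed = global i32 0                    ; the original definition
// and every former use of `x` first loads the managed address from @x.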
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
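//
// Each entry is, roughly, an { address, name, size, flags, data } record that
// llvm::offloading::emitOffloadingEntry writes into the section named below;
// the host runtime walks the linker-provided section bounds to find them.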
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

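// Returns the symbol used to identify kernel F on the host side: for CUDA this
// is the stub function itself; for HIP, a dedicated global variable emitted
// under the kernel's mangled name, with the stub recorded in KernelStubs. If F
// was regenerated for a name we have already seen, the existing handle is
// rewired to the new stub.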
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}