//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/CharUnits.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

  /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
  StringRef Prefix;

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to the LLVM context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel's mangled name to the symbol that identifies the kernel in
  // host code. For CUDA, that symbol is the same as the device stub function.
  // For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
  Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
                                       FunctionArgList &Args);
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  return ("__" + Prefix + FuncName).str();
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, create a device mangle
  // context so that mangling retrieves the additional device lambda mangling
  // number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;

  if (CGM.getLangOpts().OffloadViaLLVM)
    Prefix = "llvm";
  else if (CGM.getLangOpts().HIP)
    Prefix = "hip";
  else
    Prefix = "cuda";
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
      (CGF.getLangOpts().OffloadViaLLVM))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

/// CUDA passes the arguments with a level of indirection. For example, a
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
/// function. For the LLVM/offload launch we flatten the arguments into the
/// struct directly. In addition, we include the size of the arguments, thus
/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
/// nullptr}. The last nullptr needs to be initialized to an array of pointers
/// pointing to the arguments if we want to offload to the host.
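///
/// For illustration only (a hedged sketch; the struct names below are made up
/// and nothing is emitted verbatim like this): for a kernel taking
/// (void *p, short s), this path builds roughly
/// \code
///   struct KernelArgs { void *p; short s; } args = {p, s};
///   struct LaunchParams { uint64_t size; void *args; void *ptrs; } params =
///       {sizeof(args), &args, nullptr};
/// \endcode
/// whereas the legacy path would instead pass an array of pointers {&p, &s}.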
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
                                                      FunctionArgList &Args) {
  SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
  for (auto &Arg : Args)
    ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
  llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);

  auto *Int64Ty = CGF.Builder.getInt64Ty();
  KernelLaunchParamsTypes.push_back(Int64Ty);
  KernelLaunchParamsTypes.push_back(PtrTy);
  KernelLaunchParamsTypes.push_back(PtrTy);

  llvm::StructType *KernelLaunchParamsTy =
      llvm::StructType::create(KernelLaunchParamsTypes);
  Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
      KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
  Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
      KernelLaunchParamsTy, CharUnits::fromQuantity(16),
      "kernel_launch_params");

  auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));

  for (unsigned i = 0; i < Args.size(); ++i) {
    auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
    CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
  }

  return KernelLaunchParams;
}

Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
                                           FunctionArgList &Args) {
  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }
  return KernelArgs;
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
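//
// Roughly (an illustrative sketch; the stub and handle names are
// hypothetical), for `__global__ void k(int x)` the emitted stub behaves like:
//   void __device_stub__k(int x) {
//     void *args[] = {&x};
//     dim3 grid, block; size_t shmem; cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);
//     cudaLaunchKernel((void *)handle, grid, block, args, shmem, stream);
//   }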
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.
  Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
                           ? prepareKernelArgsLLVMOffload(CGF, Args)
                           : prepareKernelArgs(CGF, Args);

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
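  //
  // Sketch of the generated pattern (illustrative IR; the real name is the
  // stub's mangled name plus ".id"):
  //   @__device_stub__k.id = global i8 0
  //   store volatile i8 1, ptr @__device_stub__k.id
  // The volatile store gives each stub a distinct side effect, so ICF cannot
  // fold two stubs into one.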
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
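  //
  // Roughly (an illustrative sketch), for `__global__ void k(int a, float b)`
  // the emitted stub behaves like:
  //   if (cudaSetupArgument(&a, sizeof(a), offset_a) == 0)
  //     if (cudaSetupArgument(&b, sizeof(b), offset_b) == 0)
  //       cudaLaunch(handle);
  // where each offset is the argument's aligned position in the parameter
  // buffer.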
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar, which is populated by the HIP runtime.
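//
// Schematically (illustrative IR; @var.ptr stands for ManagedVar), a use such
// as
//   store i32 1, ptr @var
// becomes
//   %ld.managed = load ptr, ptr @var.ptr
//   store i32 1, ptr %ld.managed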
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV = new llvm::LoadInst(
          Var->getType(), ManagedVar, "ld.managed", false,
          llvm::Align(Var->getAlignment()), I->getIterator());
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy,     PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy,     PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle to be
  // cleaned up in the destructor on exit. Then associate all known kernels
  // with the GPU binary handle so the CUDA runtime can figure out what to
  // call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin" + (CGM.getLangOpts().CUID.empty()
                                ? ""
                                : "_" + CGM.getContext().getCUIDHash()),
          nullptr, llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module; however, there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage
                                         : llvm::GlobalValue::InternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        !RelocatableDeviceCode ? llvm::ConstantPointerNull::get(PtrTy)
                               : nullptr,
        "__hip_gpubin_handle" + (CGM.getLangOpts().CUID.empty()
                                     ? ""
                                     : "_" + CGM.getContext().getCUIDHash()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module; however, there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables always need to be recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler.used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of the managed memory, which will be allocated by the runtime.
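//
// Schematically (an illustrative sketch), for `__managed__ int x;` the result
// is:
//   @x         = global ptr null  ; transformed variable, filled by the runtime
//   @x.managed = global i32 0     ; original storage, renamed
// and every former use of `x` first loads the real address from @x.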
transformManagedVars()1160 void CGNVCUDARuntime::transformManagedVars() {
1161   for (auto &&Info : DeviceVars) {
1162     llvm::GlobalVariable *Var = Info.Var;
1163     if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1164         Info.Flags.isManaged()) {
1165       auto *ManagedVar = new llvm::GlobalVariable(
1166           CGM.getModule(), Var->getType(),
1167           /*isConstant=*/false, Var->getLinkage(),
1168           /*Init=*/Var->isDeclaration()
1169               ? nullptr
1170               : llvm::ConstantPointerNull::get(Var->getType()),
1171           /*Name=*/"", /*InsertBefore=*/nullptr,
1172           llvm::GlobalVariable::NotThreadLocal,
1173           CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1174                                                      ? LangAS::cuda_device
1175                                                      : LangAS::Default));
1176       ManagedVar->setDSOLocal(Var->isDSOLocal());
1177       ManagedVar->setVisibility(Var->getVisibility());
1178       ManagedVar->setExternallyInitialized(true);
1179       replaceManagedVar(Var, ManagedVar);
1180       ManagedVar->takeName(Var);
1181       Var->setName(Twine(ManagedVar->getName()) + ".managed");
1182       // Keep managed variables even if they are not used in device code since
1183       // they need to be allocated by the runtime.
1184       if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1185         assert(!ManagedVar->isDeclaration());
1186         CGM.addCompilerUsedGlobal(Var);
1187         CGM.addCompilerUsedGlobal(ManagedVar);
1188       }
1189     }
1190   }
1191 }
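
// A sketch of the IR effect of transformManagedVars (hypothetical variable
// name; assumes a device compilation where LangAS::cuda_device lowers to
// addrspace(1)):
//
//   before: @x = global i32 0
//   after:  @x = addrspace(1) externally_initialized global ptr null
//           @x.managed = global i32 0
//
// replaceManagedVar rewrites every use of the original variable into a load
// of the runtime-provided address stored in the new pointer-valued @x.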

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
  llvm::object::OffloadKind Kind = CGM.getLangOpts().HIP
                                       ? llvm::object::OffloadKind::OFK_HIP
                                       : llvm::object::OffloadKind::OFK_Cuda;
  // For now, just spoof this as OpenMP because that's the runtime it uses.
  if (CGM.getLangOpts().OffloadViaLLVM)
    Kind = llvm::object::OffloadKind::OFK_OpenMP;

  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, Kind, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      if (I.Flags.isManaged()) {
        assert(I.Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");

        auto *ManagedVar = M.getNamedGlobal(
            I.Var->getName().drop_back(StringRef(".managed").size()));
        llvm::offloading::emitOffloadingEntry(
            M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
            llvm::offloading::OffloadGlobalManagedEntry | Flags,
            /*Data=*/I.Var->getAlignment(), ManagedVar);
      } else {
        llvm::offloading::emitOffloadingEntry(
            M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
            llvm::offloading::OffloadGlobalEntry | Flags,
            /*Data=*/0);
      }
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType());
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType());
    }
  }
}
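
// Illustrative flag composition for the entries above: an extern __constant__
// variable, for example, is emitted with
//   OffloadGlobalEntry | OffloadGlobalExtern | OffloadGlobalConstant
// while a managed variable's entry points at the renamed "<name>.managed"
// global, carries its alignment as Data, and passes the pointer-valued
// replacement created by transformManagedVars as the auxiliary symbol.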

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler-used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute, since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadViaLLVM ||
      (CGM.getLangOpts().OffloadingNewDriver &&
       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}
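
// A sketch of the dispatch above (the exact ctor name depends on
// addUnderscoredPrefixToName and is hedged here):
//   device compilation        -> mark ODR-used vars compiler-used, no ctor
//   host, OffloadViaLLVM, or the new driver with HIP or relocatable device
//   code (-fgpu-rdc)          -> emit offloading entries, no ctor
//   host, legacy flow         -> return makeModuleCtorFunction(), e.g. a
//                                "__cuda_module_ctor"-style constructor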

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}
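
// Illustrative contrast for a hypothetical kernel `__global__ void k()`:
//   CUDA: the handle is the host-side stub function itself, so a launch
//         resolves directly to the stub (handle == stub).
//   HIP:  the handle is a constant global mangled with the kernel's own name
//         (KernelReferenceKind::Kernel); the stub carries a distinct mangling,
//         and KernelStubs maps the handle global back to that stub.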