//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
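// These magic numbers are placed in the first field of the fatbin wrapper
// struct emitted by makeModuleCtorFunction() and handed to
// __{cuda|hip}RegisterFatBinary().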

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

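// For example, addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" or
// "hipLaunchKernel", and addUnderscoredPrefixToName("RegisterFatBinary")
// yields "__cudaRegisterFatBinary" or "__hipRegisterFatBinary", depending on
// whether this is a CUDA or a HIP compilation.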
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, create a device mangle
  // context so that mangling retrieves the additional device lambda mangling
  // number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
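// As a sketch (illustrative names, not the literal emitted IR), the stub body
// produced below for a kernel `__global__ void k(T a, U b)` behaves like:
//
//   void *kernel_args[] = {&a, &b};
//   dim3 grid_dim, block_dim; size_t shmem; void *stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem, &stream);
//   cudaLaunchKernel((const void *)handle, grid_dim, block_dim, kernel_args,
//                    shmem, stream);
//
// with the hip-prefixed runtime entry points for HIP compilations.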
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

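// In the legacy launch scheme emitted below, each argument is handed to the
// runtime individually. A sketch (illustrative names, not the literal IR):
//
//   if (cudaSetupArgument(&a, sizeof(a), offset_a) == 0)
//     if (cudaSetupArgument(&b, sizeof(b), offset_b) == 0)
//       cudaLaunch((char *)handle);
//
// where each offset is the argument's offset, aligned to its natural
// alignment, within the launch argument buffer.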
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar populated by the HIP runtime.
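// Uses of Var may be nested inside constant expressions. For each use, the
// worklist below records the chain of constant expressions between Var and
// the using instruction; those constant expressions are re-materialized as
// instructions in front of that use so the value loaded from ManagedVar can
// be substituted for Var.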
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects
/// that have a presence on both the host and device sides. Specifically,
/// registers the host side of kernel functions and device global variables
/// with the CUDA runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
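/// For a HIP compilation, the same structure is emitted with the __hip prefix
/// (__hip_register_globals, __hipRegisterFunction, __hipRegisterVar, ...),
/// courtesy of addUnderscoredPrefixToName().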
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size parameter has type size_t.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
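///
/// For CUDA with relocatable device code, the constructor instead forwards
/// registration to the runtime's linker support (a sketch of the path
/// implemented further below):
/// \code
/// void __cuda_module_ctor() {
///   __cudaRegisterLinkedBinary<ModuleID>(__cuda_register_globals,
///                                        &__cuda_fatbin_wrapper, ModuleID,
///                                        dummy_callback);
/// }
/// \endcode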
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module; however, there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2 but
  // results in a double free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module; however, there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of
  // device-side global variables become internal definitions. These have to
  // be internal in order to prevent name conflicts with global host variables
  // with the same name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
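// For example (a sketch): given `__managed__ int x;`, the module ends up with
// a global pointer named `x` (externally initialized by the runtime) and the
// original allocation renamed to `x.managed`; every former use of the
// variable first loads the pointer from `x`.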
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

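// Returns the symbol used to identify the kernel on the host side. For CUDA
// this is the device stub function itself; for HIP it is a separate global
// variable, named after the kernel, whose initializer is set to the stub in
// emitDeviceStub().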
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}