//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying kernel in host code
  // For CUDA, the symbol for identifying the kernel is the same as the device
  // stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
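// For example (illustrative IR only; the names are placeholders), a use such
// as
//   store i32 1, ptr @Var
// is rewritten to
//   %ld.managed = load ptr, ptr @ManagedVar
//   store i32 1, ptr %ld.managed
// so every access goes through the pointer that the runtime fills in.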
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
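  // Each generated call looks roughly like this (illustrative sketch; the
  // exact arguments are built below):
  //   __cudaRegisterFunction(GpuBinaryHandle, <kernel handle>,
  //                          "<device-side name>", "<device-side name>", -1,
  //                          nullptr, nullptr, nullptr, nullptr, nullptr);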
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero =
        llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by different TUs.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
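  //
  // For example (illustrative only): with -fno-gpu-rdc, the host-side shadow
  // of a
  //   __device__ int X;
  // defined in two different TUs gets internal linkage in each TU, so the two
  // shadows never clash at link time.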
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables always need to be recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler.used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}