1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CGCXXABI.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "clang/Basic/Cuda.h" 20 #include "clang/CodeGen/CodeGenABITypes.h" 21 #include "clang/CodeGen/ConstantInitBuilder.h" 22 #include "llvm/IR/BasicBlock.h" 23 #include "llvm/IR/Constants.h" 24 #include "llvm/IR/DerivedTypes.h" 25 #include "llvm/IR/ReplaceConstant.h" 26 #include "llvm/Support/Format.h" 27 #include "llvm/Support/VirtualFileSystem.h" 28 29 using namespace clang; 30 using namespace CodeGen; 31 32 namespace { 33 constexpr unsigned CudaFatMagic = 0x466243b1; 34 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 35 36 class CGNVCUDARuntime : public CGCUDARuntime { 37 38 private: 39 llvm::IntegerType *IntTy, *SizeTy; 40 llvm::Type *VoidTy; 41 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 42 43 /// Convenience reference to LLVM Context 44 llvm::LLVMContext &Context; 45 /// Convenience reference to the current module 46 llvm::Module &TheModule; 47 /// Keeps track of kernel launch stubs and handles emitted in this module 48 struct KernelInfo { 49 llvm::Function *Kernel; // stub function to help launch kernel 50 const Decl *D; 51 }; 52 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 53 // Map a kernel mangled name to a symbol for identifying kernel in host code 54 // For CUDA, the symbol for identifying the kernel is the same as the device 55 // stub function. For HIP, they are different. 56 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles; 57 // Map a kernel handle to the kernel stub. 58 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; 59 struct VarInfo { 60 llvm::GlobalVariable *Var; 61 const VarDecl *D; 62 DeviceVarFlags Flags; 63 }; 64 llvm::SmallVector<VarInfo, 16> DeviceVars; 65 /// Keeps track of variable containing handle of GPU binary. Populated by 66 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 67 /// ModuleDtorFunction() 68 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 69 /// Whether we generate relocatable device code. 70 bool RelocatableDeviceCode; 71 /// Mangle context for device. 72 std::unique_ptr<MangleContext> DeviceMC; 73 /// Some zeros used for GEPs. 74 llvm::Constant *Zeros[2]; 75 76 llvm::FunctionCallee getSetupArgumentFn() const; 77 llvm::FunctionCallee getLaunchFn() const; 78 79 llvm::FunctionType *getRegisterGlobalsFnTy() const; 80 llvm::FunctionType *getCallbackFnTy() const; 81 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 82 std::string addPrefixToName(StringRef FuncName) const; 83 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 84 85 /// Creates a function to register all kernel stubs generated in this module. 86 llvm::Function *makeRegisterGlobalsFn(); 87 88 /// Helper function that generates a constant string and returns a pointer to 89 /// the start of the string. The result of this function can be used anywhere 90 /// where the C code specifies const char*. 91 llvm::Constant *makeConstantString(const std::string &Str, 92 const std::string &Name = "") { 93 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 94 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 95 ConstStr.getPointer(), Zeros); 96 } 97 98 /// Helper function which generates an initialized constant array from Str, 99 /// and optionally sets section name and alignment. AddNull specifies whether 100 /// the array should nave NUL termination. 101 llvm::Constant *makeConstantArray(StringRef Str, 102 StringRef Name = "", 103 StringRef SectionName = "", 104 unsigned Alignment = 0, 105 bool AddNull = false) { 106 llvm::Constant *Value = 107 llvm::ConstantDataArray::getString(Context, Str, AddNull); 108 auto *GV = new llvm::GlobalVariable( 109 TheModule, Value->getType(), /*isConstant=*/true, 110 llvm::GlobalValue::PrivateLinkage, Value, Name); 111 if (!SectionName.empty()) { 112 GV->setSection(SectionName); 113 // Mark the address as used which make sure that this section isn't 114 // merged and we will really have it in the object file. 115 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 116 } 117 if (Alignment) 118 GV->setAlignment(llvm::Align(Alignment)); 119 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros); 120 } 121 122 /// Helper function that generates an empty dummy function returning void. 123 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 124 assert(FnTy->getReturnType()->isVoidTy() && 125 "Can only generate dummy functions returning void!"); 126 llvm::Function *DummyFunc = llvm::Function::Create( 127 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 128 129 llvm::BasicBlock *DummyBlock = 130 llvm::BasicBlock::Create(Context, "", DummyFunc); 131 CGBuilderTy FuncBuilder(CGM, Context); 132 FuncBuilder.SetInsertPoint(DummyBlock); 133 FuncBuilder.CreateRetVoid(); 134 135 return DummyFunc; 136 } 137 138 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 139 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 140 std::string getDeviceSideName(const NamedDecl *ND) override; 141 142 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 143 bool Extern, bool Constant) { 144 DeviceVars.push_back({&Var, 145 VD, 146 {DeviceVarFlags::Variable, Extern, Constant, 147 VD->hasAttr<HIPManagedAttr>(), 148 /*Normalized*/ false, 0}}); 149 } 150 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, 151 bool Extern, int Type) { 152 DeviceVars.push_back({&Var, 153 VD, 154 {DeviceVarFlags::Surface, Extern, /*Constant*/ false, 155 /*Managed*/ false, 156 /*Normalized*/ false, Type}}); 157 } 158 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, 159 bool Extern, int Type, bool Normalized) { 160 DeviceVars.push_back({&Var, 161 VD, 162 {DeviceVarFlags::Texture, Extern, /*Constant*/ false, 163 /*Managed*/ false, Normalized, Type}}); 164 } 165 166 /// Creates module constructor function 167 llvm::Function *makeModuleCtorFunction(); 168 /// Creates module destructor function 169 llvm::Function *makeModuleDtorFunction(); 170 /// Transform managed variables for device compilation. 171 void transformManagedVars(); 172 /// Create offloading entries to register globals in RDC mode. 173 void createOffloadingEntries(); 174 175 public: 176 CGNVCUDARuntime(CodeGenModule &CGM); 177 178 llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; 179 llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { 180 auto Loc = KernelStubs.find(Handle); 181 assert(Loc != KernelStubs.end()); 182 return Loc->second; 183 } 184 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 185 void handleVarRegistration(const VarDecl *VD, 186 llvm::GlobalVariable &Var) override; 187 void 188 internalizeDeviceSideVar(const VarDecl *D, 189 llvm::GlobalValue::LinkageTypes &Linkage) override; 190 191 llvm::Function *finalizeModule() override; 192 }; 193 194 } // end anonymous namespace 195 196 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 197 if (CGM.getLangOpts().HIP) 198 return ((Twine("hip") + Twine(FuncName)).str()); 199 return ((Twine("cuda") + Twine(FuncName)).str()); 200 } 201 std::string 202 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 203 if (CGM.getLangOpts().HIP) 204 return ((Twine("__hip") + Twine(FuncName)).str()); 205 return ((Twine("__cuda") + Twine(FuncName)).str()); 206 } 207 208 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { 209 // If the host and device have different C++ ABIs, mark it as the device 210 // mangle context so that the mangling needs to retrieve the additional 211 // device lambda mangling number instead of the regular host one. 212 if (CGM.getContext().getAuxTargetInfo() && 213 CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && 214 CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { 215 return std::unique_ptr<MangleContext>( 216 CGM.getContext().createDeviceMangleContext( 217 *CGM.getContext().getAuxTargetInfo())); 218 } 219 220 return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( 221 CGM.getContext().getAuxTargetInfo())); 222 } 223 224 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 225 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 226 TheModule(CGM.getModule()), 227 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 228 DeviceMC(InitDeviceMC(CGM)) { 229 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 230 ASTContext &Ctx = CGM.getContext(); 231 232 IntTy = CGM.IntTy; 233 SizeTy = CGM.SizeTy; 234 VoidTy = CGM.VoidTy; 235 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0); 236 Zeros[1] = Zeros[0]; 237 238 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 239 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 240 VoidPtrPtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); 241 } 242 243 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 244 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 245 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 246 return CGM.CreateRuntimeFunction( 247 llvm::FunctionType::get(IntTy, Params, false), 248 addPrefixToName("SetupArgument")); 249 } 250 251 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 252 if (CGM.getLangOpts().HIP) { 253 // hipError_t hipLaunchByPtr(char *); 254 return CGM.CreateRuntimeFunction( 255 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 256 } 257 // cudaError_t cudaLaunch(char *); 258 return CGM.CreateRuntimeFunction( 259 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 260 } 261 262 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 263 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 264 } 265 266 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 267 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 268 } 269 270 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 271 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), VoidPtrTy, 272 VoidPtrTy, llvm::PointerType::getUnqual(Context)}; 273 return llvm::FunctionType::get(VoidTy, Params, false); 274 } 275 276 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { 277 GlobalDecl GD; 278 // D could be either a kernel or a variable. 279 if (auto *FD = dyn_cast<FunctionDecl>(ND)) 280 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); 281 else 282 GD = GlobalDecl(ND); 283 std::string DeviceSideName; 284 MangleContext *MC; 285 if (CGM.getLangOpts().CUDAIsDevice) 286 MC = &CGM.getCXXABI().getMangleContext(); 287 else 288 MC = DeviceMC.get(); 289 if (MC->shouldMangleDeclName(ND)) { 290 SmallString<256> Buffer; 291 llvm::raw_svector_ostream Out(Buffer); 292 MC->mangleName(GD, Out); 293 DeviceSideName = std::string(Out.str()); 294 } else 295 DeviceSideName = std::string(ND->getIdentifier()->getName()); 296 297 // Make unique name for device side static file-scope variable for HIP. 298 if (CGM.getContext().shouldExternalize(ND) && 299 CGM.getLangOpts().GPURelocatableDeviceCode) { 300 SmallString<256> Buffer; 301 llvm::raw_svector_ostream Out(Buffer); 302 Out << DeviceSideName; 303 CGM.printPostfixForExternalizedDecl(Out, ND); 304 DeviceSideName = std::string(Out.str()); 305 } 306 return DeviceSideName; 307 } 308 309 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 310 FunctionArgList &Args) { 311 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 312 if (auto *GV = 313 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) { 314 GV->setLinkage(CGF.CurFn->getLinkage()); 315 GV->setInitializer(CGF.CurFn); 316 } 317 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 318 CudaFeature::CUDA_USES_NEW_LAUNCH) || 319 (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) 320 emitDeviceStubBodyNew(CGF, Args); 321 else 322 emitDeviceStubBodyLegacy(CGF, Args); 323 } 324 325 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 326 // array and kernels are launched using cudaLaunchKernel(). 327 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 328 FunctionArgList &Args) { 329 // Build the shadow stack entry at the very start of the function. 330 331 // Calculate amount of space we will need for all arguments. If we have no 332 // args, allocate a single pointer so we still have a valid pointer to the 333 // argument array that we can pass to runtime, even if it will be unused. 334 Address KernelArgs = CGF.CreateTempAlloca( 335 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 336 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 337 // Store pointers to the arguments in a locally allocated launch_args. 338 for (unsigned i = 0; i < Args.size(); ++i) { 339 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 340 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 341 CGF.Builder.CreateDefaultAlignedStore( 342 VoidVarPtr, 343 CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i)); 344 } 345 346 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 347 348 // Lookup cudaLaunchKernel/hipLaunchKernel function. 349 // HIP kernel launching API name depends on -fgpu-default-stream option. For 350 // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', 351 // it is hipLaunchKernel_spt. 352 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 353 // void **args, size_t sharedMem, 354 // cudaStream_t stream); 355 // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, 356 // dim3 blockDim, void **args, 357 // size_t sharedMem, hipStream_t stream); 358 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 359 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 360 std::string KernelLaunchAPI = "LaunchKernel"; 361 if (CGF.getLangOpts().GPUDefaultStream == 362 LangOptions::GPUDefaultStreamKind::PerThread) { 363 if (CGF.getLangOpts().HIP) 364 KernelLaunchAPI = KernelLaunchAPI + "_spt"; 365 else if (CGF.getLangOpts().CUDA) 366 KernelLaunchAPI = KernelLaunchAPI + "_ptsz"; 367 } 368 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); 369 IdentifierInfo &cudaLaunchKernelII = 370 CGM.getContext().Idents.get(LaunchKernelName); 371 FunctionDecl *cudaLaunchKernelFD = nullptr; 372 for (auto *Result : DC->lookup(&cudaLaunchKernelII)) { 373 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 374 cudaLaunchKernelFD = FD; 375 } 376 377 if (cudaLaunchKernelFD == nullptr) { 378 CGM.Error(CGF.CurFuncDecl->getLocation(), 379 "Can't find declaration for " + LaunchKernelName); 380 return; 381 } 382 // Create temporary dim3 grid_dim, block_dim. 383 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 384 QualType Dim3Ty = GridDimParam->getType(); 385 Address GridDim = 386 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 387 Address BlockDim = 388 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 389 Address ShmemSize = 390 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 391 Address Stream = 392 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 393 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 394 llvm::FunctionType::get(IntTy, 395 {/*gridDim=*/GridDim.getType(), 396 /*blockDim=*/BlockDim.getType(), 397 /*ShmemSize=*/ShmemSize.getType(), 398 /*Stream=*/Stream.getType()}, 399 /*isVarArg=*/false), 400 addUnderscoredPrefixToName("PopCallConfiguration")); 401 402 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 403 {GridDim.getPointer(), BlockDim.getPointer(), 404 ShmemSize.getPointer(), Stream.getPointer()}); 405 406 // Emit the call to cudaLaunch 407 llvm::Value *Kernel = CGF.Builder.CreatePointerCast( 408 KernelHandles[CGF.CurFn->getName()], VoidPtrTy); 409 CallArgList LaunchKernelArgs; 410 LaunchKernelArgs.add(RValue::get(Kernel), 411 cudaLaunchKernelFD->getParamDecl(0)->getType()); 412 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 413 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 414 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 415 cudaLaunchKernelFD->getParamDecl(3)->getType()); 416 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 417 cudaLaunchKernelFD->getParamDecl(4)->getType()); 418 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 419 cudaLaunchKernelFD->getParamDecl(5)->getType()); 420 421 QualType QT = cudaLaunchKernelFD->getType(); 422 QualType CQT = QT.getCanonicalType(); 423 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 424 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); 425 426 const CGFunctionInfo &FI = 427 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 428 llvm::FunctionCallee cudaLaunchKernelFn = 429 CGM.CreateRuntimeFunction(FTy, LaunchKernelName); 430 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 431 LaunchKernelArgs); 432 CGF.EmitBranch(EndBlock); 433 434 CGF.EmitBlock(EndBlock); 435 } 436 437 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 438 FunctionArgList &Args) { 439 // Emit a call to cudaSetupArgument for each arg in Args. 440 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 441 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 442 CharUnits Offset = CharUnits::Zero(); 443 for (const VarDecl *A : Args) { 444 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); 445 Offset = Offset.alignTo(TInfo.Align); 446 llvm::Value *Args[] = { 447 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 448 VoidPtrTy), 449 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), 450 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 451 }; 452 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 453 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 454 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 455 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 456 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 457 CGF.EmitBlock(NextBlock); 458 Offset += TInfo.Width; 459 } 460 461 // Emit the call to cudaLaunch 462 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 463 llvm::Value *Arg = CGF.Builder.CreatePointerCast( 464 KernelHandles[CGF.CurFn->getName()], CharPtrTy); 465 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 466 CGF.EmitBranch(EndBlock); 467 468 CGF.EmitBlock(EndBlock); 469 } 470 471 // Replace the original variable Var with the address loaded from variable 472 // ManagedVar populated by HIP runtime. 473 static void replaceManagedVar(llvm::GlobalVariable *Var, 474 llvm::GlobalVariable *ManagedVar) { 475 SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; 476 for (auto &&VarUse : Var->uses()) { 477 WorkList.push_back({VarUse.getUser()}); 478 } 479 while (!WorkList.empty()) { 480 auto &&WorkItem = WorkList.pop_back_val(); 481 auto *U = WorkItem.back(); 482 if (isa<llvm::ConstantExpr>(U)) { 483 for (auto &&UU : U->uses()) { 484 WorkItem.push_back(UU.getUser()); 485 WorkList.push_back(WorkItem); 486 WorkItem.pop_back(); 487 } 488 continue; 489 } 490 if (auto *I = dyn_cast<llvm::Instruction>(U)) { 491 llvm::Value *OldV = Var; 492 llvm::Instruction *NewV = 493 new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false, 494 llvm::Align(Var->getAlignment()), I); 495 WorkItem.pop_back(); 496 // Replace constant expressions directly or indirectly using the managed 497 // variable with instructions. 498 for (auto &&Op : WorkItem) { 499 auto *CE = cast<llvm::ConstantExpr>(Op); 500 auto *NewInst = CE->getAsInstruction(I); 501 NewInst->replaceUsesOfWith(OldV, NewV); 502 OldV = CE; 503 NewV = NewInst; 504 } 505 I->replaceUsesOfWith(OldV, NewV); 506 } else { 507 llvm_unreachable("Invalid use of managed variable"); 508 } 509 } 510 } 511 512 /// Creates a function that sets up state on the host side for CUDA objects that 513 /// have a presence on both the host and device sides. Specifically, registers 514 /// the host side of kernel functions and device global variables with the CUDA 515 /// runtime. 516 /// \code 517 /// void __cuda_register_globals(void** GpuBinaryHandle) { 518 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 519 /// ... 520 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 521 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 522 /// ... 523 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 524 /// } 525 /// \endcode 526 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 527 // No need to register anything 528 if (EmittedKernels.empty() && DeviceVars.empty()) 529 return nullptr; 530 531 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 532 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 533 addUnderscoredPrefixToName("_register_globals"), &TheModule); 534 llvm::BasicBlock *EntryBB = 535 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 536 CGBuilderTy Builder(CGM, Context); 537 Builder.SetInsertPoint(EntryBB); 538 539 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 540 // int, uint3*, uint3*, dim3*, dim3*, int*) 541 llvm::Type *RegisterFuncParams[] = { 542 VoidPtrPtrTy, CharPtrTy, 543 CharPtrTy, CharPtrTy, 544 IntTy, VoidPtrTy, 545 VoidPtrTy, VoidPtrTy, 546 VoidPtrTy, llvm::PointerType::getUnqual(Context)}; 547 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 548 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 549 addUnderscoredPrefixToName("RegisterFunction")); 550 551 // Extract GpuBinaryHandle passed as the first argument passed to 552 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 553 // each emitted kernel. 554 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 555 for (auto &&I : EmittedKernels) { 556 llvm::Constant *KernelName = 557 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); 558 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 559 llvm::Value *Args[] = { 560 &GpuBinaryHandlePtr, 561 Builder.CreateBitCast(KernelHandles[I.Kernel->getName()], VoidPtrTy), 562 KernelName, 563 KernelName, 564 llvm::ConstantInt::get(IntTy, -1), 565 NullPtr, 566 NullPtr, 567 NullPtr, 568 NullPtr, 569 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))}; 570 Builder.CreateCall(RegisterFunc, Args); 571 } 572 573 llvm::Type *VarSizeTy = IntTy; 574 // For HIP or CUDA 9.0+, device variable size is type of `size_t`. 575 if (CGM.getLangOpts().HIP || 576 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90) 577 VarSizeTy = SizeTy; 578 579 // void __cudaRegisterVar(void **, char *, char *, const char *, 580 // int, int, int, int) 581 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 582 CharPtrTy, IntTy, VarSizeTy, 583 IntTy, IntTy}; 584 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 585 llvm::FunctionType::get(VoidTy, RegisterVarParams, false), 586 addUnderscoredPrefixToName("RegisterVar")); 587 // void __hipRegisterManagedVar(void **, char *, char *, const char *, 588 // size_t, unsigned) 589 llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 590 CharPtrTy, VarSizeTy, IntTy}; 591 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( 592 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), 593 addUnderscoredPrefixToName("RegisterManagedVar")); 594 // void __cudaRegisterSurface(void **, const struct surfaceReference *, 595 // const void **, const char *, int, int); 596 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( 597 llvm::FunctionType::get( 598 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, 599 false), 600 addUnderscoredPrefixToName("RegisterSurface")); 601 // void __cudaRegisterTexture(void **, const struct textureReference *, 602 // const void **, const char *, int, int, int) 603 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( 604 llvm::FunctionType::get( 605 VoidTy, 606 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, 607 false), 608 addUnderscoredPrefixToName("RegisterTexture")); 609 for (auto &&Info : DeviceVars) { 610 llvm::GlobalVariable *Var = Info.Var; 611 assert((!Var->isDeclaration() || Info.Flags.isManaged()) && 612 "External variables should not show up here, except HIP managed " 613 "variables"); 614 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 615 switch (Info.Flags.getKind()) { 616 case DeviceVarFlags::Variable: { 617 uint64_t VarSize = 618 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 619 if (Info.Flags.isManaged()) { 620 auto *ManagedVar = new llvm::GlobalVariable( 621 CGM.getModule(), Var->getType(), 622 /*isConstant=*/false, Var->getLinkage(), 623 /*Init=*/Var->isDeclaration() 624 ? nullptr 625 : llvm::ConstantPointerNull::get(Var->getType()), 626 /*Name=*/"", /*InsertBefore=*/nullptr, 627 llvm::GlobalVariable::NotThreadLocal); 628 ManagedVar->setDSOLocal(Var->isDSOLocal()); 629 ManagedVar->setVisibility(Var->getVisibility()); 630 ManagedVar->setExternallyInitialized(true); 631 ManagedVar->takeName(Var); 632 Var->setName(Twine(ManagedVar->getName() + ".managed")); 633 replaceManagedVar(Var, ManagedVar); 634 llvm::Value *Args[] = { 635 &GpuBinaryHandlePtr, 636 Builder.CreateBitCast(ManagedVar, VoidPtrTy), 637 Builder.CreateBitCast(Var, VoidPtrTy), 638 VarName, 639 llvm::ConstantInt::get(VarSizeTy, VarSize), 640 llvm::ConstantInt::get(IntTy, Var->getAlignment())}; 641 if (!Var->isDeclaration()) 642 Builder.CreateCall(RegisterManagedVar, Args); 643 } else { 644 llvm::Value *Args[] = { 645 &GpuBinaryHandlePtr, 646 Builder.CreateBitCast(Var, VoidPtrTy), 647 VarName, 648 VarName, 649 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), 650 llvm::ConstantInt::get(VarSizeTy, VarSize), 651 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), 652 llvm::ConstantInt::get(IntTy, 0)}; 653 Builder.CreateCall(RegisterVar, Args); 654 } 655 break; 656 } 657 case DeviceVarFlags::Surface: 658 Builder.CreateCall( 659 RegisterSurf, 660 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, 661 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), 662 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); 663 break; 664 case DeviceVarFlags::Texture: 665 Builder.CreateCall( 666 RegisterTex, 667 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, 668 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), 669 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()), 670 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); 671 break; 672 } 673 } 674 675 Builder.CreateRetVoid(); 676 return RegisterKernelsFunc; 677 } 678 679 /// Creates a global constructor function for the module: 680 /// 681 /// For CUDA: 682 /// \code 683 /// void __cuda_module_ctor() { 684 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 685 /// __cuda_register_globals(Handle); 686 /// } 687 /// \endcode 688 /// 689 /// For HIP: 690 /// \code 691 /// void __hip_module_ctor() { 692 /// if (__hip_gpubin_handle == 0) { 693 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 694 /// __hip_register_globals(__hip_gpubin_handle); 695 /// } 696 /// } 697 /// \endcode 698 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 699 bool IsHIP = CGM.getLangOpts().HIP; 700 bool IsCUDA = CGM.getLangOpts().CUDA; 701 // No need to generate ctors/dtors if there is no GPU binary. 702 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 703 if (CudaGpuBinaryFileName.empty() && !IsHIP) 704 return nullptr; 705 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 706 DeviceVars.empty()) 707 return nullptr; 708 709 // void __{cuda|hip}_register_globals(void* handle); 710 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 711 // We always need a function to pass in as callback. Create a dummy 712 // implementation if we don't need to register anything. 713 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 714 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 715 716 // void ** __{cuda|hip}RegisterFatBinary(void *); 717 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 718 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 719 addUnderscoredPrefixToName("RegisterFatBinary")); 720 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 721 llvm::StructType *FatbinWrapperTy = 722 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 723 724 // Register GPU binary with the CUDA runtime, store returned handle in a 725 // global variable and save a reference in GpuBinaryHandle to be cleaned up 726 // in destructor on exit. Then associate all known kernels with the GPU binary 727 // handle so CUDA runtime can figure out what to call on the GPU side. 728 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 729 if (!CudaGpuBinaryFileName.empty()) { 730 auto VFS = CGM.getFileSystem(); 731 auto CudaGpuBinaryOrErr = 732 VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false); 733 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 734 CGM.getDiags().Report(diag::err_cannot_open_file) 735 << CudaGpuBinaryFileName << EC.message(); 736 return nullptr; 737 } 738 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 739 } 740 741 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 742 llvm::FunctionType::get(VoidTy, false), 743 llvm::GlobalValue::InternalLinkage, 744 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 745 llvm::BasicBlock *CtorEntryBB = 746 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 747 CGBuilderTy CtorBuilder(CGM, Context); 748 749 CtorBuilder.SetInsertPoint(CtorEntryBB); 750 751 const char *FatbinConstantName; 752 const char *FatbinSectionName; 753 const char *ModuleIDSectionName; 754 StringRef ModuleIDPrefix; 755 llvm::Constant *FatBinStr; 756 unsigned FatMagic; 757 if (IsHIP) { 758 FatbinConstantName = ".hip_fatbin"; 759 FatbinSectionName = ".hipFatBinSegment"; 760 761 ModuleIDSectionName = "__hip_module_id"; 762 ModuleIDPrefix = "__hip_"; 763 764 if (CudaGpuBinary) { 765 // If fatbin is available from early finalization, create a string 766 // literal containing the fat binary loaded from the given file. 767 const unsigned HIPCodeObjectAlign = 4096; 768 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", 769 FatbinConstantName, HIPCodeObjectAlign); 770 } else { 771 // If fatbin is not available, create an external symbol 772 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 773 // to contain the fat binary but will be populated somewhere else, 774 // e.g. by lld through link script. 775 FatBinStr = new llvm::GlobalVariable( 776 CGM.getModule(), CGM.Int8Ty, 777 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 778 "__hip_fatbin", nullptr, 779 llvm::GlobalVariable::NotThreadLocal); 780 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 781 } 782 783 FatMagic = HIPFatMagic; 784 } else { 785 if (RelocatableDeviceCode) 786 FatbinConstantName = CGM.getTriple().isMacOSX() 787 ? "__NV_CUDA,__nv_relfatbin" 788 : "__nv_relfatbin"; 789 else 790 FatbinConstantName = 791 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 792 // NVIDIA's cuobjdump looks for fatbins in this section. 793 FatbinSectionName = 794 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 795 796 ModuleIDSectionName = CGM.getTriple().isMacOSX() 797 ? "__NV_CUDA,__nv_module_id" 798 : "__nv_module_id"; 799 ModuleIDPrefix = "__nv_"; 800 801 // For CUDA, create a string literal containing the fat binary loaded from 802 // the given file. 803 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", 804 FatbinConstantName, 8); 805 FatMagic = CudaFatMagic; 806 } 807 808 // Create initialized wrapper structure that points to the loaded GPU binary 809 ConstantInitBuilder Builder(CGM); 810 auto Values = Builder.beginStruct(FatbinWrapperTy); 811 // Fatbin wrapper magic. 812 Values.addInt(IntTy, FatMagic); 813 // Fatbin version. 814 Values.addInt(IntTy, 1); 815 // Data. 816 Values.add(FatBinStr); 817 // Unused in fatbin v1. 818 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 819 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 820 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 821 /*constant*/ true); 822 FatbinWrapper->setSection(FatbinSectionName); 823 824 // There is only one HIP fat binary per linked module, however there are 825 // multiple constructor functions. Make sure the fat binary is registered 826 // only once. The constructor functions are executed by the dynamic loader 827 // before the program gains control. The dynamic loader cannot execute the 828 // constructor functions concurrently since doing that would not guarantee 829 // thread safety of the loaded program. Therefore we can assume sequential 830 // execution of constructor functions here. 831 if (IsHIP) { 832 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 833 llvm::GlobalValue::LinkOnceAnyLinkage; 834 llvm::BasicBlock *IfBlock = 835 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 836 llvm::BasicBlock *ExitBlock = 837 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 838 // The name, size, and initialization pattern of this variable is part 839 // of HIP ABI. 840 GpuBinaryHandle = new llvm::GlobalVariable( 841 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 842 Linkage, 843 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 844 "__hip_gpubin_handle"); 845 if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) 846 GpuBinaryHandle->setComdat( 847 CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); 848 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 849 // Prevent the weak symbol in different shared libraries being merged. 850 if (Linkage != llvm::GlobalValue::InternalLinkage) 851 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 852 Address GpuBinaryAddr( 853 GpuBinaryHandle, VoidPtrPtrTy, 854 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 855 { 856 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 857 llvm::Constant *Zero = 858 llvm::Constant::getNullValue(HandleValue->getType()); 859 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 860 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 861 } 862 { 863 CtorBuilder.SetInsertPoint(IfBlock); 864 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 865 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 866 RegisterFatbinFunc, 867 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 868 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 869 CtorBuilder.CreateBr(ExitBlock); 870 } 871 { 872 CtorBuilder.SetInsertPoint(ExitBlock); 873 // Call __hip_register_globals(GpuBinaryHandle); 874 if (RegisterGlobalsFunc) { 875 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 876 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 877 } 878 } 879 } else if (!RelocatableDeviceCode) { 880 // Register binary with CUDA runtime. This is substantially different in 881 // default mode vs. separate compilation! 882 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 883 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 884 RegisterFatbinFunc, 885 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 886 GpuBinaryHandle = new llvm::GlobalVariable( 887 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 888 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 889 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 890 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 891 CGM.getPointerAlign()); 892 893 // Call __cuda_register_globals(GpuBinaryHandle); 894 if (RegisterGlobalsFunc) 895 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 896 897 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 898 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 899 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 900 // void __cudaRegisterFatBinaryEnd(void **); 901 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 902 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 903 "__cudaRegisterFatBinaryEnd"); 904 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 905 } 906 } else { 907 // Generate a unique module ID. 908 SmallString<64> ModuleID; 909 llvm::raw_svector_ostream OS(ModuleID); 910 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 911 llvm::Constant *ModuleIDConstant = makeConstantArray( 912 std::string(ModuleID.str()), "", ModuleIDSectionName, 32, /*AddNull=*/true); 913 914 // Create an alias for the FatbinWrapper that nvcc will look for. 915 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 916 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 917 918 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 919 // void *, void (*)(void **)) 920 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 921 RegisterLinkedBinaryName += ModuleID; 922 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 923 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 924 925 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 926 llvm::Value *Args[] = {RegisterGlobalsFunc, 927 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 928 ModuleIDConstant, 929 makeDummyFunction(getCallbackFnTy())}; 930 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 931 } 932 933 // Create destructor and register it with atexit() the way NVCC does it. Doing 934 // it during regular destructor phase worked in CUDA before 9.2 but results in 935 // double-free in 9.2. 936 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 937 // extern "C" int atexit(void (*f)(void)); 938 llvm::FunctionType *AtExitTy = 939 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 940 llvm::FunctionCallee AtExitFunc = 941 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 942 /*Local=*/true); 943 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 944 } 945 946 CtorBuilder.CreateRetVoid(); 947 return ModuleCtorFunc; 948 } 949 950 /// Creates a global destructor function that unregisters the GPU code blob 951 /// registered by constructor. 952 /// 953 /// For CUDA: 954 /// \code 955 /// void __cuda_module_dtor() { 956 /// __cudaUnregisterFatBinary(Handle); 957 /// } 958 /// \endcode 959 /// 960 /// For HIP: 961 /// \code 962 /// void __hip_module_dtor() { 963 /// if (__hip_gpubin_handle) { 964 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 965 /// __hip_gpubin_handle = 0; 966 /// } 967 /// } 968 /// \endcode 969 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 970 // No need for destructor if we don't have a handle to unregister. 971 if (!GpuBinaryHandle) 972 return nullptr; 973 974 // void __cudaUnregisterFatBinary(void ** handle); 975 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 976 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 977 addUnderscoredPrefixToName("UnregisterFatBinary")); 978 979 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 980 llvm::FunctionType::get(VoidTy, false), 981 llvm::GlobalValue::InternalLinkage, 982 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 983 984 llvm::BasicBlock *DtorEntryBB = 985 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 986 CGBuilderTy DtorBuilder(CGM, Context); 987 DtorBuilder.SetInsertPoint(DtorEntryBB); 988 989 Address GpuBinaryAddr( 990 GpuBinaryHandle, GpuBinaryHandle->getValueType(), 991 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 992 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 993 // There is only one HIP fat binary per linked module, however there are 994 // multiple destructor functions. Make sure the fat binary is unregistered 995 // only once. 996 if (CGM.getLangOpts().HIP) { 997 llvm::BasicBlock *IfBlock = 998 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 999 llvm::BasicBlock *ExitBlock = 1000 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 1001 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 1002 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 1003 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 1004 1005 DtorBuilder.SetInsertPoint(IfBlock); 1006 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 1007 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 1008 DtorBuilder.CreateBr(ExitBlock); 1009 1010 DtorBuilder.SetInsertPoint(ExitBlock); 1011 } else { 1012 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 1013 } 1014 DtorBuilder.CreateRetVoid(); 1015 return ModuleDtorFunc; 1016 } 1017 1018 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 1019 return new CGNVCUDARuntime(CGM); 1020 } 1021 1022 void CGNVCUDARuntime::internalizeDeviceSideVar( 1023 const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { 1024 // For -fno-gpu-rdc, host-side shadows of external declarations of device-side 1025 // global variables become internal definitions. These have to be internal in 1026 // order to prevent name conflicts with global host variables with the same 1027 // name in a different TUs. 1028 // 1029 // For -fgpu-rdc, the shadow variables should not be internalized because 1030 // they may be accessed by different TU. 1031 if (CGM.getLangOpts().GPURelocatableDeviceCode) 1032 return; 1033 1034 // __shared__ variables are odd. Shadows do get created, but 1035 // they are not registered with the CUDA runtime, so they 1036 // can't really be used to access their device-side 1037 // counterparts. It's not clear yet whether it's nvcc's bug or 1038 // a feature, but we've got to do the same for compatibility. 1039 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || 1040 D->hasAttr<CUDASharedAttr>() || 1041 D->getType()->isCUDADeviceBuiltinSurfaceType() || 1042 D->getType()->isCUDADeviceBuiltinTextureType()) { 1043 Linkage = llvm::GlobalValue::InternalLinkage; 1044 } 1045 } 1046 1047 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, 1048 llvm::GlobalVariable &GV) { 1049 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { 1050 // Shadow variables and their properties must be registered with CUDA 1051 // runtime. Skip Extern global variables, which will be registered in 1052 // the TU where they are defined. 1053 // 1054 // Don't register a C++17 inline variable. The local symbol can be 1055 // discarded and referencing a discarded local symbol from outside the 1056 // comdat (__cuda_register_globals) is disallowed by the ELF spec. 1057 // 1058 // HIP managed variables need to be always recorded in device and host 1059 // compilations for transformation. 1060 // 1061 // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are 1062 // added to llvm.compiler-used, therefore they are safe to be registered. 1063 if ((!D->hasExternalStorage() && !D->isInline()) || 1064 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) || 1065 D->hasAttr<HIPManagedAttr>()) { 1066 registerDeviceVar(D, GV, !D->hasDefinition(), 1067 D->hasAttr<CUDAConstantAttr>()); 1068 } 1069 } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || 1070 D->getType()->isCUDADeviceBuiltinTextureType()) { 1071 // Builtin surfaces and textures and their template arguments are 1072 // also registered with CUDA runtime. 1073 const auto *TD = cast<ClassTemplateSpecializationDecl>( 1074 D->getType()->castAs<RecordType>()->getDecl()); 1075 const TemplateArgumentList &Args = TD->getTemplateArgs(); 1076 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { 1077 assert(Args.size() == 2 && 1078 "Unexpected number of template arguments of CUDA device " 1079 "builtin surface type."); 1080 auto SurfType = Args[1].getAsIntegral(); 1081 if (!D->hasExternalStorage()) 1082 registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue()); 1083 } else { 1084 assert(Args.size() == 3 && 1085 "Unexpected number of template arguments of CUDA device " 1086 "builtin texture type."); 1087 auto TexType = Args[1].getAsIntegral(); 1088 auto Normalized = Args[2].getAsIntegral(); 1089 if (!D->hasExternalStorage()) 1090 registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(), 1091 Normalized.getZExtValue()); 1092 } 1093 } 1094 } 1095 1096 // Transform managed variables to pointers to managed variables in device code. 1097 // Each use of the original managed variable is replaced by a load from the 1098 // transformed managed variable. The transformed managed variable contains 1099 // the address of managed memory which will be allocated by the runtime. 1100 void CGNVCUDARuntime::transformManagedVars() { 1101 for (auto &&Info : DeviceVars) { 1102 llvm::GlobalVariable *Var = Info.Var; 1103 if (Info.Flags.getKind() == DeviceVarFlags::Variable && 1104 Info.Flags.isManaged()) { 1105 auto *ManagedVar = new llvm::GlobalVariable( 1106 CGM.getModule(), Var->getType(), 1107 /*isConstant=*/false, Var->getLinkage(), 1108 /*Init=*/Var->isDeclaration() 1109 ? nullptr 1110 : llvm::ConstantPointerNull::get(Var->getType()), 1111 /*Name=*/"", /*InsertBefore=*/nullptr, 1112 llvm::GlobalVariable::NotThreadLocal, 1113 CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); 1114 ManagedVar->setDSOLocal(Var->isDSOLocal()); 1115 ManagedVar->setVisibility(Var->getVisibility()); 1116 ManagedVar->setExternallyInitialized(true); 1117 replaceManagedVar(Var, ManagedVar); 1118 ManagedVar->takeName(Var); 1119 Var->setName(Twine(ManagedVar->getName()) + ".managed"); 1120 // Keep managed variables even if they are not used in device code since 1121 // they need to be allocated by the runtime. 1122 if (!Var->isDeclaration()) { 1123 assert(!ManagedVar->isDeclaration()); 1124 CGM.addCompilerUsedGlobal(Var); 1125 CGM.addCompilerUsedGlobal(ManagedVar); 1126 } 1127 } 1128 } 1129 } 1130 1131 // Creates offloading entries for all the kernels and globals that must be 1132 // registered. The linker will provide a pointer to this section so we can 1133 // register the symbols with the linked device image. 1134 void CGNVCUDARuntime::createOffloadingEntries() { 1135 llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule()); 1136 OMPBuilder.initialize(); 1137 1138 StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" 1139 : "cuda_offloading_entries"; 1140 for (KernelInfo &I : EmittedKernels) 1141 OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel->getName()], 1142 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 1143 DeviceVarFlags::OffloadGlobalEntry, Section); 1144 1145 for (VarInfo &I : DeviceVars) { 1146 uint64_t VarSize = 1147 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); 1148 if (I.Flags.getKind() == DeviceVarFlags::Variable) { 1149 OMPBuilder.emitOffloadingEntry( 1150 I.Var, getDeviceSideName(I.D), VarSize, 1151 I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry 1152 : DeviceVarFlags::OffloadGlobalEntry, 1153 Section); 1154 } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { 1155 OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, 1156 DeviceVarFlags::OffloadGlobalSurfaceEntry, 1157 Section); 1158 } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { 1159 OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, 1160 DeviceVarFlags::OffloadGlobalTextureEntry, 1161 Section); 1162 } 1163 } 1164 } 1165 1166 // Returns module constructor to be added. 1167 llvm::Function *CGNVCUDARuntime::finalizeModule() { 1168 if (CGM.getLangOpts().CUDAIsDevice) { 1169 transformManagedVars(); 1170 1171 // Mark ODR-used device variables as compiler used to prevent it from being 1172 // eliminated by optimization. This is necessary for device variables 1173 // ODR-used by host functions. Sema correctly marks them as ODR-used no 1174 // matter whether they are ODR-used by device or host functions. 1175 // 1176 // We do not need to do this if the variable has used attribute since it 1177 // has already been added. 1178 // 1179 // Static device variables have been externalized at this point, therefore 1180 // variables with LLVM private or internal linkage need not be added. 1181 for (auto &&Info : DeviceVars) { 1182 auto Kind = Info.Flags.getKind(); 1183 if (!Info.Var->isDeclaration() && 1184 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) && 1185 (Kind == DeviceVarFlags::Variable || 1186 Kind == DeviceVarFlags::Surface || 1187 Kind == DeviceVarFlags::Texture) && 1188 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) { 1189 CGM.addCompilerUsedGlobal(Info.Var); 1190 } 1191 } 1192 return nullptr; 1193 } 1194 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) 1195 createOffloadingEntries(); 1196 else 1197 return makeModuleCtorFunction(); 1198 1199 return nullptr; 1200 } 1201 1202 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, 1203 GlobalDecl GD) { 1204 auto Loc = KernelHandles.find(F->getName()); 1205 if (Loc != KernelHandles.end()) { 1206 auto OldHandle = Loc->second; 1207 if (KernelStubs[OldHandle] == F) 1208 return OldHandle; 1209 1210 // We've found the function name, but F itself has changed, so we need to 1211 // update the references. 1212 if (CGM.getLangOpts().HIP) { 1213 // For HIP compilation the handle itself does not change, so we only need 1214 // to update the Stub value. 1215 KernelStubs[OldHandle] = F; 1216 return OldHandle; 1217 } 1218 // For non-HIP compilation, erase the old Stub and fall-through to creating 1219 // new entries. 1220 KernelStubs.erase(OldHandle); 1221 } 1222 1223 if (!CGM.getLangOpts().HIP) { 1224 KernelHandles[F->getName()] = F; 1225 KernelStubs[F] = F; 1226 return F; 1227 } 1228 1229 auto *Var = new llvm::GlobalVariable( 1230 TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), 1231 /*Initializer=*/nullptr, 1232 CGM.getMangledName( 1233 GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel))); 1234 Var->setAlignment(CGM.getPointerAlign().getAsAlign()); 1235 Var->setDSOLocal(F->isDSOLocal()); 1236 Var->setVisibility(F->getVisibility()); 1237 CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var); 1238 KernelHandles[F->getName()] = Var; 1239 KernelStubs[Var] = F; 1240 return Var; 1241 } 1242