1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 struct KernelInfo { 46 llvm::Function *Kernel; 47 const Decl *D; 48 }; 49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 50 struct VarInfo { 51 llvm::GlobalVariable *Var; 52 const VarDecl *D; 53 unsigned Flag; 54 }; 55 llvm::SmallVector<VarInfo, 16> DeviceVars; 56 /// Keeps track of variable containing handle of GPU binary. Populated by 57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 58 /// ModuleDtorFunction() 59 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 60 /// Whether we generate relocatable device code. 61 bool RelocatableDeviceCode; 62 /// Mangle context for device. 63 std::unique_ptr<MangleContext> DeviceMC; 64 65 llvm::FunctionCallee getSetupArgumentFn() const; 66 llvm::FunctionCallee getLaunchFn() const; 67 68 llvm::FunctionType *getRegisterGlobalsFnTy() const; 69 llvm::FunctionType *getCallbackFnTy() const; 70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 71 std::string addPrefixToName(StringRef FuncName) const; 72 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 73 74 /// Creates a function to register all kernel stubs generated in this module. 75 llvm::Function *makeRegisterGlobalsFn(); 76 77 /// Helper function that generates a constant string and returns a pointer to 78 /// the start of the string. The result of this function can be used anywhere 79 /// where the C code specifies const char*. 80 llvm::Constant *makeConstantString(const std::string &Str, 81 const std::string &Name = "", 82 const std::string &SectionName = "", 83 unsigned Alignment = 0) { 84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 85 llvm::ConstantInt::get(SizeTy, 0)}; 86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 87 llvm::GlobalVariable *GV = 88 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 89 if (!SectionName.empty()) { 90 GV->setSection(SectionName); 91 // Mark the address as used which make sure that this section isn't 92 // merged and we will really have it in the object file. 93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 94 } 95 if (Alignment) 96 GV->setAlignment(llvm::Align(Alignment)); 97 98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 99 ConstStr.getPointer(), Zeros); 100 } 101 102 /// Helper function that generates an empty dummy function returning void. 103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 104 assert(FnTy->getReturnType()->isVoidTy() && 105 "Can only generate dummy functions returning void!"); 106 llvm::Function *DummyFunc = llvm::Function::Create( 107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 108 109 llvm::BasicBlock *DummyBlock = 110 llvm::BasicBlock::Create(Context, "", DummyFunc); 111 CGBuilderTy FuncBuilder(CGM, Context); 112 FuncBuilder.SetInsertPoint(DummyBlock); 113 FuncBuilder.CreateRetVoid(); 114 115 return DummyFunc; 116 } 117 118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 120 std::string getDeviceSideName(const Decl *ND); 121 122 public: 123 CGNVCUDARuntime(CodeGenModule &CGM); 124 125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 127 unsigned Flags) override { 128 DeviceVars.push_back({&Var, VD, Flags}); 129 } 130 131 /// Creates module constructor function 132 llvm::Function *makeModuleCtorFunction() override; 133 /// Creates module destructor function 134 llvm::Function *makeModuleDtorFunction() override; 135 /// Construct and return the stub name of a kernel. 136 std::string getDeviceStubName(llvm::StringRef Name) const override; 137 }; 138 139 } 140 141 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 142 if (CGM.getLangOpts().HIP) 143 return ((Twine("hip") + Twine(FuncName)).str()); 144 return ((Twine("cuda") + Twine(FuncName)).str()); 145 } 146 std::string 147 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 148 if (CGM.getLangOpts().HIP) 149 return ((Twine("__hip") + Twine(FuncName)).str()); 150 return ((Twine("__cuda") + Twine(FuncName)).str()); 151 } 152 153 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 154 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 155 TheModule(CGM.getModule()), 156 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 157 DeviceMC(CGM.getContext().createMangleContext( 158 CGM.getContext().getAuxTargetInfo())) { 159 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 160 ASTContext &Ctx = CGM.getContext(); 161 162 IntTy = CGM.IntTy; 163 SizeTy = CGM.SizeTy; 164 VoidTy = CGM.VoidTy; 165 166 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 167 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 168 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 169 } 170 171 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 172 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 173 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 174 return CGM.CreateRuntimeFunction( 175 llvm::FunctionType::get(IntTy, Params, false), 176 addPrefixToName("SetupArgument")); 177 } 178 179 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 180 if (CGM.getLangOpts().HIP) { 181 // hipError_t hipLaunchByPtr(char *); 182 return CGM.CreateRuntimeFunction( 183 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 184 } else { 185 // cudaError_t cudaLaunch(char *); 186 return CGM.CreateRuntimeFunction( 187 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 188 } 189 } 190 191 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 192 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 193 } 194 195 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 196 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 197 } 198 199 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 200 auto CallbackFnTy = getCallbackFnTy(); 201 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 202 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 203 VoidPtrTy, CallbackFnTy->getPointerTo()}; 204 return llvm::FunctionType::get(VoidTy, Params, false); 205 } 206 207 std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { 208 auto *ND = cast<const NamedDecl>(D); 209 std::string DeviceSideName; 210 if (DeviceMC->shouldMangleDeclName(ND)) { 211 SmallString<256> Buffer; 212 llvm::raw_svector_ostream Out(Buffer); 213 DeviceMC->mangleName(ND, Out); 214 DeviceSideName = Out.str(); 215 } else 216 DeviceSideName = ND->getIdentifier()->getName(); 217 return DeviceSideName; 218 } 219 220 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 221 FunctionArgList &Args) { 222 // Ensure either we have different ABIs between host and device compilations, 223 // says host compilation following MSVC ABI but device compilation follows 224 // Itanium C++ ABI or, if they follow the same ABI, kernel names after 225 // mangling should be the same after name stubbing. The later checking is 226 // very important as the device kernel name being mangled in host-compilation 227 // is used to resolve the device binaries to be executed. Inconsistent naming 228 // result in undefined behavior. Even though we cannot check that naming 229 // directly between host- and device-compilations, the host- and 230 // device-mangling in host compilation could help catching certain ones. 231 assert((CGF.CGM.getContext().getAuxTargetInfo() && 232 (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() != 233 CGF.CGM.getContext().getTargetInfo().getCXXABI())) || 234 getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) == 235 CGF.CurFn->getName()); 236 237 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 238 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 239 CudaFeature::CUDA_USES_NEW_LAUNCH) || 240 CGF.getLangOpts().HIPUseNewLaunchAPI) 241 emitDeviceStubBodyNew(CGF, Args); 242 else 243 emitDeviceStubBodyLegacy(CGF, Args); 244 } 245 246 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 247 // array and kernels are launched using cudaLaunchKernel(). 248 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 249 FunctionArgList &Args) { 250 // Build the shadow stack entry at the very start of the function. 251 252 // Calculate amount of space we will need for all arguments. If we have no 253 // args, allocate a single pointer so we still have a valid pointer to the 254 // argument array that we can pass to runtime, even if it will be unused. 255 Address KernelArgs = CGF.CreateTempAlloca( 256 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 257 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 258 // Store pointers to the arguments in a locally allocated launch_args. 259 for (unsigned i = 0; i < Args.size(); ++i) { 260 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 261 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 262 CGF.Builder.CreateDefaultAlignedStore( 263 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 264 } 265 266 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 267 268 // Lookup cudaLaunchKernel/hipLaunchKernel function. 269 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 270 // void **args, size_t sharedMem, 271 // cudaStream_t stream); 272 // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 273 // void **args, size_t sharedMem, 274 // hipStream_t stream); 275 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 276 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 277 auto LaunchKernelName = addPrefixToName("LaunchKernel"); 278 IdentifierInfo &cudaLaunchKernelII = 279 CGM.getContext().Idents.get(LaunchKernelName); 280 FunctionDecl *cudaLaunchKernelFD = nullptr; 281 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 282 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 283 cudaLaunchKernelFD = FD; 284 } 285 286 if (cudaLaunchKernelFD == nullptr) { 287 CGM.Error(CGF.CurFuncDecl->getLocation(), 288 "Can't find declaration for " + LaunchKernelName); 289 return; 290 } 291 // Create temporary dim3 grid_dim, block_dim. 292 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 293 QualType Dim3Ty = GridDimParam->getType(); 294 Address GridDim = 295 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 296 Address BlockDim = 297 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 298 Address ShmemSize = 299 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 300 Address Stream = 301 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 302 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 303 llvm::FunctionType::get(IntTy, 304 {/*gridDim=*/GridDim.getType(), 305 /*blockDim=*/BlockDim.getType(), 306 /*ShmemSize=*/ShmemSize.getType(), 307 /*Stream=*/Stream.getType()}, 308 /*isVarArg=*/false), 309 addUnderscoredPrefixToName("PopCallConfiguration")); 310 311 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 312 {GridDim.getPointer(), BlockDim.getPointer(), 313 ShmemSize.getPointer(), Stream.getPointer()}); 314 315 // Emit the call to cudaLaunch 316 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 317 CallArgList LaunchKernelArgs; 318 LaunchKernelArgs.add(RValue::get(Kernel), 319 cudaLaunchKernelFD->getParamDecl(0)->getType()); 320 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 321 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 322 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 323 cudaLaunchKernelFD->getParamDecl(3)->getType()); 324 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 325 cudaLaunchKernelFD->getParamDecl(4)->getType()); 326 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 327 cudaLaunchKernelFD->getParamDecl(5)->getType()); 328 329 QualType QT = cudaLaunchKernelFD->getType(); 330 QualType CQT = QT.getCanonicalType(); 331 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 332 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 333 334 const CGFunctionInfo &FI = 335 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 336 llvm::FunctionCallee cudaLaunchKernelFn = 337 CGM.CreateRuntimeFunction(FTy, LaunchKernelName); 338 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 339 LaunchKernelArgs); 340 CGF.EmitBranch(EndBlock); 341 342 CGF.EmitBlock(EndBlock); 343 } 344 345 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 346 FunctionArgList &Args) { 347 // Emit a call to cudaSetupArgument for each arg in Args. 348 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 349 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 350 CharUnits Offset = CharUnits::Zero(); 351 for (const VarDecl *A : Args) { 352 CharUnits TyWidth, TyAlign; 353 std::tie(TyWidth, TyAlign) = 354 CGM.getContext().getTypeInfoInChars(A->getType()); 355 Offset = Offset.alignTo(TyAlign); 356 llvm::Value *Args[] = { 357 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 358 VoidPtrTy), 359 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 360 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 361 }; 362 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 363 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 364 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 365 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 366 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 367 CGF.EmitBlock(NextBlock); 368 Offset += TyWidth; 369 } 370 371 // Emit the call to cudaLaunch 372 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 373 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 374 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 375 CGF.EmitBranch(EndBlock); 376 377 CGF.EmitBlock(EndBlock); 378 } 379 380 /// Creates a function that sets up state on the host side for CUDA objects that 381 /// have a presence on both the host and device sides. Specifically, registers 382 /// the host side of kernel functions and device global variables with the CUDA 383 /// runtime. 384 /// \code 385 /// void __cuda_register_globals(void** GpuBinaryHandle) { 386 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 387 /// ... 388 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 389 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 390 /// ... 391 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 392 /// } 393 /// \endcode 394 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 395 // No need to register anything 396 if (EmittedKernels.empty() && DeviceVars.empty()) 397 return nullptr; 398 399 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 400 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 401 addUnderscoredPrefixToName("_register_globals"), &TheModule); 402 llvm::BasicBlock *EntryBB = 403 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 404 CGBuilderTy Builder(CGM, Context); 405 Builder.SetInsertPoint(EntryBB); 406 407 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 408 // int, uint3*, uint3*, dim3*, dim3*, int*) 409 llvm::Type *RegisterFuncParams[] = { 410 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 411 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 412 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 413 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 414 addUnderscoredPrefixToName("RegisterFunction")); 415 416 // Extract GpuBinaryHandle passed as the first argument passed to 417 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 418 // each emitted kernel. 419 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 420 for (auto &&I : EmittedKernels) { 421 llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); 422 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 423 llvm::Value *Args[] = { 424 &GpuBinaryHandlePtr, 425 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 426 KernelName, 427 KernelName, 428 llvm::ConstantInt::get(IntTy, -1), 429 NullPtr, 430 NullPtr, 431 NullPtr, 432 NullPtr, 433 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 434 Builder.CreateCall(RegisterFunc, Args); 435 } 436 437 // void __cudaRegisterVar(void **, char *, char *, const char *, 438 // int, int, int, int) 439 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 440 CharPtrTy, IntTy, IntTy, 441 IntTy, IntTy}; 442 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 443 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 444 addUnderscoredPrefixToName("RegisterVar")); 445 for (auto &&Info : DeviceVars) { 446 llvm::GlobalVariable *Var = Info.Var; 447 unsigned Flags = Info.Flag; 448 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 449 uint64_t VarSize = 450 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 451 llvm::Value *Args[] = { 452 &GpuBinaryHandlePtr, 453 Builder.CreateBitCast(Var, VoidPtrTy), 454 VarName, 455 VarName, 456 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 457 llvm::ConstantInt::get(IntTy, VarSize), 458 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 459 llvm::ConstantInt::get(IntTy, 0)}; 460 Builder.CreateCall(RegisterVar, Args); 461 } 462 463 Builder.CreateRetVoid(); 464 return RegisterKernelsFunc; 465 } 466 467 /// Creates a global constructor function for the module: 468 /// 469 /// For CUDA: 470 /// \code 471 /// void __cuda_module_ctor(void*) { 472 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 473 /// __cuda_register_globals(Handle); 474 /// } 475 /// \endcode 476 /// 477 /// For HIP: 478 /// \code 479 /// void __hip_module_ctor(void*) { 480 /// if (__hip_gpubin_handle == 0) { 481 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 482 /// __hip_register_globals(__hip_gpubin_handle); 483 /// } 484 /// } 485 /// \endcode 486 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 487 bool IsHIP = CGM.getLangOpts().HIP; 488 bool IsCUDA = CGM.getLangOpts().CUDA; 489 // No need to generate ctors/dtors if there is no GPU binary. 490 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 491 if (CudaGpuBinaryFileName.empty() && !IsHIP) 492 return nullptr; 493 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 494 DeviceVars.empty()) 495 return nullptr; 496 497 // void __{cuda|hip}_register_globals(void* handle); 498 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 499 // We always need a function to pass in as callback. Create a dummy 500 // implementation if we don't need to register anything. 501 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 502 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 503 504 // void ** __{cuda|hip}RegisterFatBinary(void *); 505 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 506 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 507 addUnderscoredPrefixToName("RegisterFatBinary")); 508 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 509 llvm::StructType *FatbinWrapperTy = 510 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 511 512 // Register GPU binary with the CUDA runtime, store returned handle in a 513 // global variable and save a reference in GpuBinaryHandle to be cleaned up 514 // in destructor on exit. Then associate all known kernels with the GPU binary 515 // handle so CUDA runtime can figure out what to call on the GPU side. 516 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 517 if (!CudaGpuBinaryFileName.empty()) { 518 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 519 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 520 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 521 CGM.getDiags().Report(diag::err_cannot_open_file) 522 << CudaGpuBinaryFileName << EC.message(); 523 return nullptr; 524 } 525 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 526 } 527 528 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 529 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 530 llvm::GlobalValue::InternalLinkage, 531 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 532 llvm::BasicBlock *CtorEntryBB = 533 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 534 CGBuilderTy CtorBuilder(CGM, Context); 535 536 CtorBuilder.SetInsertPoint(CtorEntryBB); 537 538 const char *FatbinConstantName; 539 const char *FatbinSectionName; 540 const char *ModuleIDSectionName; 541 StringRef ModuleIDPrefix; 542 llvm::Constant *FatBinStr; 543 unsigned FatMagic; 544 if (IsHIP) { 545 FatbinConstantName = ".hip_fatbin"; 546 FatbinSectionName = ".hipFatBinSegment"; 547 548 ModuleIDSectionName = "__hip_module_id"; 549 ModuleIDPrefix = "__hip_"; 550 551 if (CudaGpuBinary) { 552 // If fatbin is available from early finalization, create a string 553 // literal containing the fat binary loaded from the given file. 554 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 555 FatbinConstantName, 8); 556 } else { 557 // If fatbin is not available, create an external symbol 558 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 559 // to contain the fat binary but will be populated somewhere else, 560 // e.g. by lld through link script. 561 FatBinStr = new llvm::GlobalVariable( 562 CGM.getModule(), CGM.Int8Ty, 563 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 564 "__hip_fatbin", nullptr, 565 llvm::GlobalVariable::NotThreadLocal); 566 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 567 } 568 569 FatMagic = HIPFatMagic; 570 } else { 571 if (RelocatableDeviceCode) 572 FatbinConstantName = CGM.getTriple().isMacOSX() 573 ? "__NV_CUDA,__nv_relfatbin" 574 : "__nv_relfatbin"; 575 else 576 FatbinConstantName = 577 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 578 // NVIDIA's cuobjdump looks for fatbins in this section. 579 FatbinSectionName = 580 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 581 582 ModuleIDSectionName = CGM.getTriple().isMacOSX() 583 ? "__NV_CUDA,__nv_module_id" 584 : "__nv_module_id"; 585 ModuleIDPrefix = "__nv_"; 586 587 // For CUDA, create a string literal containing the fat binary loaded from 588 // the given file. 589 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 590 FatbinConstantName, 8); 591 FatMagic = CudaFatMagic; 592 } 593 594 // Create initialized wrapper structure that points to the loaded GPU binary 595 ConstantInitBuilder Builder(CGM); 596 auto Values = Builder.beginStruct(FatbinWrapperTy); 597 // Fatbin wrapper magic. 598 Values.addInt(IntTy, FatMagic); 599 // Fatbin version. 600 Values.addInt(IntTy, 1); 601 // Data. 602 Values.add(FatBinStr); 603 // Unused in fatbin v1. 604 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 605 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 606 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 607 /*constant*/ true); 608 FatbinWrapper->setSection(FatbinSectionName); 609 610 // There is only one HIP fat binary per linked module, however there are 611 // multiple constructor functions. Make sure the fat binary is registered 612 // only once. The constructor functions are executed by the dynamic loader 613 // before the program gains control. The dynamic loader cannot execute the 614 // constructor functions concurrently since doing that would not guarantee 615 // thread safety of the loaded program. Therefore we can assume sequential 616 // execution of constructor functions here. 617 if (IsHIP) { 618 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 619 llvm::GlobalValue::LinkOnceAnyLinkage; 620 llvm::BasicBlock *IfBlock = 621 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 622 llvm::BasicBlock *ExitBlock = 623 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 624 // The name, size, and initialization pattern of this variable is part 625 // of HIP ABI. 626 GpuBinaryHandle = new llvm::GlobalVariable( 627 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 628 Linkage, 629 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 630 "__hip_gpubin_handle"); 631 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 632 // Prevent the weak symbol in different shared libraries being merged. 633 if (Linkage != llvm::GlobalValue::InternalLinkage) 634 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 635 Address GpuBinaryAddr( 636 GpuBinaryHandle, 637 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 638 { 639 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 640 llvm::Constant *Zero = 641 llvm::Constant::getNullValue(HandleValue->getType()); 642 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 643 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 644 } 645 { 646 CtorBuilder.SetInsertPoint(IfBlock); 647 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 648 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 649 RegisterFatbinFunc, 650 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 651 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 652 CtorBuilder.CreateBr(ExitBlock); 653 } 654 { 655 CtorBuilder.SetInsertPoint(ExitBlock); 656 // Call __hip_register_globals(GpuBinaryHandle); 657 if (RegisterGlobalsFunc) { 658 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 659 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 660 } 661 } 662 } else if (!RelocatableDeviceCode) { 663 // Register binary with CUDA runtime. This is substantially different in 664 // default mode vs. separate compilation! 665 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 666 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 667 RegisterFatbinFunc, 668 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 669 GpuBinaryHandle = new llvm::GlobalVariable( 670 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 671 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 672 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 673 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 674 CGM.getPointerAlign()); 675 676 // Call __cuda_register_globals(GpuBinaryHandle); 677 if (RegisterGlobalsFunc) 678 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 679 680 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 681 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 682 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 683 // void __cudaRegisterFatBinaryEnd(void **); 684 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 685 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 686 "__cudaRegisterFatBinaryEnd"); 687 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 688 } 689 } else { 690 // Generate a unique module ID. 691 SmallString<64> ModuleID; 692 llvm::raw_svector_ostream OS(ModuleID); 693 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 694 llvm::Constant *ModuleIDConstant = 695 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 696 697 // Create an alias for the FatbinWrapper that nvcc will look for. 698 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 699 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 700 701 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 702 // void *, void (*)(void **)) 703 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 704 RegisterLinkedBinaryName += ModuleID; 705 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 706 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 707 708 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 709 llvm::Value *Args[] = {RegisterGlobalsFunc, 710 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 711 ModuleIDConstant, 712 makeDummyFunction(getCallbackFnTy())}; 713 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 714 } 715 716 // Create destructor and register it with atexit() the way NVCC does it. Doing 717 // it during regular destructor phase worked in CUDA before 9.2 but results in 718 // double-free in 9.2. 719 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 720 // extern "C" int atexit(void (*f)(void)); 721 llvm::FunctionType *AtExitTy = 722 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 723 llvm::FunctionCallee AtExitFunc = 724 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 725 /*Local=*/true); 726 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 727 } 728 729 CtorBuilder.CreateRetVoid(); 730 return ModuleCtorFunc; 731 } 732 733 /// Creates a global destructor function that unregisters the GPU code blob 734 /// registered by constructor. 735 /// 736 /// For CUDA: 737 /// \code 738 /// void __cuda_module_dtor(void*) { 739 /// __cudaUnregisterFatBinary(Handle); 740 /// } 741 /// \endcode 742 /// 743 /// For HIP: 744 /// \code 745 /// void __hip_module_dtor(void*) { 746 /// if (__hip_gpubin_handle) { 747 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 748 /// __hip_gpubin_handle = 0; 749 /// } 750 /// } 751 /// \endcode 752 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 753 // No need for destructor if we don't have a handle to unregister. 754 if (!GpuBinaryHandle) 755 return nullptr; 756 757 // void __cudaUnregisterFatBinary(void ** handle); 758 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 759 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 760 addUnderscoredPrefixToName("UnregisterFatBinary")); 761 762 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 763 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 764 llvm::GlobalValue::InternalLinkage, 765 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 766 767 llvm::BasicBlock *DtorEntryBB = 768 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 769 CGBuilderTy DtorBuilder(CGM, Context); 770 DtorBuilder.SetInsertPoint(DtorEntryBB); 771 772 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 773 GpuBinaryHandle->getAlignment())); 774 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 775 // There is only one HIP fat binary per linked module, however there are 776 // multiple destructor functions. Make sure the fat binary is unregistered 777 // only once. 778 if (CGM.getLangOpts().HIP) { 779 llvm::BasicBlock *IfBlock = 780 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 781 llvm::BasicBlock *ExitBlock = 782 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 783 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 784 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 785 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 786 787 DtorBuilder.SetInsertPoint(IfBlock); 788 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 789 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 790 DtorBuilder.CreateBr(ExitBlock); 791 792 DtorBuilder.SetInsertPoint(ExitBlock); 793 } else { 794 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 795 } 796 DtorBuilder.CreateRetVoid(); 797 return ModuleDtorFunc; 798 } 799 800 std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const { 801 if (!CGM.getLangOpts().HIP) 802 return Name; 803 return (Name + ".stub").str(); 804 } 805 806 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 807 return new CGNVCUDARuntime(CGM); 808 } 809