1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 struct KernelInfo { 46 llvm::Function *Kernel; 47 const Decl *D; 48 }; 49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 50 struct VarInfo { 51 llvm::GlobalVariable *Var; 52 const VarDecl *D; 53 unsigned Flag; 54 }; 55 llvm::SmallVector<VarInfo, 16> DeviceVars; 56 /// Keeps track of variable containing handle of GPU binary. Populated by 57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 58 /// ModuleDtorFunction() 59 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 60 /// Whether we generate relocatable device code. 61 bool RelocatableDeviceCode; 62 /// Mangle context for device. 63 std::unique_ptr<MangleContext> DeviceMC; 64 65 llvm::FunctionCallee getSetupArgumentFn() const; 66 llvm::FunctionCallee getLaunchFn() const; 67 68 llvm::FunctionType *getRegisterGlobalsFnTy() const; 69 llvm::FunctionType *getCallbackFnTy() const; 70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 71 std::string addPrefixToName(StringRef FuncName) const; 72 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 73 74 /// Creates a function to register all kernel stubs generated in this module. 75 llvm::Function *makeRegisterGlobalsFn(); 76 77 /// Helper function that generates a constant string and returns a pointer to 78 /// the start of the string. The result of this function can be used anywhere 79 /// where the C code specifies const char*. 80 llvm::Constant *makeConstantString(const std::string &Str, 81 const std::string &Name = "", 82 const std::string &SectionName = "", 83 unsigned Alignment = 0) { 84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 85 llvm::ConstantInt::get(SizeTy, 0)}; 86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 87 llvm::GlobalVariable *GV = 88 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 89 if (!SectionName.empty()) { 90 GV->setSection(SectionName); 91 // Mark the address as used which make sure that this section isn't 92 // merged and we will really have it in the object file. 93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 94 } 95 if (Alignment) 96 GV->setAlignment(Alignment); 97 98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 99 ConstStr.getPointer(), Zeros); 100 } 101 102 /// Helper function that generates an empty dummy function returning void. 103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 104 assert(FnTy->getReturnType()->isVoidTy() && 105 "Can only generate dummy functions returning void!"); 106 llvm::Function *DummyFunc = llvm::Function::Create( 107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 108 109 llvm::BasicBlock *DummyBlock = 110 llvm::BasicBlock::Create(Context, "", DummyFunc); 111 CGBuilderTy FuncBuilder(CGM, Context); 112 FuncBuilder.SetInsertPoint(DummyBlock); 113 FuncBuilder.CreateRetVoid(); 114 115 return DummyFunc; 116 } 117 118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 120 std::string getDeviceSideName(const Decl *ND); 121 122 public: 123 CGNVCUDARuntime(CodeGenModule &CGM); 124 125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 127 unsigned Flags) override { 128 DeviceVars.push_back({&Var, VD, Flags}); 129 } 130 131 /// Creates module constructor function 132 llvm::Function *makeModuleCtorFunction() override; 133 /// Creates module destructor function 134 llvm::Function *makeModuleDtorFunction() override; 135 /// Construct and return the stub name of a kernel. 136 std::string getDeviceStubName(llvm::StringRef Name) const override; 137 }; 138 139 } 140 141 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 142 if (CGM.getLangOpts().HIP) 143 return ((Twine("hip") + Twine(FuncName)).str()); 144 return ((Twine("cuda") + Twine(FuncName)).str()); 145 } 146 std::string 147 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 148 if (CGM.getLangOpts().HIP) 149 return ((Twine("__hip") + Twine(FuncName)).str()); 150 return ((Twine("__cuda") + Twine(FuncName)).str()); 151 } 152 153 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 154 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 155 TheModule(CGM.getModule()), 156 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 157 DeviceMC(CGM.getContext().createMangleContext( 158 CGM.getContext().getAuxTargetInfo())) { 159 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 160 ASTContext &Ctx = CGM.getContext(); 161 162 IntTy = CGM.IntTy; 163 SizeTy = CGM.SizeTy; 164 VoidTy = CGM.VoidTy; 165 166 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 167 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 168 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 169 } 170 171 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 172 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 173 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 174 return CGM.CreateRuntimeFunction( 175 llvm::FunctionType::get(IntTy, Params, false), 176 addPrefixToName("SetupArgument")); 177 } 178 179 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 180 if (CGM.getLangOpts().HIP) { 181 // hipError_t hipLaunchByPtr(char *); 182 return CGM.CreateRuntimeFunction( 183 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 184 } else { 185 // cudaError_t cudaLaunch(char *); 186 return CGM.CreateRuntimeFunction( 187 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 188 } 189 } 190 191 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 192 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 193 } 194 195 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 196 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 197 } 198 199 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 200 auto CallbackFnTy = getCallbackFnTy(); 201 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 202 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 203 VoidPtrTy, CallbackFnTy->getPointerTo()}; 204 return llvm::FunctionType::get(VoidTy, Params, false); 205 } 206 207 std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { 208 auto *ND = cast<const NamedDecl>(D); 209 std::string DeviceSideName; 210 if (DeviceMC->shouldMangleDeclName(ND)) { 211 SmallString<256> Buffer; 212 llvm::raw_svector_ostream Out(Buffer); 213 DeviceMC->mangleName(ND, Out); 214 DeviceSideName = Out.str(); 215 } else 216 DeviceSideName = ND->getIdentifier()->getName(); 217 return DeviceSideName; 218 } 219 220 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 221 FunctionArgList &Args) { 222 // Ensure either we have different ABIs between host and device compilations, 223 // says host compilation following MSVC ABI but device compilation follows 224 // Itanium C++ ABI or, if they follow the same ABI, kernel names after 225 // mangling should be the same after name stubbing. The later checking is 226 // very important as the device kernel name being mangled in host-compilation 227 // is used to resolve the device binaries to be executed. Inconsistent naming 228 // result in undefined behavior. Even though we cannot check that naming 229 // directly between host- and device-compilations, the host- and 230 // device-mangling in host compilation could help catching certain ones. 231 assert((CGF.CGM.getContext().getAuxTargetInfo() && 232 (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() != 233 CGF.CGM.getContext().getTargetInfo().getCXXABI())) || 234 getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) == 235 CGF.CurFn->getName()); 236 237 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 238 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 239 CudaFeature::CUDA_USES_NEW_LAUNCH)) 240 emitDeviceStubBodyNew(CGF, Args); 241 else 242 emitDeviceStubBodyLegacy(CGF, Args); 243 } 244 245 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 246 // array and kernels are launched using cudaLaunchKernel(). 247 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 248 FunctionArgList &Args) { 249 // Build the shadow stack entry at the very start of the function. 250 251 // Calculate amount of space we will need for all arguments. If we have no 252 // args, allocate a single pointer so we still have a valid pointer to the 253 // argument array that we can pass to runtime, even if it will be unused. 254 Address KernelArgs = CGF.CreateTempAlloca( 255 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 256 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 257 // Store pointers to the arguments in a locally allocated launch_args. 258 for (unsigned i = 0; i < Args.size(); ++i) { 259 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 260 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 261 CGF.Builder.CreateDefaultAlignedStore( 262 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 263 } 264 265 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 266 267 // Lookup cudaLaunchKernel function. 268 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 269 // void **args, size_t sharedMem, 270 // cudaStream_t stream); 271 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 272 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 273 IdentifierInfo &cudaLaunchKernelII = 274 CGM.getContext().Idents.get("cudaLaunchKernel"); 275 FunctionDecl *cudaLaunchKernelFD = nullptr; 276 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 277 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 278 cudaLaunchKernelFD = FD; 279 } 280 281 if (cudaLaunchKernelFD == nullptr) { 282 CGM.Error(CGF.CurFuncDecl->getLocation(), 283 "Can't find declaration for cudaLaunchKernel()"); 284 return; 285 } 286 // Create temporary dim3 grid_dim, block_dim. 287 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 288 QualType Dim3Ty = GridDimParam->getType(); 289 Address GridDim = 290 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 291 Address BlockDim = 292 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 293 Address ShmemSize = 294 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 295 Address Stream = 296 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 297 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 298 llvm::FunctionType::get(IntTy, 299 {/*gridDim=*/GridDim.getType(), 300 /*blockDim=*/BlockDim.getType(), 301 /*ShmemSize=*/ShmemSize.getType(), 302 /*Stream=*/Stream.getType()}, 303 /*isVarArg=*/false), 304 "__cudaPopCallConfiguration"); 305 306 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 307 {GridDim.getPointer(), BlockDim.getPointer(), 308 ShmemSize.getPointer(), Stream.getPointer()}); 309 310 // Emit the call to cudaLaunch 311 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 312 CallArgList LaunchKernelArgs; 313 LaunchKernelArgs.add(RValue::get(Kernel), 314 cudaLaunchKernelFD->getParamDecl(0)->getType()); 315 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 316 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 317 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 318 cudaLaunchKernelFD->getParamDecl(3)->getType()); 319 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 320 cudaLaunchKernelFD->getParamDecl(4)->getType()); 321 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 322 cudaLaunchKernelFD->getParamDecl(5)->getType()); 323 324 QualType QT = cudaLaunchKernelFD->getType(); 325 QualType CQT = QT.getCanonicalType(); 326 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 327 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 328 329 const CGFunctionInfo &FI = 330 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 331 llvm::FunctionCallee cudaLaunchKernelFn = 332 CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); 333 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 334 LaunchKernelArgs); 335 CGF.EmitBranch(EndBlock); 336 337 CGF.EmitBlock(EndBlock); 338 } 339 340 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 341 FunctionArgList &Args) { 342 // Emit a call to cudaSetupArgument for each arg in Args. 343 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 344 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 345 CharUnits Offset = CharUnits::Zero(); 346 for (const VarDecl *A : Args) { 347 CharUnits TyWidth, TyAlign; 348 std::tie(TyWidth, TyAlign) = 349 CGM.getContext().getTypeInfoInChars(A->getType()); 350 Offset = Offset.alignTo(TyAlign); 351 llvm::Value *Args[] = { 352 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 353 VoidPtrTy), 354 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 355 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 356 }; 357 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 358 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 359 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 360 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 361 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 362 CGF.EmitBlock(NextBlock); 363 Offset += TyWidth; 364 } 365 366 // Emit the call to cudaLaunch 367 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 368 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 369 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 370 CGF.EmitBranch(EndBlock); 371 372 CGF.EmitBlock(EndBlock); 373 } 374 375 /// Creates a function that sets up state on the host side for CUDA objects that 376 /// have a presence on both the host and device sides. Specifically, registers 377 /// the host side of kernel functions and device global variables with the CUDA 378 /// runtime. 379 /// \code 380 /// void __cuda_register_globals(void** GpuBinaryHandle) { 381 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 382 /// ... 383 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 384 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 385 /// ... 386 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 387 /// } 388 /// \endcode 389 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 390 // No need to register anything 391 if (EmittedKernels.empty() && DeviceVars.empty()) 392 return nullptr; 393 394 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 395 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 396 addUnderscoredPrefixToName("_register_globals"), &TheModule); 397 llvm::BasicBlock *EntryBB = 398 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 399 CGBuilderTy Builder(CGM, Context); 400 Builder.SetInsertPoint(EntryBB); 401 402 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 403 // int, uint3*, uint3*, dim3*, dim3*, int*) 404 llvm::Type *RegisterFuncParams[] = { 405 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 406 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 407 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 408 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 409 addUnderscoredPrefixToName("RegisterFunction")); 410 411 // Extract GpuBinaryHandle passed as the first argument passed to 412 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 413 // each emitted kernel. 414 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 415 for (auto &&I : EmittedKernels) { 416 llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); 417 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 418 llvm::Value *Args[] = { 419 &GpuBinaryHandlePtr, 420 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 421 KernelName, 422 KernelName, 423 llvm::ConstantInt::get(IntTy, -1), 424 NullPtr, 425 NullPtr, 426 NullPtr, 427 NullPtr, 428 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 429 Builder.CreateCall(RegisterFunc, Args); 430 } 431 432 // void __cudaRegisterVar(void **, char *, char *, const char *, 433 // int, int, int, int) 434 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 435 CharPtrTy, IntTy, IntTy, 436 IntTy, IntTy}; 437 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 438 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 439 addUnderscoredPrefixToName("RegisterVar")); 440 for (auto &&Info : DeviceVars) { 441 llvm::GlobalVariable *Var = Info.Var; 442 unsigned Flags = Info.Flag; 443 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 444 uint64_t VarSize = 445 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 446 llvm::Value *Args[] = { 447 &GpuBinaryHandlePtr, 448 Builder.CreateBitCast(Var, VoidPtrTy), 449 VarName, 450 VarName, 451 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 452 llvm::ConstantInt::get(IntTy, VarSize), 453 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 454 llvm::ConstantInt::get(IntTy, 0)}; 455 Builder.CreateCall(RegisterVar, Args); 456 } 457 458 Builder.CreateRetVoid(); 459 return RegisterKernelsFunc; 460 } 461 462 /// Creates a global constructor function for the module: 463 /// 464 /// For CUDA: 465 /// \code 466 /// void __cuda_module_ctor(void*) { 467 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 468 /// __cuda_register_globals(Handle); 469 /// } 470 /// \endcode 471 /// 472 /// For HIP: 473 /// \code 474 /// void __hip_module_ctor(void*) { 475 /// if (__hip_gpubin_handle == 0) { 476 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 477 /// __hip_register_globals(__hip_gpubin_handle); 478 /// } 479 /// } 480 /// \endcode 481 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 482 bool IsHIP = CGM.getLangOpts().HIP; 483 bool IsCUDA = CGM.getLangOpts().CUDA; 484 // No need to generate ctors/dtors if there is no GPU binary. 485 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 486 if (CudaGpuBinaryFileName.empty() && !IsHIP) 487 return nullptr; 488 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 489 DeviceVars.empty()) 490 return nullptr; 491 492 // void __{cuda|hip}_register_globals(void* handle); 493 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 494 // We always need a function to pass in as callback. Create a dummy 495 // implementation if we don't need to register anything. 496 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 497 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 498 499 // void ** __{cuda|hip}RegisterFatBinary(void *); 500 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 501 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 502 addUnderscoredPrefixToName("RegisterFatBinary")); 503 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 504 llvm::StructType *FatbinWrapperTy = 505 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 506 507 // Register GPU binary with the CUDA runtime, store returned handle in a 508 // global variable and save a reference in GpuBinaryHandle to be cleaned up 509 // in destructor on exit. Then associate all known kernels with the GPU binary 510 // handle so CUDA runtime can figure out what to call on the GPU side. 511 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 512 if (!CudaGpuBinaryFileName.empty()) { 513 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 514 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 515 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 516 CGM.getDiags().Report(diag::err_cannot_open_file) 517 << CudaGpuBinaryFileName << EC.message(); 518 return nullptr; 519 } 520 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 521 } 522 523 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 524 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 525 llvm::GlobalValue::InternalLinkage, 526 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 527 llvm::BasicBlock *CtorEntryBB = 528 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 529 CGBuilderTy CtorBuilder(CGM, Context); 530 531 CtorBuilder.SetInsertPoint(CtorEntryBB); 532 533 const char *FatbinConstantName; 534 const char *FatbinSectionName; 535 const char *ModuleIDSectionName; 536 StringRef ModuleIDPrefix; 537 llvm::Constant *FatBinStr; 538 unsigned FatMagic; 539 if (IsHIP) { 540 FatbinConstantName = ".hip_fatbin"; 541 FatbinSectionName = ".hipFatBinSegment"; 542 543 ModuleIDSectionName = "__hip_module_id"; 544 ModuleIDPrefix = "__hip_"; 545 546 if (CudaGpuBinary) { 547 // If fatbin is available from early finalization, create a string 548 // literal containing the fat binary loaded from the given file. 549 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 550 FatbinConstantName, 8); 551 } else { 552 // If fatbin is not available, create an external symbol 553 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 554 // to contain the fat binary but will be populated somewhere else, 555 // e.g. by lld through link script. 556 FatBinStr = new llvm::GlobalVariable( 557 CGM.getModule(), CGM.Int8Ty, 558 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 559 "__hip_fatbin", nullptr, 560 llvm::GlobalVariable::NotThreadLocal); 561 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 562 } 563 564 FatMagic = HIPFatMagic; 565 } else { 566 if (RelocatableDeviceCode) 567 FatbinConstantName = CGM.getTriple().isMacOSX() 568 ? "__NV_CUDA,__nv_relfatbin" 569 : "__nv_relfatbin"; 570 else 571 FatbinConstantName = 572 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 573 // NVIDIA's cuobjdump looks for fatbins in this section. 574 FatbinSectionName = 575 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 576 577 ModuleIDSectionName = CGM.getTriple().isMacOSX() 578 ? "__NV_CUDA,__nv_module_id" 579 : "__nv_module_id"; 580 ModuleIDPrefix = "__nv_"; 581 582 // For CUDA, create a string literal containing the fat binary loaded from 583 // the given file. 584 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 585 FatbinConstantName, 8); 586 FatMagic = CudaFatMagic; 587 } 588 589 // Create initialized wrapper structure that points to the loaded GPU binary 590 ConstantInitBuilder Builder(CGM); 591 auto Values = Builder.beginStruct(FatbinWrapperTy); 592 // Fatbin wrapper magic. 593 Values.addInt(IntTy, FatMagic); 594 // Fatbin version. 595 Values.addInt(IntTy, 1); 596 // Data. 597 Values.add(FatBinStr); 598 // Unused in fatbin v1. 599 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 600 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 601 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 602 /*constant*/ true); 603 FatbinWrapper->setSection(FatbinSectionName); 604 605 // There is only one HIP fat binary per linked module, however there are 606 // multiple constructor functions. Make sure the fat binary is registered 607 // only once. The constructor functions are executed by the dynamic loader 608 // before the program gains control. The dynamic loader cannot execute the 609 // constructor functions concurrently since doing that would not guarantee 610 // thread safety of the loaded program. Therefore we can assume sequential 611 // execution of constructor functions here. 612 if (IsHIP) { 613 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 614 llvm::GlobalValue::LinkOnceAnyLinkage; 615 llvm::BasicBlock *IfBlock = 616 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 617 llvm::BasicBlock *ExitBlock = 618 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 619 // The name, size, and initialization pattern of this variable is part 620 // of HIP ABI. 621 GpuBinaryHandle = new llvm::GlobalVariable( 622 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 623 Linkage, 624 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 625 "__hip_gpubin_handle"); 626 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 627 // Prevent the weak symbol in different shared libraries being merged. 628 if (Linkage != llvm::GlobalValue::InternalLinkage) 629 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 630 Address GpuBinaryAddr( 631 GpuBinaryHandle, 632 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 633 { 634 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 635 llvm::Constant *Zero = 636 llvm::Constant::getNullValue(HandleValue->getType()); 637 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 638 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 639 } 640 { 641 CtorBuilder.SetInsertPoint(IfBlock); 642 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 643 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 644 RegisterFatbinFunc, 645 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 646 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 647 CtorBuilder.CreateBr(ExitBlock); 648 } 649 { 650 CtorBuilder.SetInsertPoint(ExitBlock); 651 // Call __hip_register_globals(GpuBinaryHandle); 652 if (RegisterGlobalsFunc) { 653 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 654 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 655 } 656 } 657 } else if (!RelocatableDeviceCode) { 658 // Register binary with CUDA runtime. This is substantially different in 659 // default mode vs. separate compilation! 660 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 661 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 662 RegisterFatbinFunc, 663 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 664 GpuBinaryHandle = new llvm::GlobalVariable( 665 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 666 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 667 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 668 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 669 CGM.getPointerAlign()); 670 671 // Call __cuda_register_globals(GpuBinaryHandle); 672 if (RegisterGlobalsFunc) 673 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 674 675 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 676 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 677 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 678 // void __cudaRegisterFatBinaryEnd(void **); 679 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 680 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 681 "__cudaRegisterFatBinaryEnd"); 682 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 683 } 684 } else { 685 // Generate a unique module ID. 686 SmallString<64> ModuleID; 687 llvm::raw_svector_ostream OS(ModuleID); 688 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 689 llvm::Constant *ModuleIDConstant = 690 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 691 692 // Create an alias for the FatbinWrapper that nvcc will look for. 693 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 694 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 695 696 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 697 // void *, void (*)(void **)) 698 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 699 RegisterLinkedBinaryName += ModuleID; 700 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 701 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 702 703 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 704 llvm::Value *Args[] = {RegisterGlobalsFunc, 705 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 706 ModuleIDConstant, 707 makeDummyFunction(getCallbackFnTy())}; 708 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 709 } 710 711 // Create destructor and register it with atexit() the way NVCC does it. Doing 712 // it during regular destructor phase worked in CUDA before 9.2 but results in 713 // double-free in 9.2. 714 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 715 // extern "C" int atexit(void (*f)(void)); 716 llvm::FunctionType *AtExitTy = 717 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 718 llvm::FunctionCallee AtExitFunc = 719 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 720 /*Local=*/true); 721 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 722 } 723 724 CtorBuilder.CreateRetVoid(); 725 return ModuleCtorFunc; 726 } 727 728 /// Creates a global destructor function that unregisters the GPU code blob 729 /// registered by constructor. 730 /// 731 /// For CUDA: 732 /// \code 733 /// void __cuda_module_dtor(void*) { 734 /// __cudaUnregisterFatBinary(Handle); 735 /// } 736 /// \endcode 737 /// 738 /// For HIP: 739 /// \code 740 /// void __hip_module_dtor(void*) { 741 /// if (__hip_gpubin_handle) { 742 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 743 /// __hip_gpubin_handle = 0; 744 /// } 745 /// } 746 /// \endcode 747 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 748 // No need for destructor if we don't have a handle to unregister. 749 if (!GpuBinaryHandle) 750 return nullptr; 751 752 // void __cudaUnregisterFatBinary(void ** handle); 753 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 754 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 755 addUnderscoredPrefixToName("UnregisterFatBinary")); 756 757 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 758 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 759 llvm::GlobalValue::InternalLinkage, 760 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 761 762 llvm::BasicBlock *DtorEntryBB = 763 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 764 CGBuilderTy DtorBuilder(CGM, Context); 765 DtorBuilder.SetInsertPoint(DtorEntryBB); 766 767 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 768 GpuBinaryHandle->getAlignment())); 769 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 770 // There is only one HIP fat binary per linked module, however there are 771 // multiple destructor functions. Make sure the fat binary is unregistered 772 // only once. 773 if (CGM.getLangOpts().HIP) { 774 llvm::BasicBlock *IfBlock = 775 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 776 llvm::BasicBlock *ExitBlock = 777 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 778 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 779 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 780 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 781 782 DtorBuilder.SetInsertPoint(IfBlock); 783 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 784 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 785 DtorBuilder.CreateBr(ExitBlock); 786 787 DtorBuilder.SetInsertPoint(ExitBlock); 788 } else { 789 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 790 } 791 DtorBuilder.CreateRetVoid(); 792 return ModuleDtorFunc; 793 } 794 795 std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const { 796 if (!CGM.getLangOpts().HIP) 797 return Name; 798 return (Name + ".stub").str(); 799 } 800 801 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 802 return new CGNVCUDARuntime(CGM); 803 } 804