1 //===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Cuda.h" 10 #include "CommonArgs.h" 11 #include "clang/Basic/Cuda.h" 12 #include "clang/Config/config.h" 13 #include "clang/Driver/Compilation.h" 14 #include "clang/Driver/Distro.h" 15 #include "clang/Driver/Driver.h" 16 #include "clang/Driver/DriverDiagnostic.h" 17 #include "clang/Driver/InputInfo.h" 18 #include "clang/Driver/Options.h" 19 #include "llvm/ADT/Optional.h" 20 #include "llvm/Option/ArgList.h" 21 #include "llvm/Support/FileSystem.h" 22 #include "llvm/Support/Host.h" 23 #include "llvm/Support/Path.h" 24 #include "llvm/Support/Process.h" 25 #include "llvm/Support/Program.h" 26 #include "llvm/Support/TargetParser.h" 27 #include "llvm/Support/VirtualFileSystem.h" 28 #include <system_error> 29 30 using namespace clang::driver; 31 using namespace clang::driver::toolchains; 32 using namespace clang::driver::tools; 33 using namespace clang; 34 using namespace llvm::opt; 35 36 namespace { 37 struct CudaVersionInfo { 38 std::string DetectedVersion; 39 CudaVersion Version; 40 }; 41 // Parses the contents of version.txt in an CUDA installation. It should 42 // contain one line of the from e.g. "CUDA Version 7.5.2". 43 CudaVersionInfo parseCudaVersionFile(llvm::StringRef V) { 44 V = V.trim(); 45 if (!V.startswith("CUDA Version ")) 46 return {V.str(), CudaVersion::UNKNOWN}; 47 V = V.substr(strlen("CUDA Version ")); 48 SmallVector<StringRef,4> VersionParts; 49 V.split(VersionParts, '.'); 50 return {"version.txt: " + V.str() + ".", 51 VersionParts.size() < 2 52 ? CudaVersion::UNKNOWN 53 : CudaStringToVersion( 54 join_items(".", VersionParts[0], VersionParts[1]))}; 55 } 56 57 CudaVersion getCudaVersion(uint32_t raw_version) { 58 if (raw_version < 7050) 59 return CudaVersion::CUDA_70; 60 if (raw_version < 8000) 61 return CudaVersion::CUDA_75; 62 if (raw_version < 9000) 63 return CudaVersion::CUDA_80; 64 if (raw_version < 9010) 65 return CudaVersion::CUDA_90; 66 if (raw_version < 9020) 67 return CudaVersion::CUDA_91; 68 if (raw_version < 10000) 69 return CudaVersion::CUDA_92; 70 if (raw_version < 10010) 71 return CudaVersion::CUDA_100; 72 if (raw_version < 10020) 73 return CudaVersion::CUDA_101; 74 if (raw_version < 11000) 75 return CudaVersion::CUDA_102; 76 if (raw_version < 11010) 77 return CudaVersion::CUDA_110; 78 if (raw_version < 11020) 79 return CudaVersion::CUDA_111; 80 return CudaVersion::LATEST; 81 } 82 83 CudaVersionInfo parseCudaHFile(llvm::StringRef Input) { 84 // Helper lambda which skips the words if the line starts with them or returns 85 // None otherwise. 86 auto StartsWithWords = 87 [](llvm::StringRef Line, 88 const SmallVector<StringRef, 3> words) -> llvm::Optional<StringRef> { 89 for (StringRef word : words) { 90 if (!Line.consume_front(word)) 91 return {}; 92 Line = Line.ltrim(); 93 } 94 return Line; 95 }; 96 97 Input = Input.ltrim(); 98 while (!Input.empty()) { 99 if (auto Line = 100 StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) { 101 uint32_t RawVersion; 102 Line->consumeInteger(10, RawVersion); 103 return {"cuda.h: CUDA_VERSION=" + Twine(RawVersion).str() + ".", 104 getCudaVersion(RawVersion)}; 105 } 106 // Find next non-empty line. 107 Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim(); 108 } 109 return {"cuda.h: CUDA_VERSION not found.", CudaVersion::UNKNOWN}; 110 } 111 } // namespace 112 113 void CudaInstallationDetector::WarnIfUnsupportedVersion() { 114 if (DetectedVersionIsNotSupported) 115 D.Diag(diag::warn_drv_unknown_cuda_version) 116 << DetectedVersion 117 << CudaVersionToString(CudaVersion::LATEST_SUPPORTED); 118 } 119 120 CudaInstallationDetector::CudaInstallationDetector( 121 const Driver &D, const llvm::Triple &HostTriple, 122 const llvm::opt::ArgList &Args) 123 : D(D) { 124 struct Candidate { 125 std::string Path; 126 bool StrictChecking; 127 128 Candidate(std::string Path, bool StrictChecking = false) 129 : Path(Path), StrictChecking(StrictChecking) {} 130 }; 131 SmallVector<Candidate, 4> Candidates; 132 133 // In decreasing order so we prefer newer versions to older versions. 134 std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"}; 135 auto &FS = D.getVFS(); 136 137 if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { 138 Candidates.emplace_back( 139 Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str()); 140 } else if (HostTriple.isOSWindows()) { 141 for (const char *Ver : Versions) 142 Candidates.emplace_back( 143 D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + 144 Ver); 145 } else { 146 if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) { 147 // Try to find ptxas binary. If the executable is located in a directory 148 // called 'bin/', its parent directory might be a good guess for a valid 149 // CUDA installation. 150 // However, some distributions might installs 'ptxas' to /usr/bin. In that 151 // case the candidate would be '/usr' which passes the following checks 152 // because '/usr/include' exists as well. To avoid this case, we always 153 // check for the directory potentially containing files for libdevice, 154 // even if the user passes -nocudalib. 155 if (llvm::ErrorOr<std::string> ptxas = 156 llvm::sys::findProgramByName("ptxas")) { 157 SmallString<256> ptxasAbsolutePath; 158 llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath); 159 160 StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath); 161 if (llvm::sys::path::filename(ptxasDir) == "bin") 162 Candidates.emplace_back( 163 std::string(llvm::sys::path::parent_path(ptxasDir)), 164 /*StrictChecking=*/true); 165 } 166 } 167 168 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda"); 169 for (const char *Ver : Versions) 170 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver); 171 172 Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple())); 173 if (Dist.IsDebian() || Dist.IsUbuntu()) 174 // Special case for Debian to have nvidia-cuda-toolkit work 175 // out of the box. More info on http://bugs.debian.org/882505 176 Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda"); 177 } 178 179 bool NoCudaLib = Args.hasArg(options::OPT_nogpulib); 180 181 for (const auto &Candidate : Candidates) { 182 InstallPath = Candidate.Path; 183 if (InstallPath.empty() || !FS.exists(InstallPath)) 184 continue; 185 186 BinPath = InstallPath + "/bin"; 187 IncludePath = InstallPath + "/include"; 188 LibDevicePath = InstallPath + "/nvvm/libdevice"; 189 190 if (!(FS.exists(IncludePath) && FS.exists(BinPath))) 191 continue; 192 bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking); 193 if (CheckLibDevice && !FS.exists(LibDevicePath)) 194 continue; 195 196 // On Linux, we have both lib and lib64 directories, and we need to choose 197 // based on our triple. On MacOS, we have only a lib directory. 198 // 199 // It's sufficient for our purposes to be flexible: If both lib and lib64 200 // exist, we choose whichever one matches our triple. Otherwise, if only 201 // lib exists, we use it. 202 if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64")) 203 LibPath = InstallPath + "/lib64"; 204 else if (FS.exists(InstallPath + "/lib")) 205 LibPath = InstallPath + "/lib"; 206 else 207 continue; 208 209 CudaVersionInfo VersionInfo = {"", CudaVersion::UNKNOWN}; 210 if (auto VersionFile = FS.getBufferForFile(InstallPath + "/version.txt")) 211 VersionInfo = parseCudaVersionFile((*VersionFile)->getBuffer()); 212 // If version file didn't give us the version, try to find it in cuda.h 213 if (VersionInfo.Version == CudaVersion::UNKNOWN) 214 if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h")) 215 VersionInfo = parseCudaHFile((*CudaHFile)->getBuffer()); 216 // As the last resort, make an educated guess between CUDA-7.0, (which had 217 // no version.txt file and had old-style libdevice bitcode ) and an unknown 218 // recent CUDA version (no version.txt, new style bitcode). 219 if (VersionInfo.Version == CudaVersion::UNKNOWN) { 220 VersionInfo.Version = (FS.exists(LibDevicePath + "/libdevice.10.bc")) 221 ? Version = CudaVersion::LATEST 222 : Version = CudaVersion::CUDA_70; 223 VersionInfo.DetectedVersion = 224 "No version found in version.txt or cuda.h."; 225 } 226 227 Version = VersionInfo.Version; 228 DetectedVersion = VersionInfo.DetectedVersion; 229 230 // TODO(tra): remove the warning once we have all features of 10.2 231 // and 11.0 implemented. 232 DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; 233 234 if (Version >= CudaVersion::CUDA_90) { 235 // CUDA-9+ uses single libdevice file for all GPU variants. 236 std::string FilePath = LibDevicePath + "/libdevice.10.bc"; 237 if (FS.exists(FilePath)) { 238 for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E; 239 ++Arch) { 240 CudaArch GpuArch = static_cast<CudaArch>(Arch); 241 if (!IsNVIDIAGpuArch(GpuArch)) 242 continue; 243 std::string GpuArchName(CudaArchToString(GpuArch)); 244 LibDeviceMap[GpuArchName] = FilePath; 245 } 246 } 247 } else { 248 std::error_code EC; 249 for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC), 250 LE; 251 !EC && LI != LE; LI = LI.increment(EC)) { 252 StringRef FilePath = LI->path(); 253 StringRef FileName = llvm::sys::path::filename(FilePath); 254 // Process all bitcode filenames that look like 255 // libdevice.compute_XX.YY.bc 256 const StringRef LibDeviceName = "libdevice."; 257 if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc"))) 258 continue; 259 StringRef GpuArch = FileName.slice( 260 LibDeviceName.size(), FileName.find('.', LibDeviceName.size())); 261 LibDeviceMap[GpuArch] = FilePath.str(); 262 // Insert map entries for specific devices with this compute 263 // capability. NVCC's choice of the libdevice library version is 264 // rather peculiar and depends on the CUDA version. 265 if (GpuArch == "compute_20") { 266 LibDeviceMap["sm_20"] = std::string(FilePath); 267 LibDeviceMap["sm_21"] = std::string(FilePath); 268 LibDeviceMap["sm_32"] = std::string(FilePath); 269 } else if (GpuArch == "compute_30") { 270 LibDeviceMap["sm_30"] = std::string(FilePath); 271 if (Version < CudaVersion::CUDA_80) { 272 LibDeviceMap["sm_50"] = std::string(FilePath); 273 LibDeviceMap["sm_52"] = std::string(FilePath); 274 LibDeviceMap["sm_53"] = std::string(FilePath); 275 } 276 LibDeviceMap["sm_60"] = std::string(FilePath); 277 LibDeviceMap["sm_61"] = std::string(FilePath); 278 LibDeviceMap["sm_62"] = std::string(FilePath); 279 } else if (GpuArch == "compute_35") { 280 LibDeviceMap["sm_35"] = std::string(FilePath); 281 LibDeviceMap["sm_37"] = std::string(FilePath); 282 } else if (GpuArch == "compute_50") { 283 if (Version >= CudaVersion::CUDA_80) { 284 LibDeviceMap["sm_50"] = std::string(FilePath); 285 LibDeviceMap["sm_52"] = std::string(FilePath); 286 LibDeviceMap["sm_53"] = std::string(FilePath); 287 } 288 } 289 } 290 } 291 292 // Check that we have found at least one libdevice that we can link in if 293 // -nocudalib hasn't been specified. 294 if (LibDeviceMap.empty() && !NoCudaLib) 295 continue; 296 297 IsValid = true; 298 break; 299 } 300 } 301 302 void CudaInstallationDetector::AddCudaIncludeArgs( 303 const ArgList &DriverArgs, ArgStringList &CC1Args) const { 304 if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { 305 // Add cuda_wrappers/* to our system include path. This lets us wrap 306 // standard library headers. 307 SmallString<128> P(D.ResourceDir); 308 llvm::sys::path::append(P, "include"); 309 llvm::sys::path::append(P, "cuda_wrappers"); 310 CC1Args.push_back("-internal-isystem"); 311 CC1Args.push_back(DriverArgs.MakeArgString(P)); 312 } 313 314 if (DriverArgs.hasArg(options::OPT_nogpuinc)) 315 return; 316 317 if (!isValid()) { 318 D.Diag(diag::err_drv_no_cuda_installation); 319 return; 320 } 321 322 CC1Args.push_back("-internal-isystem"); 323 CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); 324 CC1Args.push_back("-include"); 325 CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); 326 } 327 328 void CudaInstallationDetector::CheckCudaVersionSupportsArch( 329 CudaArch Arch) const { 330 if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN || 331 ArchsWithBadVersion[(int)Arch]) 332 return; 333 334 auto MinVersion = MinVersionForCudaArch(Arch); 335 auto MaxVersion = MaxVersionForCudaArch(Arch); 336 if (Version < MinVersion || Version > MaxVersion) { 337 ArchsWithBadVersion[(int)Arch] = true; 338 D.Diag(diag::err_drv_cuda_version_unsupported) 339 << CudaArchToString(Arch) << CudaVersionToString(MinVersion) 340 << CudaVersionToString(MaxVersion) << InstallPath 341 << CudaVersionToString(Version); 342 } 343 } 344 345 void CudaInstallationDetector::print(raw_ostream &OS) const { 346 if (isValid()) 347 OS << "Found CUDA installation: " << InstallPath << ", version " 348 << CudaVersionToString(Version) << "\n"; 349 } 350 351 namespace { 352 /// Debug info level for the NVPTX devices. We may need to emit different debug 353 /// info level for the host and for the device itselfi. This type controls 354 /// emission of the debug info for the devices. It either prohibits disable info 355 /// emission completely, or emits debug directives only, or emits same debug 356 /// info as for the host. 357 enum DeviceDebugInfoLevel { 358 DisableDebugInfo, /// Do not emit debug info for the devices. 359 DebugDirectivesOnly, /// Emit only debug directives. 360 EmitSameDebugInfoAsHost, /// Use the same debug info level just like for the 361 /// host. 362 }; 363 } // anonymous namespace 364 365 /// Define debug info level for the NVPTX devices. If the debug info for both 366 /// the host and device are disabled (-g0/-ggdb0 or no debug options at all). If 367 /// only debug directives are requested for the both host and device 368 /// (-gline-directvies-only), or the debug info only for the device is disabled 369 /// (optimization is on and --cuda-noopt-device-debug was not specified), the 370 /// debug directves only must be emitted for the device. Otherwise, use the same 371 /// debug info level just like for the host (with the limitations of only 372 /// supported DWARF2 standard). 373 static DeviceDebugInfoLevel mustEmitDebugInfo(const ArgList &Args) { 374 const Arg *A = Args.getLastArg(options::OPT_O_Group); 375 bool IsDebugEnabled = !A || A->getOption().matches(options::OPT_O0) || 376 Args.hasFlag(options::OPT_cuda_noopt_device_debug, 377 options::OPT_no_cuda_noopt_device_debug, 378 /*Default=*/false); 379 if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) { 380 const Option &Opt = A->getOption(); 381 if (Opt.matches(options::OPT_gN_Group)) { 382 if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0)) 383 return DisableDebugInfo; 384 if (Opt.matches(options::OPT_gline_directives_only)) 385 return DebugDirectivesOnly; 386 } 387 return IsDebugEnabled ? EmitSameDebugInfoAsHost : DebugDirectivesOnly; 388 } 389 return willEmitRemarks(Args) ? DebugDirectivesOnly : DisableDebugInfo; 390 } 391 392 void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, 393 const InputInfo &Output, 394 const InputInfoList &Inputs, 395 const ArgList &Args, 396 const char *LinkingOutput) const { 397 const auto &TC = 398 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 399 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 400 401 StringRef GPUArchName; 402 // If this is an OpenMP action we need to extract the device architecture 403 // from the -march=arch option. This option may come from -Xopenmp-target 404 // flag or the default value. 405 if (JA.isDeviceOffloading(Action::OFK_OpenMP)) { 406 GPUArchName = Args.getLastArgValue(options::OPT_march_EQ); 407 assert(!GPUArchName.empty() && "Must have an architecture passed in."); 408 } else 409 GPUArchName = JA.getOffloadingArch(); 410 411 // Obtain architecture from the action. 412 CudaArch gpu_arch = StringToCudaArch(GPUArchName); 413 assert(gpu_arch != CudaArch::UNKNOWN && 414 "Device action expected to have an architecture."); 415 416 // Check that our installation's ptxas supports gpu_arch. 417 if (!Args.hasArg(options::OPT_no_cuda_version_check)) { 418 TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); 419 } 420 421 ArgStringList CmdArgs; 422 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); 423 DeviceDebugInfoLevel DIKind = mustEmitDebugInfo(Args); 424 if (DIKind == EmitSameDebugInfoAsHost) { 425 // ptxas does not accept -g option if optimization is enabled, so 426 // we ignore the compiler's -O* options if we want debug info. 427 CmdArgs.push_back("-g"); 428 CmdArgs.push_back("--dont-merge-basicblocks"); 429 CmdArgs.push_back("--return-at-end"); 430 } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { 431 // Map the -O we received to -O{0,1,2,3}. 432 // 433 // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's 434 // default, so it may correspond more closely to the spirit of clang -O2. 435 436 // -O3 seems like the least-bad option when -Osomething is specified to 437 // clang but it isn't handled below. 438 StringRef OOpt = "3"; 439 if (A->getOption().matches(options::OPT_O4) || 440 A->getOption().matches(options::OPT_Ofast)) 441 OOpt = "3"; 442 else if (A->getOption().matches(options::OPT_O0)) 443 OOpt = "0"; 444 else if (A->getOption().matches(options::OPT_O)) { 445 // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options. 446 OOpt = llvm::StringSwitch<const char *>(A->getValue()) 447 .Case("1", "1") 448 .Case("2", "2") 449 .Case("3", "3") 450 .Case("s", "2") 451 .Case("z", "2") 452 .Default("2"); 453 } 454 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); 455 } else { 456 // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond 457 // to no optimizations, but ptxas's default is -O3. 458 CmdArgs.push_back("-O0"); 459 } 460 if (DIKind == DebugDirectivesOnly) 461 CmdArgs.push_back("-lineinfo"); 462 463 // Pass -v to ptxas if it was passed to the driver. 464 if (Args.hasArg(options::OPT_v)) 465 CmdArgs.push_back("-v"); 466 467 CmdArgs.push_back("--gpu-name"); 468 CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); 469 CmdArgs.push_back("--output-file"); 470 CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output))); 471 for (const auto& II : Inputs) 472 CmdArgs.push_back(Args.MakeArgString(II.getFilename())); 473 474 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) 475 CmdArgs.push_back(Args.MakeArgString(A)); 476 477 bool Relocatable = false; 478 if (JA.isOffloading(Action::OFK_OpenMP)) 479 // In OpenMP we need to generate relocatable code. 480 Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, 481 options::OPT_fnoopenmp_relocatable_target, 482 /*Default=*/true); 483 else if (JA.isOffloading(Action::OFK_Cuda)) 484 Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, 485 options::OPT_fno_gpu_rdc, /*Default=*/false); 486 487 if (Relocatable) 488 CmdArgs.push_back("-c"); 489 490 const char *Exec; 491 if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ)) 492 Exec = A->getValue(); 493 else 494 Exec = Args.MakeArgString(TC.GetProgramPath("ptxas")); 495 C.addCommand(std::make_unique<Command>( 496 JA, *this, 497 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 498 "--options-file"}, 499 Exec, CmdArgs, Inputs, Output)); 500 } 501 502 static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) { 503 bool includePTX = true; 504 for (Arg *A : Args) { 505 if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) || 506 A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ))) 507 continue; 508 A->claim(); 509 const StringRef ArchStr = A->getValue(); 510 if (ArchStr == "all" || ArchStr == gpu_arch) { 511 includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ); 512 continue; 513 } 514 } 515 return includePTX; 516 } 517 518 // All inputs to this linker must be from CudaDeviceActions, as we need to look 519 // at the Inputs' Actions in order to figure out which GPU architecture they 520 // correspond to. 521 void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, 522 const InputInfo &Output, 523 const InputInfoList &Inputs, 524 const ArgList &Args, 525 const char *LinkingOutput) const { 526 const auto &TC = 527 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 528 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 529 530 ArgStringList CmdArgs; 531 if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100) 532 CmdArgs.push_back("--cuda"); 533 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); 534 CmdArgs.push_back(Args.MakeArgString("--create")); 535 CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); 536 if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost) 537 CmdArgs.push_back("-g"); 538 539 for (const auto& II : Inputs) { 540 auto *A = II.getAction(); 541 assert(A->getInputs().size() == 1 && 542 "Device offload action is expected to have a single input"); 543 const char *gpu_arch_str = A->getOffloadingArch(); 544 assert(gpu_arch_str && 545 "Device action expected to have associated a GPU architecture!"); 546 CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); 547 548 if (II.getType() == types::TY_PP_Asm && 549 !shouldIncludePTX(Args, gpu_arch_str)) 550 continue; 551 // We need to pass an Arch of the form "sm_XX" for cubin files and 552 // "compute_XX" for ptx. 553 const char *Arch = (II.getType() == types::TY_PP_Asm) 554 ? CudaArchToVirtualArchString(gpu_arch) 555 : gpu_arch_str; 556 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + 557 Arch + ",file=" + II.getFilename())); 558 } 559 560 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) 561 CmdArgs.push_back(Args.MakeArgString(A)); 562 563 const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); 564 C.addCommand(std::make_unique<Command>( 565 JA, *this, 566 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 567 "--options-file"}, 568 Exec, CmdArgs, Inputs, Output)); 569 } 570 571 void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, 572 const InputInfo &Output, 573 const InputInfoList &Inputs, 574 const ArgList &Args, 575 const char *LinkingOutput) const { 576 const auto &TC = 577 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 578 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 579 580 ArgStringList CmdArgs; 581 582 // OpenMP uses nvlink to link cubin files. The result will be embedded in the 583 // host binary by the host linker. 584 assert(!JA.isHostOffloading(Action::OFK_OpenMP) && 585 "CUDA toolchain not expected for an OpenMP host device."); 586 587 if (Output.isFilename()) { 588 CmdArgs.push_back("-o"); 589 CmdArgs.push_back(Output.getFilename()); 590 } else 591 assert(Output.isNothing() && "Invalid output."); 592 if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost) 593 CmdArgs.push_back("-g"); 594 595 if (Args.hasArg(options::OPT_v)) 596 CmdArgs.push_back("-v"); 597 598 StringRef GPUArch = 599 Args.getLastArgValue(options::OPT_march_EQ); 600 assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas."); 601 602 CmdArgs.push_back("-arch"); 603 CmdArgs.push_back(Args.MakeArgString(GPUArch)); 604 605 // Add paths specified in LIBRARY_PATH environment variable as -L options. 606 addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); 607 608 // Add paths for the default clang library path. 609 SmallString<256> DefaultLibPath = 610 llvm::sys::path::parent_path(TC.getDriver().Dir); 611 llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX); 612 CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath)); 613 614 for (const auto &II : Inputs) { 615 if (II.getType() == types::TY_LLVM_IR || 616 II.getType() == types::TY_LTO_IR || 617 II.getType() == types::TY_LTO_BC || 618 II.getType() == types::TY_LLVM_BC) { 619 C.getDriver().Diag(diag::err_drv_no_linker_llvm_support) 620 << getToolChain().getTripleString(); 621 continue; 622 } 623 624 // Currently, we only pass the input files to the linker, we do not pass 625 // any libraries that may be valid only for the host. 626 if (!II.isFilename()) 627 continue; 628 629 const char *CubinF = C.addTempFile( 630 C.getArgs().MakeArgString(getToolChain().getInputFilename(II))); 631 632 CmdArgs.push_back(CubinF); 633 } 634 635 const char *Exec = 636 Args.MakeArgString(getToolChain().GetProgramPath("nvlink")); 637 C.addCommand(std::make_unique<Command>( 638 JA, *this, 639 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 640 "--options-file"}, 641 Exec, CmdArgs, Inputs, Output)); 642 } 643 644 /// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary, 645 /// which isn't properly a linker but nonetheless performs the step of stitching 646 /// together object files from the assembler into a single blob. 647 648 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, 649 const ToolChain &HostTC, const ArgList &Args, 650 const Action::OffloadKind OK) 651 : ToolChain(D, Triple, Args), HostTC(HostTC), 652 CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) { 653 if (CudaInstallation.isValid()) { 654 CudaInstallation.WarnIfUnsupportedVersion(); 655 getProgramPaths().push_back(std::string(CudaInstallation.getBinPath())); 656 } 657 // Lookup binaries into the driver directory, this is used to 658 // discover the clang-offload-bundler executable. 659 getProgramPaths().push_back(getDriver().Dir); 660 } 661 662 std::string CudaToolChain::getInputFilename(const InputInfo &Input) const { 663 // Only object files are changed, for example assembly files keep their .s 664 // extensions. CUDA also continues to use .o as they don't use nvlink but 665 // fatbinary. 666 if (!(OK == Action::OFK_OpenMP && Input.getType() == types::TY_Object)) 667 return ToolChain::getInputFilename(Input); 668 669 // Replace extension for object files with cubin because nvlink relies on 670 // these particular file names. 671 SmallString<256> Filename(ToolChain::getInputFilename(Input)); 672 llvm::sys::path::replace_extension(Filename, "cubin"); 673 return std::string(Filename.str()); 674 } 675 676 void CudaToolChain::addClangTargetOptions( 677 const llvm::opt::ArgList &DriverArgs, 678 llvm::opt::ArgStringList &CC1Args, 679 Action::OffloadKind DeviceOffloadingKind) const { 680 HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); 681 682 StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); 683 assert(!GpuArch.empty() && "Must have an explicit GPU arch."); 684 assert((DeviceOffloadingKind == Action::OFK_OpenMP || 685 DeviceOffloadingKind == Action::OFK_Cuda) && 686 "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); 687 688 if (DeviceOffloadingKind == Action::OFK_Cuda) { 689 CC1Args.push_back("-fcuda-is-device"); 690 691 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, 692 options::OPT_fno_cuda_approx_transcendentals, false)) 693 CC1Args.push_back("-fcuda-approx-transcendentals"); 694 } 695 696 if (DriverArgs.hasArg(options::OPT_nogpulib)) 697 return; 698 699 if (DeviceOffloadingKind == Action::OFK_OpenMP && 700 DriverArgs.hasArg(options::OPT_S)) 701 return; 702 703 std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); 704 if (LibDeviceFile.empty()) { 705 getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; 706 return; 707 } 708 709 CC1Args.push_back("-mlink-builtin-bitcode"); 710 CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); 711 712 clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); 713 714 // New CUDA versions often introduce new instructions that are only supported 715 // by new PTX version, so we need to raise PTX level to enable them in NVPTX 716 // back-end. 717 const char *PtxFeature = nullptr; 718 switch (CudaInstallationVersion) { 719 #define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \ 720 case CudaVersion::CUDA_##CUDA_VER: \ 721 PtxFeature = "+ptx" #PTX_VER; \ 722 break; 723 CASE_CUDA_VERSION(112, 72); 724 CASE_CUDA_VERSION(111, 71); 725 CASE_CUDA_VERSION(110, 70); 726 CASE_CUDA_VERSION(102, 65); 727 CASE_CUDA_VERSION(101, 64); 728 CASE_CUDA_VERSION(100, 63); 729 CASE_CUDA_VERSION(92, 61); 730 CASE_CUDA_VERSION(91, 61); 731 CASE_CUDA_VERSION(90, 60); 732 #undef CASE_CUDA_VERSION 733 default: 734 PtxFeature = "+ptx42"; 735 } 736 CC1Args.append({"-target-feature", PtxFeature}); 737 if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, 738 options::OPT_fno_cuda_short_ptr, false)) 739 CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); 740 741 if (CudaInstallationVersion >= CudaVersion::UNKNOWN) 742 CC1Args.push_back( 743 DriverArgs.MakeArgString(Twine("-target-sdk-version=") + 744 CudaVersionToString(CudaInstallationVersion))); 745 746 if (DeviceOffloadingKind == Action::OFK_OpenMP) { 747 if (CudaInstallationVersion < CudaVersion::CUDA_92) { 748 getDriver().Diag( 749 diag::err_drv_omp_offload_target_cuda_version_not_support) 750 << CudaVersionToString(CudaInstallationVersion); 751 return; 752 } 753 754 std::string BitcodeSuffix; 755 if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, 756 options::OPT_fno_openmp_target_new_runtime, false)) 757 BitcodeSuffix = "new-nvptx-" + GpuArch.str(); 758 else 759 BitcodeSuffix = "nvptx-" + GpuArch.str(); 760 761 addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix, 762 getTriple()); 763 } 764 } 765 766 llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType( 767 const llvm::opt::ArgList &DriverArgs, const JobAction &JA, 768 const llvm::fltSemantics *FPType) const { 769 if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) { 770 if (FPType && FPType == &llvm::APFloat::IEEEsingle() && 771 DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, 772 options::OPT_fno_gpu_flush_denormals_to_zero, false)) 773 return llvm::DenormalMode::getPreserveSign(); 774 } 775 776 assert(JA.getOffloadingDeviceKind() != Action::OFK_Host); 777 return llvm::DenormalMode::getIEEE(); 778 } 779 780 bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const { 781 const Option &O = A->getOption(); 782 return (O.matches(options::OPT_gN_Group) && 783 !O.matches(options::OPT_gmodules)) || 784 O.matches(options::OPT_g_Flag) || 785 O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) || 786 O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) || 787 O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) || 788 O.matches(options::OPT_gdwarf_5) || 789 O.matches(options::OPT_gcolumn_info); 790 } 791 792 void CudaToolChain::adjustDebugInfoKind( 793 codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const { 794 switch (mustEmitDebugInfo(Args)) { 795 case DisableDebugInfo: 796 DebugInfoKind = codegenoptions::NoDebugInfo; 797 break; 798 case DebugDirectivesOnly: 799 DebugInfoKind = codegenoptions::DebugDirectivesOnly; 800 break; 801 case EmitSameDebugInfoAsHost: 802 // Use same debug info level as the host. 803 break; 804 } 805 } 806 807 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, 808 ArgStringList &CC1Args) const { 809 // Check our CUDA version if we're going to include the CUDA headers. 810 if (!DriverArgs.hasArg(options::OPT_nogpuinc) && 811 !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) { 812 StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ); 813 assert(!Arch.empty() && "Must have an explicit GPU arch."); 814 CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch)); 815 } 816 CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); 817 } 818 819 llvm::opt::DerivedArgList * 820 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, 821 StringRef BoundArch, 822 Action::OffloadKind DeviceOffloadKind) const { 823 DerivedArgList *DAL = 824 HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind); 825 if (!DAL) 826 DAL = new DerivedArgList(Args.getBaseArgs()); 827 828 const OptTable &Opts = getDriver().getOpts(); 829 830 // For OpenMP device offloading, append derived arguments. Make sure 831 // flags are not duplicated. 832 // Also append the compute capability. 833 if (DeviceOffloadKind == Action::OFK_OpenMP) { 834 for (Arg *A : Args) { 835 bool IsDuplicate = false; 836 for (Arg *DALArg : *DAL) { 837 if (A == DALArg) { 838 IsDuplicate = true; 839 break; 840 } 841 } 842 if (!IsDuplicate) 843 DAL->append(A); 844 } 845 846 StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ); 847 if (Arch.empty()) 848 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), 849 CLANG_OPENMP_NVPTX_DEFAULT_ARCH); 850 851 return DAL; 852 } 853 854 for (Arg *A : Args) { 855 DAL->append(A); 856 } 857 858 if (!BoundArch.empty()) { 859 DAL->eraseArg(options::OPT_march_EQ); 860 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); 861 } 862 return DAL; 863 } 864 865 Tool *CudaToolChain::buildAssembler() const { 866 return new tools::NVPTX::Assembler(*this); 867 } 868 869 Tool *CudaToolChain::buildLinker() const { 870 if (OK == Action::OFK_OpenMP) 871 return new tools::NVPTX::OpenMPLinker(*this); 872 return new tools::NVPTX::Linker(*this); 873 } 874 875 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { 876 HostTC.addClangWarningOptions(CC1Args); 877 } 878 879 ToolChain::CXXStdlibType 880 CudaToolChain::GetCXXStdlibType(const ArgList &Args) const { 881 return HostTC.GetCXXStdlibType(Args); 882 } 883 884 void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, 885 ArgStringList &CC1Args) const { 886 HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args); 887 } 888 889 void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args, 890 ArgStringList &CC1Args) const { 891 HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args); 892 } 893 894 void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args, 895 ArgStringList &CC1Args) const { 896 HostTC.AddIAMCUIncludeArgs(Args, CC1Args); 897 } 898 899 SanitizerMask CudaToolChain::getSupportedSanitizers() const { 900 // The CudaToolChain only supports sanitizers in the sense that it allows 901 // sanitizer arguments on the command line if they are supported by the host 902 // toolchain. The CudaToolChain will actually ignore any command line 903 // arguments for any of these "supported" sanitizers. That means that no 904 // sanitization of device code is actually supported at this time. 905 // 906 // This behavior is necessary because the host and device toolchains 907 // invocations often share the command line, so the device toolchain must 908 // tolerate flags meant only for the host toolchain. 909 return HostTC.getSupportedSanitizers(); 910 } 911 912 VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D, 913 const ArgList &Args) const { 914 return HostTC.computeMSVCVersion(D, Args); 915 } 916