//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAliasAnalysis.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXCtorDtorLowering.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

// FIXME: intended as a temporary debugging aid. Should be removed before it
// makes it into the LLVM-17 release.
static cl::opt<bool>
    ExitOnUnreachable("nvptx-exit-on-unreachable",
                      cl::desc("Lower 'unreachable' as 'exit' instruction."),
                      cl::init(true), cl::Hidden);

namespace llvm {

void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXLowerUnreachablePass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMLegacyPassPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerUnreachablePass(PR);
  initializeNVPTXCtorDtorLoweringLegacyPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXDAGToDAGISelPass(PR);
  initializeNVPTXAAWrapperPassPass(PR);
  initializeNVPTXExternalAAWrapperPass(PR);
}

// Build the target's data layout string. When short pointers are requested,
// the const/local/shared address spaces use 32-bit pointers.
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<NVPTXAA>();
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return NVPTXAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "nvptx-aa") {
      AAM.registerFunctionAnalysis<NVPTXAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineParsingCallback(
      [](StringRef PassName, ModulePassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvptx-lower-ctor-dtor") {
          PM.addPass(NVPTXCtorDtorLoweringPass());
          return true;
        }
        if (PassName == "generic-to-nvvm") {
          PM.addPass(GenericToNVVMPass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
    case Intrinsic::nvvm_isspacep_shared_cluster:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  addPass(createNVPTXAAWrapperPass());
  addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
    if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
      AAR.addAAResult(WrapperPass->getResult());
  }));

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMLegacyPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass());
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandPass());
  addPass(createNVPTXCtorDtorLoweringLegacyPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }

  if (ExitOnUnreachable)
    addPass(createNVPTXLowerUnreachablePass());
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}