1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Top-level implementation for the NVPTX target. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "NVPTXTargetMachine.h" 14 #include "NVPTX.h" 15 #include "NVPTXAliasAnalysis.h" 16 #include "NVPTXAllocaHoisting.h" 17 #include "NVPTXAtomicLower.h" 18 #include "NVPTXCtorDtorLowering.h" 19 #include "NVPTXLowerAggrCopies.h" 20 #include "NVPTXMachineFunctionInfo.h" 21 #include "NVPTXTargetObjectFile.h" 22 #include "NVPTXTargetTransformInfo.h" 23 #include "TargetInfo/NVPTXTargetInfo.h" 24 #include "llvm/ADT/STLExtras.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/CodeGen/Passes.h" 27 #include "llvm/CodeGen/TargetPassConfig.h" 28 #include "llvm/IR/IntrinsicsNVPTX.h" 29 #include "llvm/MC/TargetRegistry.h" 30 #include "llvm/Pass.h" 31 #include "llvm/Passes/PassBuilder.h" 32 #include "llvm/Support/CommandLine.h" 33 #include "llvm/Target/TargetMachine.h" 34 #include "llvm/Target/TargetOptions.h" 35 #include "llvm/TargetParser/Triple.h" 36 #include "llvm/Transforms/Scalar.h" 37 #include "llvm/Transforms/Scalar/GVN.h" 38 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 39 #include <cassert> 40 #include <optional> 41 #include <string> 42 43 using namespace llvm; 44 45 // LSV is still relatively new; this switch lets us turn it off in case we 46 // encounter (or suspect) a bug. 47 static cl::opt<bool> 48 DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer", 49 cl::desc("Disable load/store vectorizer"), 50 cl::init(false), cl::Hidden); 51 52 // TODO: Remove this flag when we are confident with no regressions. 53 static cl::opt<bool> DisableRequireStructuredCFG( 54 "disable-nvptx-require-structured-cfg", 55 cl::desc("Transitional flag to turn off NVPTX's requirement on preserving " 56 "structured CFG. The requirement should be disabled only when " 57 "unexpected regressions happen."), 58 cl::init(false), cl::Hidden); 59 60 static cl::opt<bool> UseShortPointersOpt( 61 "nvptx-short-ptr", 62 cl::desc( 63 "Use 32-bit pointers for accessing const/local/shared address spaces."), 64 cl::init(false), cl::Hidden); 65 66 namespace llvm { 67 68 void initializeGenericToNVVMLegacyPassPass(PassRegistry &); 69 void initializeNVPTXAllocaHoistingPass(PassRegistry &); 70 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); 71 void initializeNVPTXAtomicLowerPass(PassRegistry &); 72 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); 73 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); 74 void initializeNVPTXLowerAllocaPass(PassRegistry &); 75 void initializeNVPTXLowerUnreachablePass(PassRegistry &); 76 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); 77 void initializeNVPTXLowerArgsPass(PassRegistry &); 78 void initializeNVPTXProxyRegErasurePass(PassRegistry &); 79 void initializeNVVMIntrRangePass(PassRegistry &); 80 void initializeNVVMReflectPass(PassRegistry &); 81 void initializeNVPTXAAWrapperPassPass(PassRegistry &); 82 void initializeNVPTXExternalAAWrapperPass(PassRegistry &); 83 84 } // end namespace llvm 85 86 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { 87 // Register the target. 88 RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32()); 89 RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64()); 90 91 PassRegistry &PR = *PassRegistry::getPassRegistry(); 92 // FIXME: This pass is really intended to be invoked during IR optimization, 93 // but it's very NVPTX-specific. 94 initializeNVVMReflectPass(PR); 95 initializeNVVMIntrRangePass(PR); 96 initializeGenericToNVVMLegacyPassPass(PR); 97 initializeNVPTXAllocaHoistingPass(PR); 98 initializeNVPTXAssignValidGlobalNamesPass(PR); 99 initializeNVPTXAtomicLowerPass(PR); 100 initializeNVPTXLowerArgsPass(PR); 101 initializeNVPTXLowerAllocaPass(PR); 102 initializeNVPTXLowerUnreachablePass(PR); 103 initializeNVPTXCtorDtorLoweringLegacyPass(PR); 104 initializeNVPTXLowerAggrCopiesPass(PR); 105 initializeNVPTXProxyRegErasurePass(PR); 106 initializeNVPTXDAGToDAGISelPass(PR); 107 initializeNVPTXAAWrapperPassPass(PR); 108 initializeNVPTXExternalAAWrapperPass(PR); 109 } 110 111 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { 112 std::string Ret = "e"; 113 114 if (!is64Bit) 115 Ret += "-p:32:32"; 116 else if (UseShortPointers) 117 Ret += "-p3:32:32-p4:32:32-p5:32:32"; 118 119 Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; 120 121 return Ret; 122 } 123 124 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, 125 StringRef CPU, StringRef FS, 126 const TargetOptions &Options, 127 std::optional<Reloc::Model> RM, 128 std::optional<CodeModel::Model> CM, 129 CodeGenOptLevel OL, bool is64bit) 130 // The pic relocation model is used regardless of what the client has 131 // specified, as it is the only relocation model currently supported. 132 : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT, 133 CPU, FS, Options, Reloc::PIC_, 134 getEffectiveCodeModel(CM, CodeModel::Small), OL), 135 is64bit(is64bit), UseShortPointers(UseShortPointersOpt), 136 TLOF(std::make_unique<NVPTXTargetObjectFile>()), 137 Subtarget(TT, std::string(CPU), std::string(FS), *this), 138 StrPool(StrAlloc) { 139 if (TT.getOS() == Triple::NVCL) 140 drvInterface = NVPTX::NVCL; 141 else 142 drvInterface = NVPTX::CUDA; 143 if (!DisableRequireStructuredCFG) 144 setRequiresStructuredCFG(true); 145 initAsmInfo(); 146 } 147 148 NVPTXTargetMachine::~NVPTXTargetMachine() = default; 149 150 void NVPTXTargetMachine32::anchor() {} 151 152 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, 153 StringRef CPU, StringRef FS, 154 const TargetOptions &Options, 155 std::optional<Reloc::Model> RM, 156 std::optional<CodeModel::Model> CM, 157 CodeGenOptLevel OL, bool JIT) 158 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} 159 160 void NVPTXTargetMachine64::anchor() {} 161 162 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, 163 StringRef CPU, StringRef FS, 164 const TargetOptions &Options, 165 std::optional<Reloc::Model> RM, 166 std::optional<CodeModel::Model> CM, 167 CodeGenOptLevel OL, bool JIT) 168 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} 169 170 namespace { 171 172 class NVPTXPassConfig : public TargetPassConfig { 173 public: 174 NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) 175 : TargetPassConfig(TM, PM) {} 176 177 NVPTXTargetMachine &getNVPTXTargetMachine() const { 178 return getTM<NVPTXTargetMachine>(); 179 } 180 181 void addIRPasses() override; 182 bool addInstSelector() override; 183 void addPreRegAlloc() override; 184 void addPostRegAlloc() override; 185 void addMachineSSAOptimization() override; 186 187 FunctionPass *createTargetRegisterAllocator(bool) override; 188 void addFastRegAlloc() override; 189 void addOptimizedRegAlloc() override; 190 191 bool addRegAssignAndRewriteFast() override { 192 llvm_unreachable("should not be used"); 193 } 194 195 bool addRegAssignAndRewriteOptimized() override { 196 llvm_unreachable("should not be used"); 197 } 198 199 private: 200 // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This 201 // function is only called in opt mode. 202 void addEarlyCSEOrGVNPass(); 203 204 // Add passes that propagate special memory spaces. 205 void addAddressSpaceInferencePasses(); 206 207 // Add passes that perform straight-line scalar optimizations. 208 void addStraightLineScalarOptimizationPasses(); 209 }; 210 211 } // end anonymous namespace 212 213 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { 214 return new NVPTXPassConfig(*this, PM); 215 } 216 217 MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo( 218 BumpPtrAllocator &Allocator, const Function &F, 219 const TargetSubtargetInfo *STI) const { 220 return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator, 221 F, STI); 222 } 223 224 void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { 225 AAM.registerFunctionAnalysis<NVPTXAA>(); 226 } 227 228 void NVPTXTargetMachine::registerPassBuilderCallbacks( 229 PassBuilder &PB, bool PopulateClassToPassNames) { 230 PB.registerPipelineParsingCallback( 231 [](StringRef PassName, FunctionPassManager &PM, 232 ArrayRef<PassBuilder::PipelineElement>) { 233 if (PassName == "nvvm-reflect") { 234 PM.addPass(NVVMReflectPass()); 235 return true; 236 } 237 if (PassName == "nvvm-intr-range") { 238 PM.addPass(NVVMIntrRangePass()); 239 return true; 240 } 241 return false; 242 }); 243 244 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { 245 FAM.registerPass([&] { return NVPTXAA(); }); 246 }); 247 248 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { 249 if (AAName == "nvptx-aa") { 250 AAM.registerFunctionAnalysis<NVPTXAA>(); 251 return true; 252 } 253 return false; 254 }); 255 256 PB.registerPipelineParsingCallback( 257 [](StringRef PassName, ModulePassManager &PM, 258 ArrayRef<PassBuilder::PipelineElement>) { 259 if (PassName == "nvptx-lower-ctor-dtor") { 260 PM.addPass(NVPTXCtorDtorLoweringPass()); 261 return true; 262 } 263 if (PassName == "generic-to-nvvm") { 264 PM.addPass(GenericToNVVMPass()); 265 return true; 266 } 267 return false; 268 }); 269 270 PB.registerPipelineStartEPCallback( 271 [this](ModulePassManager &PM, OptimizationLevel Level) { 272 FunctionPassManager FPM; 273 FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); 274 // FIXME: NVVMIntrRangePass is causing numerical discrepancies, 275 // investigate and re-enable. 276 // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); 277 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 278 }); 279 } 280 281 TargetTransformInfo 282 NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const { 283 return TargetTransformInfo(NVPTXTTIImpl(this, F)); 284 } 285 286 std::pair<const Value *, unsigned> 287 NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const { 288 if (auto *II = dyn_cast<IntrinsicInst>(V)) { 289 switch (II->getIntrinsicID()) { 290 case Intrinsic::nvvm_isspacep_const: 291 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST); 292 case Intrinsic::nvvm_isspacep_global: 293 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL); 294 case Intrinsic::nvvm_isspacep_local: 295 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL); 296 case Intrinsic::nvvm_isspacep_shared: 297 case Intrinsic::nvvm_isspacep_shared_cluster: 298 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED); 299 default: 300 break; 301 } 302 } 303 return std::make_pair(nullptr, -1); 304 } 305 306 void NVPTXPassConfig::addEarlyCSEOrGVNPass() { 307 if (getOptLevel() == CodeGenOptLevel::Aggressive) 308 addPass(createGVNPass()); 309 else 310 addPass(createEarlyCSEPass()); 311 } 312 313 void NVPTXPassConfig::addAddressSpaceInferencePasses() { 314 // NVPTXLowerArgs emits alloca for byval parameters which can often 315 // be eliminated by SROA. 316 addPass(createSROAPass()); 317 addPass(createNVPTXLowerAllocaPass()); 318 addPass(createInferAddressSpacesPass()); 319 addPass(createNVPTXAtomicLowerPass()); 320 } 321 322 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { 323 addPass(createSeparateConstOffsetFromGEPPass()); 324 addPass(createSpeculativeExecutionPass()); 325 // ReassociateGEPs exposes more opportunites for SLSR. See 326 // the example in reassociate-geps-and-slsr.ll. 327 addPass(createStraightLineStrengthReducePass()); 328 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 329 // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE 330 // for some of our benchmarks. 331 addEarlyCSEOrGVNPass(); 332 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 333 addPass(createNaryReassociatePass()); 334 // NaryReassociate on GEPs creates redundant common expressions, so run 335 // EarlyCSE after it. 336 addPass(createEarlyCSEPass()); 337 } 338 339 void NVPTXPassConfig::addIRPasses() { 340 // The following passes are known to not play well with virtual regs hanging 341 // around after register allocation (which in our case, is *all* registers). 342 // We explicitly disable them here. We do, however, need some functionality 343 // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the 344 // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 345 disablePass(&PrologEpilogCodeInserterID); 346 disablePass(&MachineLateInstrsCleanupID); 347 disablePass(&MachineCopyPropagationID); 348 disablePass(&TailDuplicateID); 349 disablePass(&StackMapLivenessID); 350 disablePass(&LiveDebugValuesID); 351 disablePass(&PostRAMachineSinkingID); 352 disablePass(&PostRASchedulerID); 353 disablePass(&FuncletLayoutID); 354 disablePass(&PatchableFunctionID); 355 disablePass(&ShrinkWrapID); 356 357 addPass(createNVPTXAAWrapperPass()); 358 addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { 359 if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>()) 360 AAR.addAAResult(WrapperPass->getResult()); 361 })); 362 363 // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running 364 // it here does nothing. But since we need it for correctness when lowering 365 // to NVPTX, run it here too, in case whoever built our pass pipeline didn't 366 // call addEarlyAsPossiblePasses. 367 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 368 addPass(createNVVMReflectPass(ST.getSmVersion())); 369 370 if (getOptLevel() != CodeGenOptLevel::None) 371 addPass(createNVPTXImageOptimizerPass()); 372 addPass(createNVPTXAssignValidGlobalNamesPass()); 373 addPass(createGenericToNVVMLegacyPass()); 374 375 // NVPTXLowerArgs is required for correctness and should be run right 376 // before the address space inference passes. 377 addPass(createNVPTXLowerArgsPass()); 378 if (getOptLevel() != CodeGenOptLevel::None) { 379 addAddressSpaceInferencePasses(); 380 addStraightLineScalarOptimizationPasses(); 381 } 382 383 addPass(createAtomicExpandPass()); 384 addPass(createNVPTXCtorDtorLoweringLegacyPass()); 385 386 // === LSR and other generic IR passes === 387 TargetPassConfig::addIRPasses(); 388 // EarlyCSE is not always strong enough to clean up what LSR produces. For 389 // example, GVN can combine 390 // 391 // %0 = add %a, %b 392 // %1 = add %b, %a 393 // 394 // and 395 // 396 // %0 = shl nsw %a, 2 397 // %1 = shl %a, 2 398 // 399 // but EarlyCSE can do neither of them. 400 if (getOptLevel() != CodeGenOptLevel::None) { 401 addEarlyCSEOrGVNPass(); 402 if (!DisableLoadStoreVectorizer) 403 addPass(createLoadStoreVectorizerPass()); 404 addPass(createSROAPass()); 405 } 406 407 const auto &Options = getNVPTXTargetMachine().Options; 408 addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, 409 Options.NoTrapAfterNoreturn)); 410 } 411 412 bool NVPTXPassConfig::addInstSelector() { 413 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 414 415 addPass(createLowerAggrCopies()); 416 addPass(createAllocaHoisting()); 417 addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); 418 419 if (!ST.hasImageHandles()) 420 addPass(createNVPTXReplaceImageHandlesPass()); 421 422 return false; 423 } 424 425 void NVPTXPassConfig::addPreRegAlloc() { 426 // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive. 427 addPass(createNVPTXProxyRegErasurePass()); 428 } 429 430 void NVPTXPassConfig::addPostRegAlloc() { 431 addPass(createNVPTXPrologEpilogPass()); 432 if (getOptLevel() != CodeGenOptLevel::None) { 433 // NVPTXPrologEpilogPass calculates frame object offset and replace frame 434 // index with VRFrame register. NVPTXPeephole need to be run after that and 435 // will replace VRFrame with VRFrameLocal when possible. 436 addPass(createNVPTXPeephole()); 437 } 438 } 439 440 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { 441 return nullptr; // No reg alloc 442 } 443 444 void NVPTXPassConfig::addFastRegAlloc() { 445 addPass(&PHIEliminationID); 446 addPass(&TwoAddressInstructionPassID); 447 } 448 449 void NVPTXPassConfig::addOptimizedRegAlloc() { 450 addPass(&ProcessImplicitDefsID); 451 addPass(&LiveVariablesID); 452 addPass(&MachineLoopInfoID); 453 addPass(&PHIEliminationID); 454 455 addPass(&TwoAddressInstructionPassID); 456 addPass(&RegisterCoalescerID); 457 458 // PreRA instruction scheduling. 459 if (addPass(&MachineSchedulerID)) 460 printAndVerify("After Machine Scheduling"); 461 462 addPass(&StackSlotColoringID); 463 464 // FIXME: Needs physical registers 465 // addPass(&MachineLICMID); 466 467 printAndVerify("After StackSlotColoring"); 468 } 469 470 void NVPTXPassConfig::addMachineSSAOptimization() { 471 // Pre-ra tail duplication. 472 if (addPass(&EarlyTailDuplicateID)) 473 printAndVerify("After Pre-RegAlloc TailDuplicate"); 474 475 // Optimize PHIs before DCE: removing dead PHI cycles may make more 476 // instructions dead. 477 addPass(&OptimizePHIsID); 478 479 // This pass merges large allocas. StackSlotColoring is a different pass 480 // which merges spill slots. 481 addPass(&StackColoringID); 482 483 // If the target requests it, assign local variables to stack slots relative 484 // to one another and simplify frame index references where possible. 485 addPass(&LocalStackSlotAllocationID); 486 487 // With optimization, dead code should already be eliminated. However 488 // there is one known exception: lowered code for arguments that are only 489 // used by tail calls, where the tail calls reuse the incoming stack 490 // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). 491 addPass(&DeadMachineInstructionElimID); 492 printAndVerify("After codegen DCE pass"); 493 494 // Allow targets to insert passes that improve instruction level parallelism, 495 // like if-conversion. Such passes will typically need dominator trees and 496 // loop info, just like LICM and CSE below. 497 if (addILPOpts()) 498 printAndVerify("After ILP optimizations"); 499 500 addPass(&EarlyMachineLICMID); 501 addPass(&MachineCSEID); 502 503 addPass(&MachineSinkingID); 504 printAndVerify("After Machine LICM, CSE and Sinking passes"); 505 506 addPass(&PeepholeOptimizerID); 507 printAndVerify("After codegen peephole optimization pass"); 508 } 509