1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Top-level implementation for the NVPTX target. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "NVPTXTargetMachine.h" 14 #include "NVPTX.h" 15 #include "NVPTXAliasAnalysis.h" 16 #include "NVPTXAllocaHoisting.h" 17 #include "NVPTXAtomicLower.h" 18 #include "NVPTXCtorDtorLowering.h" 19 #include "NVPTXLowerAggrCopies.h" 20 #include "NVPTXMachineFunctionInfo.h" 21 #include "NVPTXTargetObjectFile.h" 22 #include "NVPTXTargetTransformInfo.h" 23 #include "TargetInfo/NVPTXTargetInfo.h" 24 #include "llvm/ADT/STLExtras.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/CodeGen/Passes.h" 27 #include "llvm/CodeGen/TargetPassConfig.h" 28 #include "llvm/IR/IntrinsicsNVPTX.h" 29 #include "llvm/MC/TargetRegistry.h" 30 #include "llvm/Pass.h" 31 #include "llvm/Passes/PassBuilder.h" 32 #include "llvm/Support/CommandLine.h" 33 #include "llvm/Target/TargetMachine.h" 34 #include "llvm/Target/TargetOptions.h" 35 #include "llvm/TargetParser/Triple.h" 36 #include "llvm/Transforms/IPO/ExpandVariadics.h" 37 #include "llvm/Transforms/Scalar.h" 38 #include "llvm/Transforms/Scalar/GVN.h" 39 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 40 #include <cassert> 41 #include <optional> 42 #include <string> 43 44 using namespace llvm; 45 46 // LSV is still relatively new; this switch lets us turn it off in case we 47 // encounter (or suspect) a bug. 48 static cl::opt<bool> 49 DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer", 50 cl::desc("Disable load/store vectorizer"), 51 cl::init(false), cl::Hidden); 52 53 // TODO: Remove this flag when we are confident with no regressions. 54 static cl::opt<bool> DisableRequireStructuredCFG( 55 "disable-nvptx-require-structured-cfg", 56 cl::desc("Transitional flag to turn off NVPTX's requirement on preserving " 57 "structured CFG. The requirement should be disabled only when " 58 "unexpected regressions happen."), 59 cl::init(false), cl::Hidden); 60 61 static cl::opt<bool> UseShortPointersOpt( 62 "nvptx-short-ptr", 63 cl::desc( 64 "Use 32-bit pointers for accessing const/local/shared address spaces."), 65 cl::init(false), cl::Hidden); 66 67 namespace llvm { 68 69 void initializeGenericToNVVMLegacyPassPass(PassRegistry &); 70 void initializeNVPTXAllocaHoistingPass(PassRegistry &); 71 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); 72 void initializeNVPTXAtomicLowerPass(PassRegistry &); 73 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); 74 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); 75 void initializeNVPTXLowerAllocaPass(PassRegistry &); 76 void initializeNVPTXLowerUnreachablePass(PassRegistry &); 77 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); 78 void initializeNVPTXLowerArgsPass(PassRegistry &); 79 void initializeNVPTXProxyRegErasurePass(PassRegistry &); 80 void initializeNVVMIntrRangePass(PassRegistry &); 81 void initializeNVVMReflectPass(PassRegistry &); 82 void initializeNVPTXAAWrapperPassPass(PassRegistry &); 83 void initializeNVPTXExternalAAWrapperPass(PassRegistry &); 84 85 } // end namespace llvm 86 87 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { 88 // Register the target. 89 RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32()); 90 RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64()); 91 92 PassRegistry &PR = *PassRegistry::getPassRegistry(); 93 // FIXME: This pass is really intended to be invoked during IR optimization, 94 // but it's very NVPTX-specific. 95 initializeNVVMReflectPass(PR); 96 initializeNVVMIntrRangePass(PR); 97 initializeGenericToNVVMLegacyPassPass(PR); 98 initializeNVPTXAllocaHoistingPass(PR); 99 initializeNVPTXAssignValidGlobalNamesPass(PR); 100 initializeNVPTXAtomicLowerPass(PR); 101 initializeNVPTXLowerArgsPass(PR); 102 initializeNVPTXLowerAllocaPass(PR); 103 initializeNVPTXLowerUnreachablePass(PR); 104 initializeNVPTXCtorDtorLoweringLegacyPass(PR); 105 initializeNVPTXLowerAggrCopiesPass(PR); 106 initializeNVPTXProxyRegErasurePass(PR); 107 initializeNVPTXDAGToDAGISelLegacyPass(PR); 108 initializeNVPTXAAWrapperPassPass(PR); 109 initializeNVPTXExternalAAWrapperPass(PR); 110 } 111 112 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { 113 std::string Ret = "e"; 114 115 if (!is64Bit) 116 Ret += "-p:32:32"; 117 else if (UseShortPointers) 118 Ret += "-p3:32:32-p4:32:32-p5:32:32"; 119 120 Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; 121 122 return Ret; 123 } 124 125 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, 126 StringRef CPU, StringRef FS, 127 const TargetOptions &Options, 128 std::optional<Reloc::Model> RM, 129 std::optional<CodeModel::Model> CM, 130 CodeGenOptLevel OL, bool is64bit) 131 // The pic relocation model is used regardless of what the client has 132 // specified, as it is the only relocation model currently supported. 133 : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT, 134 CPU, FS, Options, Reloc::PIC_, 135 getEffectiveCodeModel(CM, CodeModel::Small), OL), 136 is64bit(is64bit), TLOF(std::make_unique<NVPTXTargetObjectFile>()), 137 Subtarget(TT, std::string(CPU), std::string(FS), *this), 138 StrPool(StrAlloc) { 139 if (TT.getOS() == Triple::NVCL) 140 drvInterface = NVPTX::NVCL; 141 else 142 drvInterface = NVPTX::CUDA; 143 if (!DisableRequireStructuredCFG) 144 setRequiresStructuredCFG(true); 145 initAsmInfo(); 146 } 147 148 NVPTXTargetMachine::~NVPTXTargetMachine() = default; 149 150 void NVPTXTargetMachine32::anchor() {} 151 152 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, 153 StringRef CPU, StringRef FS, 154 const TargetOptions &Options, 155 std::optional<Reloc::Model> RM, 156 std::optional<CodeModel::Model> CM, 157 CodeGenOptLevel OL, bool JIT) 158 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} 159 160 void NVPTXTargetMachine64::anchor() {} 161 162 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, 163 StringRef CPU, StringRef FS, 164 const TargetOptions &Options, 165 std::optional<Reloc::Model> RM, 166 std::optional<CodeModel::Model> CM, 167 CodeGenOptLevel OL, bool JIT) 168 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} 169 170 namespace { 171 172 class NVPTXPassConfig : public TargetPassConfig { 173 public: 174 NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) 175 : TargetPassConfig(TM, PM) {} 176 177 NVPTXTargetMachine &getNVPTXTargetMachine() const { 178 return getTM<NVPTXTargetMachine>(); 179 } 180 181 void addIRPasses() override; 182 bool addInstSelector() override; 183 void addPreRegAlloc() override; 184 void addPostRegAlloc() override; 185 void addMachineSSAOptimization() override; 186 187 FunctionPass *createTargetRegisterAllocator(bool) override; 188 void addFastRegAlloc() override; 189 void addOptimizedRegAlloc() override; 190 191 bool addRegAssignAndRewriteFast() override { 192 llvm_unreachable("should not be used"); 193 } 194 195 bool addRegAssignAndRewriteOptimized() override { 196 llvm_unreachable("should not be used"); 197 } 198 199 private: 200 // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This 201 // function is only called in opt mode. 202 void addEarlyCSEOrGVNPass(); 203 204 // Add passes that propagate special memory spaces. 205 void addAddressSpaceInferencePasses(); 206 207 // Add passes that perform straight-line scalar optimizations. 208 void addStraightLineScalarOptimizationPasses(); 209 }; 210 211 } // end anonymous namespace 212 213 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { 214 return new NVPTXPassConfig(*this, PM); 215 } 216 217 MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo( 218 BumpPtrAllocator &Allocator, const Function &F, 219 const TargetSubtargetInfo *STI) const { 220 return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator, 221 F, STI); 222 } 223 224 void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { 225 AAM.registerFunctionAnalysis<NVPTXAA>(); 226 } 227 228 void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { 229 #define GET_PASS_REGISTRY "NVPTXPassRegistry.def" 230 #include "llvm/Passes/TargetPassRegistry.inc" 231 232 PB.registerPipelineStartEPCallback( 233 [this](ModulePassManager &PM, OptimizationLevel Level) { 234 FunctionPassManager FPM; 235 FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); 236 // Note: NVVMIntrRangePass was causing numerical discrepancies at one 237 // point, if issues crop up, consider disabling. 238 FPM.addPass(NVVMIntrRangePass()); 239 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 240 }); 241 } 242 243 TargetTransformInfo 244 NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const { 245 return TargetTransformInfo(NVPTXTTIImpl(this, F)); 246 } 247 248 std::pair<const Value *, unsigned> 249 NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const { 250 if (auto *II = dyn_cast<IntrinsicInst>(V)) { 251 switch (II->getIntrinsicID()) { 252 case Intrinsic::nvvm_isspacep_const: 253 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST); 254 case Intrinsic::nvvm_isspacep_global: 255 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL); 256 case Intrinsic::nvvm_isspacep_local: 257 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL); 258 case Intrinsic::nvvm_isspacep_shared: 259 case Intrinsic::nvvm_isspacep_shared_cluster: 260 return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED); 261 default: 262 break; 263 } 264 } 265 return std::make_pair(nullptr, -1); 266 } 267 268 void NVPTXPassConfig::addEarlyCSEOrGVNPass() { 269 if (getOptLevel() == CodeGenOptLevel::Aggressive) 270 addPass(createGVNPass()); 271 else 272 addPass(createEarlyCSEPass()); 273 } 274 275 void NVPTXPassConfig::addAddressSpaceInferencePasses() { 276 // NVPTXLowerArgs emits alloca for byval parameters which can often 277 // be eliminated by SROA. 278 addPass(createSROAPass()); 279 addPass(createNVPTXLowerAllocaPass()); 280 addPass(createInferAddressSpacesPass()); 281 addPass(createNVPTXAtomicLowerPass()); 282 } 283 284 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { 285 addPass(createSeparateConstOffsetFromGEPPass()); 286 addPass(createSpeculativeExecutionPass()); 287 // ReassociateGEPs exposes more opportunites for SLSR. See 288 // the example in reassociate-geps-and-slsr.ll. 289 addPass(createStraightLineStrengthReducePass()); 290 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 291 // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE 292 // for some of our benchmarks. 293 addEarlyCSEOrGVNPass(); 294 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 295 addPass(createNaryReassociatePass()); 296 // NaryReassociate on GEPs creates redundant common expressions, so run 297 // EarlyCSE after it. 298 addPass(createEarlyCSEPass()); 299 } 300 301 void NVPTXPassConfig::addIRPasses() { 302 // The following passes are known to not play well with virtual regs hanging 303 // around after register allocation (which in our case, is *all* registers). 304 // We explicitly disable them here. We do, however, need some functionality 305 // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the 306 // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 307 disablePass(&PrologEpilogCodeInserterID); 308 disablePass(&MachineLateInstrsCleanupID); 309 disablePass(&MachineCopyPropagationID); 310 disablePass(&TailDuplicateID); 311 disablePass(&StackMapLivenessID); 312 disablePass(&LiveDebugValuesID); 313 disablePass(&PostRAMachineSinkingID); 314 disablePass(&PostRASchedulerID); 315 disablePass(&FuncletLayoutID); 316 disablePass(&PatchableFunctionID); 317 disablePass(&ShrinkWrapID); 318 319 addPass(createNVPTXAAWrapperPass()); 320 addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { 321 if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>()) 322 AAR.addAAResult(WrapperPass->getResult()); 323 })); 324 325 // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running 326 // it here does nothing. But since we need it for correctness when lowering 327 // to NVPTX, run it here too, in case whoever built our pass pipeline didn't 328 // call addEarlyAsPossiblePasses. 329 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 330 addPass(createNVVMReflectPass(ST.getSmVersion())); 331 332 if (getOptLevel() != CodeGenOptLevel::None) 333 addPass(createNVPTXImageOptimizerPass()); 334 addPass(createNVPTXAssignValidGlobalNamesPass()); 335 addPass(createGenericToNVVMLegacyPass()); 336 337 // NVPTXLowerArgs is required for correctness and should be run right 338 // before the address space inference passes. 339 addPass(createNVPTXLowerArgsPass()); 340 if (getOptLevel() != CodeGenOptLevel::None) { 341 addAddressSpaceInferencePasses(); 342 addStraightLineScalarOptimizationPasses(); 343 } 344 345 addPass(createAtomicExpandLegacyPass()); 346 addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); 347 addPass(createNVPTXCtorDtorLoweringLegacyPass()); 348 349 // === LSR and other generic IR passes === 350 TargetPassConfig::addIRPasses(); 351 // EarlyCSE is not always strong enough to clean up what LSR produces. For 352 // example, GVN can combine 353 // 354 // %0 = add %a, %b 355 // %1 = add %b, %a 356 // 357 // and 358 // 359 // %0 = shl nsw %a, 2 360 // %1 = shl %a, 2 361 // 362 // but EarlyCSE can do neither of them. 363 if (getOptLevel() != CodeGenOptLevel::None) { 364 addEarlyCSEOrGVNPass(); 365 if (!DisableLoadStoreVectorizer) 366 addPass(createLoadStoreVectorizerPass()); 367 addPass(createSROAPass()); 368 } 369 370 const auto &Options = getNVPTXTargetMachine().Options; 371 addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, 372 Options.NoTrapAfterNoreturn)); 373 } 374 375 bool NVPTXPassConfig::addInstSelector() { 376 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 377 378 addPass(createLowerAggrCopies()); 379 addPass(createAllocaHoisting()); 380 addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); 381 382 if (!ST.hasImageHandles()) 383 addPass(createNVPTXReplaceImageHandlesPass()); 384 385 return false; 386 } 387 388 void NVPTXPassConfig::addPreRegAlloc() { 389 // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive. 390 addPass(createNVPTXProxyRegErasurePass()); 391 } 392 393 void NVPTXPassConfig::addPostRegAlloc() { 394 addPass(createNVPTXPrologEpilogPass()); 395 if (getOptLevel() != CodeGenOptLevel::None) { 396 // NVPTXPrologEpilogPass calculates frame object offset and replace frame 397 // index with VRFrame register. NVPTXPeephole need to be run after that and 398 // will replace VRFrame with VRFrameLocal when possible. 399 addPass(createNVPTXPeephole()); 400 } 401 } 402 403 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { 404 return nullptr; // No reg alloc 405 } 406 407 void NVPTXPassConfig::addFastRegAlloc() { 408 addPass(&PHIEliminationID); 409 addPass(&TwoAddressInstructionPassID); 410 } 411 412 void NVPTXPassConfig::addOptimizedRegAlloc() { 413 addPass(&ProcessImplicitDefsID); 414 addPass(&LiveVariablesID); 415 addPass(&MachineLoopInfoID); 416 addPass(&PHIEliminationID); 417 418 addPass(&TwoAddressInstructionPassID); 419 addPass(&RegisterCoalescerID); 420 421 // PreRA instruction scheduling. 422 if (addPass(&MachineSchedulerID)) 423 printAndVerify("After Machine Scheduling"); 424 425 addPass(&StackSlotColoringID); 426 427 // FIXME: Needs physical registers 428 // addPass(&MachineLICMID); 429 430 printAndVerify("After StackSlotColoring"); 431 } 432 433 void NVPTXPassConfig::addMachineSSAOptimization() { 434 // Pre-ra tail duplication. 435 if (addPass(&EarlyTailDuplicateID)) 436 printAndVerify("After Pre-RegAlloc TailDuplicate"); 437 438 // Optimize PHIs before DCE: removing dead PHI cycles may make more 439 // instructions dead. 440 addPass(&OptimizePHIsID); 441 442 // This pass merges large allocas. StackSlotColoring is a different pass 443 // which merges spill slots. 444 addPass(&StackColoringID); 445 446 // If the target requests it, assign local variables to stack slots relative 447 // to one another and simplify frame index references where possible. 448 addPass(&LocalStackSlotAllocationID); 449 450 // With optimization, dead code should already be eliminated. However 451 // there is one known exception: lowered code for arguments that are only 452 // used by tail calls, where the tail calls reuse the incoming stack 453 // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). 454 addPass(&DeadMachineInstructionElimID); 455 printAndVerify("After codegen DCE pass"); 456 457 // Allow targets to insert passes that improve instruction level parallelism, 458 // like if-conversion. Such passes will typically need dominator trees and 459 // loop info, just like LICM and CSE below. 460 if (addILPOpts()) 461 printAndVerify("After ILP optimizations"); 462 463 addPass(&EarlyMachineLICMID); 464 addPass(&MachineCSEID); 465 466 addPass(&MachineSinkingID); 467 printAndVerify("After Machine LICM, CSE and Sinking passes"); 468 469 addPass(&PeepholeOptimizerID); 470 printAndVerify("After codegen peephole optimization pass"); 471 } 472