//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXDAGToDAGISelPass(PR);
}
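// A minimal sketch of how a client reaches the machines registered above via
// the usual TargetRegistry flow. This is illustrative only; the triple and
// "sm_70" CPU below are example values, not something this file prescribes:
//
//   std::string Err;
//   const Target *T =
//       TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", Err);
//   TargetMachine *TM = T->createTargetMachine(
//       "nvptx64-nvidia-cuda", /*CPU=*/"sm_70", /*Features=*/"",
//       TargetOptions(), /*RM=*/std::nullopt);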
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}
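// For reference, the three configurations above produce:
//
//   32-bit:            "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit (default):  "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit, nvptx-short-ptr:
//     "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"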
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}
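// With the parsing callback above in place, both passes can be named directly
// in a new-PM pipeline string. An example invocation (assuming opt is pointed
// at an NVPTX target so these callbacks are actually registered; not
// exercised by this file):
//
//   opt -mtriple=nvptx64-nvidia-cuda -passes=nvvm-reflect -S in.ll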
TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}
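// As an illustration, given IR along these lines (a hypothetical snippet):
//
//   %is.global = call i1 @llvm.nvvm.isspacep.global(ptr %p)
//   br i1 %is.global, label %global.path, label %generic.path
//
// getPredicatedAddrSpace(%is.global) returns {%p, ADDRESS_SPACE_GLOBAL},
// which lets InferAddressSpaces specialize uses of %p that are dominated by
// the %global.path branch.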
void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}
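// One way to inspect the legacy-PM codegen pipeline this config produces is
// llc's pass-structure dump. An example invocation (illustrative only; the
// triple and CPU are example values, not exercised by this file):
//
//   llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -debug-pass=Structure in.ll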