//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV (the LoadStoreVectorizer) is still relatively new; this switch lets us
// turn it off in case we encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

// Forward declarations of the legacy-PM initializers for the NVPTX-specific
// passes registered in LLVMInitializeNVPTXTarget() below; each is defined in
// the corresponding pass implementation file.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

// Entry point called by the target registry: registers the 32- and 64-bit
// NVPTX target machines and initializes the NVPTX-specific passes.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

// Builds the LLVM data-layout string for NVPTX. "e" selects little-endian;
// 32-bit targets get 32-bit generic pointers, and with UseShortPointers the
// p3/p4/p5 address spaces (const/local/shared, per the -nvptx-short-ptr option
// description) use 32-bit pointers even on 64-bit targets.
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  // Select the driver interface from the OS component of the triple:
  // NVCL for OpenCL-style drivers, CUDA otherwise.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // NVPTX requires a structured CFG unless the transitional escape-hatch
  // flag is set (see DisableRequireStructuredCFG above).
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

// 32-bit variant: forwards to the common constructor with is64bit = false.
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

// 64-bit variant: forwards to the common constructor with is64bit = true.
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

// Configures the codegen pass pipeline for NVPTX. NVPTX does no physical
// register allocation (createTargetRegisterAllocator returns nullptr below),
// so the register-allocation hooks are overridden to run only the pre-RA
// preparation passes, and the assign-and-rewrite hooks must never be reached.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

// Legacy pass-manager hook: schedule NVVMReflect and NVVMIntrRange as early
// as possible in the IR optimization pipeline.
void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

// New pass-manager hooks: make "nvvm-reflect" and "nvvm-intr-range" parsable
// in -passes= pipelines, and run NVVMReflect at the start of the default
// pipeline (NVVMIntrRange is currently disabled, see FIXME below).
void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

// If V is one of the nvvm.isspacep.* address-space-predicate intrinsics,
// return its pointer operand paired with the address space the intrinsic
// tests for; otherwise return {nullptr, -1}.
std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // Subtargets without native image-handle support need the handles
  // replaced after instruction selection.
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

// With no register allocator, the "fast" RA path only needs the SSA-exit
// preparation passes.
void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

// With no register allocator, the "optimized" RA path runs only the pre-RA
// analysis/preparation passes and stops before any register assignment.
void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}