//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
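
// The flags above are registered in LLVM's global cl::opt table, so they can
// be passed straight to tools that link this backend in. A sketch of an
// invocation (the input/output file names are placeholders):
//
//   llc -mtriple=nvptx64-nvidia-cuda -nvptx-short-ptr kernel.ll -o kernel.ptx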

namespace llvm {

void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}
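
// For reference, the layout strings computeDataLayout (below) produces,
// derived directly from the concatenation it performs:
//
//   32-bit:                 "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit:                 "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit, short pointers: "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-"
//                           "v16:16-v32:32-n16:32:64"
//
// Note that short pointers only take effect in 64-bit mode, because of the
// else-if below.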

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}
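
// With the parsing callback above registered, the NVPTX-specific passes can
// be named in a textual new-pass-manager pipeline. A sketch, assuming opt is
// pointed at this target so the callback actually fires (the input file name
// is a placeholder):
//
//   opt -mtriple=nvptx64-nvidia-cuda -passes='nvvm-reflect,nvvm-intr-range' \
//       kernel.ll -S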

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits an alloca for each byval parameter; these can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after
    // that, and replaces VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}
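
// PTX is a virtual ISA: this backend keeps everything in virtual registers
// and leaves actual register allocation to the downstream PTX assembler
// (ptxas). Hence no register allocator is created below, and the
// addRegAssignAndRewrite* hooks in NVPTXPassConfig are marked unreachable.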

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}