//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
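  // Registering the passes with the global PassRegistry here also lets tools
  // that look passes up by name (e.g. opt, or -print-before/-print-after)
  // find the NVPTX-specific IR passes.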
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
                                                      bool DebugPassManager) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this, DebugPassManager](ModulePassManager &PM,
                               PassBuilder::OptimizationLevel Level) {
        FunctionPassManager FPM(DebugPassManager);
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
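  // (PTX is a virtual ISA: the emitted code keeps virtual registers, and the
  // real register allocation is done later by ptxas, which is why these
  // passes must stay disabled.)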
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
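  // addPass(AnalysisID) returns a null ID when the pass has been disabled or
  // overridden, so the print/verify step below only happens when the scheduler
  // was actually added to the pipeline.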
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}