//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNNSAReassign.h"
#include "GCNPreRALongBranchReg.h"
#include "GCNPreRAOptimizations.h"
#include "GCNRewritePartialRegUses.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeExecMaskingPreRA.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPostRABundler.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "SIWholeQuadMode.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h" 87 #include "llvm/IR/IntrinsicsAMDGPU.h" 88 #include "llvm/IR/PassManager.h" 89 #include "llvm/IR/PatternMatch.h" 90 #include "llvm/InitializePasses.h" 91 #include "llvm/MC/TargetRegistry.h" 92 #include "llvm/Passes/PassBuilder.h" 93 #include "llvm/Support/Compiler.h" 94 #include "llvm/Support/FormatVariadic.h" 95 #include "llvm/Transforms/HipStdPar/HipStdPar.h" 96 #include "llvm/Transforms/IPO.h" 97 #include "llvm/Transforms/IPO/AlwaysInliner.h" 98 #include "llvm/Transforms/IPO/ExpandVariadics.h" 99 #include "llvm/Transforms/IPO/GlobalDCE.h" 100 #include "llvm/Transforms/IPO/Internalize.h" 101 #include "llvm/Transforms/Scalar.h" 102 #include "llvm/Transforms/Scalar/EarlyCSE.h" 103 #include "llvm/Transforms/Scalar/FlattenCFG.h" 104 #include "llvm/Transforms/Scalar/GVN.h" 105 #include "llvm/Transforms/Scalar/InferAddressSpaces.h" 106 #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" 107 #include "llvm/Transforms/Scalar/NaryReassociate.h" 108 #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" 109 #include "llvm/Transforms/Scalar/Sink.h" 110 #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" 111 #include "llvm/Transforms/Scalar/StructurizeCFG.h" 112 #include "llvm/Transforms/Utils.h" 113 #include "llvm/Transforms/Utils/FixIrreducible.h" 114 #include "llvm/Transforms/Utils/LCSSA.h" 115 #include "llvm/Transforms/Utils/LowerSwitch.h" 116 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 117 #include "llvm/Transforms/Utils/UnifyLoopExits.h" 118 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 119 #include <optional> 120 121 using namespace llvm; 122 using namespace llvm::PatternMatch; 123 124 namespace { 125 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { 126 public: 127 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 128 : RegisterRegAllocBase(N, D, C) {} 129 }; 130 131 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { 132 public: 133 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 134 : RegisterRegAllocBase(N, D, C) {} 135 }; 136 137 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> { 138 public: 139 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 140 : RegisterRegAllocBase(N, D, C) {} 141 }; 142 143 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, 144 const MachineRegisterInfo &MRI, 145 const Register Reg) { 146 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 147 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC); 148 } 149 150 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, 151 const MachineRegisterInfo &MRI, 152 const Register Reg) { 153 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 154 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC); 155 } 156 157 static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI, 158 const MachineRegisterInfo &MRI, 159 const Register Reg) { 160 const SIMachineFunctionInfo *MFI = 161 MRI.getMF().getInfo<SIMachineFunctionInfo>(); 162 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 163 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) && 164 MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); 165 } 166 167 /// -{sgpr|wwm|vgpr}-regalloc=... command line option. 
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
allocator", createGreedySGPRRegisterAllocator); 264 265 static SGPRRegisterRegAlloc fastRegAllocSGPR( 266 "fast", "fast register allocator", createFastSGPRRegisterAllocator); 267 268 269 static VGPRRegisterRegAlloc basicRegAllocVGPR( 270 "basic", "basic register allocator", createBasicVGPRRegisterAllocator); 271 static VGPRRegisterRegAlloc greedyRegAllocVGPR( 272 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); 273 274 static VGPRRegisterRegAlloc fastRegAllocVGPR( 275 "fast", "fast register allocator", createFastVGPRRegisterAllocator); 276 static WWMRegisterRegAlloc basicRegAllocWWMReg("basic", 277 "basic register allocator", 278 createBasicWWMRegisterAllocator); 279 static WWMRegisterRegAlloc 280 greedyRegAllocWWMReg("greedy", "greedy register allocator", 281 createGreedyWWMRegisterAllocator); 282 static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator", 283 createFastWWMRegisterAllocator); 284 285 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 286 return Phase == ThinOrFullLTOPhase::FullLTOPreLink || 287 Phase == ThinOrFullLTOPhase::ThinLTOPreLink; 288 } 289 } // anonymous namespace 290 291 static cl::opt<bool> 292 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, 293 cl::desc("Run early if-conversion"), 294 cl::init(false)); 295 296 static cl::opt<bool> 297 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, 298 cl::desc("Run pre-RA exec mask optimizations"), 299 cl::init(true)); 300 301 static cl::opt<bool> 302 LowerCtorDtor("amdgpu-lower-global-ctor-dtor", 303 cl::desc("Lower GPU ctor / dtors to globals on the device."), 304 cl::init(true), cl::Hidden); 305 306 // Option to disable vectorizer for tests. 307 static cl::opt<bool> EnableLoadStoreVectorizer( 308 "amdgpu-load-store-vectorizer", 309 cl::desc("Enable load store vectorizer"), 310 cl::init(true), 311 cl::Hidden); 312 313 // Option to control global loads scalarization 314 static cl::opt<bool> ScalarizeGlobal( 315 "amdgpu-scalarize-global-loads", 316 cl::desc("Enable global load scalarization"), 317 cl::init(true), 318 cl::Hidden); 319 320 // Option to run internalize pass. 321 static cl::opt<bool> InternalizeSymbols( 322 "amdgpu-internalize-symbols", 323 cl::desc("Enable elimination of non-kernel functions and unused globals"), 324 cl::init(false), 325 cl::Hidden); 326 327 // Option to inline all early. 
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
    cl::desc("Enable AMDGPU Alias Analysis"),
    cl::init(true));

// Enable lib call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11+ s_delay_alu insertion.
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD.
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// This option is used in lit tests to prevent dead-code elimination of the
// patterns being inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable the pass that lowers LDS to global "
                              "memory and ASan-instruments the resulting IR."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
    "amdgpu-link-time-closed-world",
    cl::desc("Whether the closed-world assumption holds at link time"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUTarget() {
  // Register the target.
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeR600EmitClauseMarkersPass(*PR);
  initializeR600MachineCFGStructurizerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUAsmPrinterPass(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
  initializeSIFoldOperandsLegacyPass(*PR);
  initializeSIPeepholeSDWALegacyPass(*PR);
  initializeSIShrinkInstructionsLegacyPass(*PR);
  initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
  initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
  initializeSILoadStoreOptimizerLegacyPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUSwLowerLDSLegacyPass(*PR);
  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
  initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeSIAnnotateControlFlowLegacyPass(*PR);
  initializeAMDGPUInsertDelayAluLegacyPass(*PR);
  initializeSIInsertHardClausesLegacyPass(*PR);
  initializeSIInsertWaitcntsLegacyPass(*PR);
  initializeSIModeRegisterLegacyPass(*PR);
  initializeSIWholeQuadModeLegacyPass(*PR);
  initializeSILowerControlFlowLegacyPass(*PR);
  initializeSIPreEmitPeepholeLegacyPass(*PR);
  initializeSILateBranchLoweringLegacyPass(*PR);
  initializeSIMemoryLegalizerLegacyPass(*PR);
  initializeSIOptimizeExecMaskingLegacyPass(*PR);
  initializeSIPreAllocateWWMRegsLegacyPass(*PR);
  initializeSIFormMemoryClausesLegacyPass(*PR);
  initializeSIPostRABundlerLegacyPass(*PR);
  initializeGCNCreateVOPDLegacyPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
  initializeGCNNSAReassignLegacyPass(*PR);
  initializeGCNPreRAOptimizationsLegacyPass(*PR);
  initializeGCNPreRALongBranchRegLegacyPass(*PR);
  initializeGCNRewritePartialRegUsesLegacyPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
  initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
  initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp",
                           "Run GCN scheduler to maximize ILP",
                           createGCNMaxILPMachineScheduler);
"Run GCN scheduler to maximize ilp", 656 createGCNMaxILPMachineScheduler); 657 658 static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry( 659 "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause", 660 createGCNMaxMemoryClauseMachineScheduler); 661 662 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry( 663 "gcn-iterative-max-occupancy-experimental", 664 "Run GCN scheduler to maximize occupancy (experimental)", 665 createIterativeGCNMaxOccupancyMachineScheduler); 666 667 static MachineSchedRegistry GCNMinRegSchedRegistry( 668 "gcn-iterative-minreg", 669 "Run GCN iterative scheduler for minimal register usage (experimental)", 670 createMinRegScheduler); 671 672 static MachineSchedRegistry GCNILPSchedRegistry( 673 "gcn-iterative-ilp", 674 "Run GCN iterative scheduler for ILP scheduling (experimental)", 675 createIterativeILPMachineScheduler); 676 677 static StringRef computeDataLayout(const Triple &TT) { 678 if (TT.getArch() == Triple::r600) { 679 // 32-bit pointers. 680 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 681 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; 682 } 683 684 // 32-bit private, local, and region pointers. 64-bit global, constant and 685 // flat. 160-bit non-integral fat buffer pointers that include a 128-bit 686 // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values 687 // (address space 7), and 128-bit non-integral buffer resourcees (address 688 // space 8) which cannot be non-trivilally accessed by LLVM memory operations 689 // like getelementptr. 690 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 691 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-" 692 "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-" 693 "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"; 694 } 695 696 LLVM_READNONE 697 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 698 if (!GPU.empty()) 699 return GPU; 700 701 // Need to default to a target with flat support for HSA. 702 if (TT.isAMDGCN()) 703 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; 704 705 return "r600"; 706 } 707 708 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { 709 // The AMDGPU toolchain only supports generating shared objects, so we 710 // must always use PIC. 
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(RM),
          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.isAMDGCN()) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}'", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerScalarOptimizerLateEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerVectorizerEndEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level,
         ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase)) {
          // When we are not using -fgpu-rdc, we can run accelerator code
          // selection relatively early, but still after linking to prevent
          // eager removal of potentially reachable symbols.
          if (EnableHipStdPar)
            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
        }

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        // We don't want to run internalization at the per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add the promote kernel arguments pass to the opt pipeline right
        // before infer address spaces, which is needed to do the actual
        // address space rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase)) {
        AMDGPUAttributorOptions Opts;
        MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
      }
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking; otherwise we end up removing potentially
        // reachable symbols that were exported as external in other modules.
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // We only want to run this with O2 or higher since inliner and SROA
          // don't run in O1.
          if (Level != OptimizationLevel::O1) {
            PM.addPass(
                createModuleToFunctionPassAdaptor(InferAddressSpacesPass()));
          }
          // Do we really need internalization in LTO?
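          // (Internalize marks everything not covered by mustPreserveGV as
          // local; the GlobalDCE run right after then deletes whatever ends
          // up dead.)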
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(
                *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  if (auto *Arg = dyn_cast<Argument>(V);
      Arg &&
      AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
      !Arg->hasByRefAttr())
    return AMDGPUAS::GLOBAL_ADDRESS;

  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // The loaded value must be a generic (flat) pointer.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
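  //
  // Illustrative IR for the matched pattern (value names are invented for
  // the example):
  //   %is.shared  = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %not.shared  = xor i1 %is.shared, true
  //   %not.private = xor i1 %is.private, true
  //   %cond = and i1 %not.shared, %not.private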
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
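    // (resetTargetOptions rereads per-function codegen attributes, e.g. the
    // floating-point ones, into this TM's TargetOptions.)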
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(std::make_unique<GCNTTIImpl>(this, F));
}

Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}

ScheduleDAGInstrs *
GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  Attribute SchedStrategyAttr =
      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
  StringRef SchedStrategy = SchedStrategyAttr.isValid()
                                ? SchedStrategyAttr.getValueAsString()
                                : AMDGPUSchedStrategy;

  if (SchedStrategy == "max-ilp")
    return createGCNMaxILPMachineScheduler(C);

  if (SchedStrategy == "max-memory-clause")
    return createGCNMaxMemoryClauseMachineScheduler(C);

  if (SchedStrategy == "iterative-ilp")
    return createIterativeILPMachineScheduler(C);

  if (SchedStrategy == "iterative-minreg")
    return createMinRegScheduler(C);

  if (SchedStrategy == "iterative-maxocc")
    return createIterativeGCNMaxOccupancyMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMI *DAG =
      new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
                                   /*RemoveKillFlags=*/true);
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  if ((EnableVOPD.getNumOccurrences() ||
       getOptLevel() >= CodeGenOptLevel::Less) &&
      EnableVOPD)
    DAG->addMutation(createVOPDPairingMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
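    // Visiting functions in call graph SCC order (callees before callers)
    // lets resource usage information propagate bottom-up through call sites.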
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
  void addPostBBSections() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
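  // (StackMapLiveness and FuncletLayout are also disabled in the
  // AMDGPUPassConfig constructor; disabling them again here is harmless.)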
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Make enqueued block runtime handles externally visible.
  addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());

  // Lower LDS accesses to global memory if address sanitizer is enabled.
  if (EnableSwLowerLDS)
    addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  // Run the atomic optimizer before AtomicExpand.
  if ((TM.getTargetTriple().isAMDGCN()) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().isAMDGCN()) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().isAMDGCN() &&
      TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM));

  if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().isAMDGCN()) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes, from codegenprepare all the way
    // through said resource usage analysis, to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means the
  // blocks it creates get cleaned up by UnreachableBlockElim, which is
  // inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  addPass(createFixIrreduciblePass());
  addPass(createUnifyLoopExitsPass());
  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions

  addPass(createAMDGPUAnnotateUniformValuesLegacy());
  addPass(createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(createAMDGPURewriteUndefForPHILegacyPass());

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisLegacyID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineLegacyID);
  addPass(&SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWALegacyID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSELegacyID);
    addPass(&SIFoldOperandsLegacyID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsLegacyPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterLegacyID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesLegacyID);
  addPass(createSILowerI1CopiesLegacyPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  if (NewRegBankSelect) {
    addPass(createAMDGPURegBankSelectPass());
    addPass(createAMDGPURegBankLegalizePass());
  } else {
    addPass(new RegBankSelect());
  }
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
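  // Run the regbank combiner on fully register-bank-annotated MIR; IsOptNone
  // tells it to skip the optimization-only combines.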
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  // FIXME: When an instruction inside a bundle has a killed operand, it seems
  // that only the BUNDLE instruction is recorded as the kill of that register
  // in LiveVariables. This triggers a verifier failure; we should fix it and
  // enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);

  addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
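  // llvm::call_once guarantees the default-allocator registration below runs
  // exactly once, even if several target machines are constructed in the
  // same process.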
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other wwm register operands.
  addPass(createWWMRegAllocPass(false));

  addPass(&SILowerWWMCopiesLegacyID);
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(false));

  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
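  // Per-thread VGPRs are allocated last so the allocator runs after the WWM
  // registers above have been assigned and reserved and cannot hand them out
  // again (the apparent motivation for this ordering).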
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerLegacyID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there
  // are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(&AMDGPUWaitSGPRHazardsLegacyID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

void GCNPassConfig::addPostBBSections() {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
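  // (Assumed behavior, inferred from the pass name and the comment above:
  // the prolog emitted here provides a fallback entry that loads preloaded
  // kernel arguments manually when the firmware has not done so, which is
  // what makes the function appear to have multiple entry points.)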
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
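    // The exact source location of the register string is not tracked at
    // this point, so the diagnostic below is anchored to the start of the
    // main buffer (the default SMLoc()) and carries the register name as its
    // context.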
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
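    // A mask restricts the descriptor to a slice of the register. Packed
    // work-item IDs are the typical user: X, Y and Z can each occupy a
    // 10-bit field of a single VGPR (an illustrative assumption about the
    // callers; the parser itself does not interpret the mask).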
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass,
              ShadowStackGCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));

  addPass(AMDGPUPrintfRuntimeBindingPass());
  if (LowerCtorDtor)
    addPass(AMDGPUCtorDtorLoweringPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  addPass(AMDGPUAlwaysInlinePass());
  addPass(AlwaysInlinerPass());

  addPass(AMDGPUExportKernelRuntimeHandlesPass());

  if (EnableSwLowerLDS)
    addPass(AMDGPUSwLowerLDSPass(TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addPass(AMDGPULowerModuleLDSPass(TM));

  // Run the atomic optimizer before AtomicExpand.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));

  addPass(AtomicExpandPass(&TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(AMDGPUPromoteAllocaPass(TM));
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(addPass);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addPass(AMDGPUCodeGenPreparePass(TM));

    // TODO: LICM
  }

  Base::addIRPasses(addPass);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(addPass);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(AMDGPUPreloadKernelArgumentsPass(TM));

  if (EnableLowerKernelArguments)
    addPass(AMDGPULowerKernelArgumentsPass(TM));

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components)
  // but has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  //
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis
  // (especially as exercised in the resource-usage-dead-function test),
  // we need all the function passes from codegenprepare all the way through
  // said resource usage analysis to run on the call graph produced
  // before codegenprepare runs (because codegenprepare will knock some
  // nodes out of the graph, which leads to function-level passes not
  // being run on them, which causes crashes in the resource usage analysis).
  addPass(AMDGPULowerBufferFatPointersPass(TM));

  addPass.requireCGSCCOrder();

  Base::addCodeGenPrepare(addPass);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(LoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here seems best, as
  // these blocks are then cleaned up by UnreachableBlockElim, which is
  // inserted next in the pass flow.
  addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(FlattenCFGPass());
    addPass(SinkingPass());
    addPass(AMDGPULateCodeGenPreparePass(TM));
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(AMDGPUUnifyDivergentExitNodesPass());
  addPass(FixIrreduciblePass());
  addPass(UnifyLoopExitsPass());
  addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));

  addPass(AMDGPUAnnotateUniformValuesPass());

  addPass(SIAnnotateControlFlowPass(TM));

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(AMDGPURewriteUndefForPHIPass());

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(LCSSAPass());

  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    addPass(AMDGPUPerfHintAnalysisPass(TM));

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(),
          /*Force=*/true);
}

void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
  if (EnableEarlyIfConversion)
    addPass(EarlyIfConverterPass());

  Base::addILPOpts(addPass);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                             CreateMCStreamer) const {
  // TODO: Add AsmPrinter.
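  // (Assumption based on the TODO above: until the AsmPrinter is ported, the
  // new-pass-manager pipeline cannot produce final assembly, and emission
  // still relies on the legacy GCNPassConfig pipeline.)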
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
  addPass(AMDGPUISelDAGToDAGPass(TM));
  addPass(SIFixSGPRCopiesPass());
  addPass(SILowerI1CopiesPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
  if (EnableRegReassign) {
    addPass(GCNNSAReassignPass());
  }
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    AddMachinePass &addPass) const {
  Base::addMachineSSAOptimization(addPass);

  addPass(SIFoldOperandsPass());
  if (EnableDPPCombine) {
    addPass(GCNDPPCombinePass());
  }
  addPass(SILoadStoreOptimizerPass());
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(SIPeepholeSDWAPass());
    addPass(EarlyMachineLICMPass());
    addPass(MachineCSEPass());
    addPass(SIFoldOperandsPass());
  }
  addPass(DeadMachineInstructionElimPass());
  addPass(SIShrinkInstructionsPass());
}

void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
    AddMachinePass &addPass) const {
  if (EnableDCEInRA)
    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());

  // FIXME: When an instruction inside a bundle has a killed operand, it seems
  // that only the BUNDLE instruction is recorded as the kill of that register
  // in LiveVariables. This triggers a verifier failure; we should fix it and
  // enable the verifier.
  if (OptVGPRLiveRange)
    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
        SIOptimizeVGPRLiveRangePass());

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  if (EnableRewritePartialRegUses)
    insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());

  if (OptExecMaskPreRA)
    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());

  Base::addOptimizedRegAlloc(addPass);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    AddMachinePass &addPass) const {
  // TODO: Check --regalloc-npm option

  addPass(GCNPreRALongBranchRegPass());

  addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
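  // VirtRegRewriterPass(false) commits the SGPR assignments while leaving
  // virtual registers in place (the boolean is the rewriter's ClearVirtRegs
  // flag, as far as its interface suggests); the WWM and VGPR allocations
  // below still have virtual registers to assign, and only the final rewrite
  // near the end of this function clears them.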
  addPass(VirtRegRewriterPass(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(StackSlotColoringPass());

  // Equivalent of PEI for SGPRs.
  addPass(SILowerSGPRSpillsPass());

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(SIPreAllocateWWMRegsPass());

  // For allocating other wwm register operands.
  addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
  addPass(SILowerWWMCopiesPass());
  addPass(VirtRegRewriterPass(false));
  addPass(AMDGPUReserveWWMRegsPass());

  // For allocating per-thread VGPRs.
  addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));

  addPreRewrite(addPass);
  addPass(VirtRegRewriterPass(true));

  addPass(AMDGPUMarkLastScratchLoadPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
  addPass(SIFixVGPRCopiesPass());
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIOptimizeExecMaskingPass());
  Base::addPostRegAlloc(addPass);
}

void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addPass(GCNCreateVOPDPass());
  }

  addPass(SIMemoryLegalizerPass());
  addPass(SIInsertWaitcntsPass());

  // TODO: addPass(SIModeRegisterPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    // TODO: addPass(SIInsertHardClausesPass());
  }

  addPass(SILateBranchLoweringPass());

  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(AMDGPUSetWavePriorityPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIPreEmitPeepholePass());

  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there
  // are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(PostRAHazardRecognizerPass());
  addPass(AMDGPUWaitSGPRHazardsPass());

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addPass(AMDGPUInsertDelayAluPass());
  }

  addPass(BranchRelaxationPass());
}

bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(GVNPass());
  else
    addPass(EarlyCSEPass());
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    AddIRPass &addPass) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(LoopDataPrefetchPass());

  addPass(SeparateConstOffsetFromGEPPass());

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(StraightLineStrengthReducePass());

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(addPass);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(NaryReassociatePass());

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(EarlyCSEPass());
}
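// Illustrative sketch of what the straight-line scalar pipeline above aims
// at (IR shown in comments only; the exact rewrites depend on the pass
// implementations):
//
//   %i1 = add i64 %i, 1
//   %p0 = getelementptr float, ptr %a, i64 %i
//   %p1 = getelementptr float, ptr %a, i64 %i1
//
// SeparateConstOffsetFromGEP splits the +1 out of %p1's index, and
// StraightLineStrengthReduce can then rewrite %p1 as %p0 plus a constant
// byte offset, leaving common subexpressions for the EarlyCSE/GVN runs
// scheduled around it to clean up.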