1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
11 /// information needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCodeGenPassBuilder.h"
19 #include "AMDGPUCtorDtorLowering.h"
20 #include "AMDGPUExportClustering.h"
21 #include "AMDGPUIGroupLP.h"
22 #include "AMDGPUISelDAGToDAG.h"
23 #include "AMDGPUMacroFusion.h"
24 #include "AMDGPURegBankSelect.h"
25 #include "AMDGPUSplitModule.h"
26 #include "AMDGPUTargetObjectFile.h"
27 #include "AMDGPUTargetTransformInfo.h"
28 #include "AMDGPUUnifyDivergentExitNodes.h"
29 #include "GCNIterativeScheduler.h"
30 #include "GCNSchedStrategy.h"
31 #include "GCNVOPDUtils.h"
32 #include "R600.h"
33 #include "R600MachineFunctionInfo.h"
34 #include "R600TargetMachine.h"
35 #include "SIMachineFunctionInfo.h"
36 #include "SIMachineScheduler.h"
37 #include "TargetInfo/AMDGPUTargetInfo.h"
38 #include "Utils/AMDGPUBaseInfo.h"
39 #include "llvm/Analysis/CGSCCPassManager.h"
40 #include "llvm/Analysis/CallGraphSCCPass.h"
41 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
42 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
43 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
44 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
45 #include "llvm/CodeGen/GlobalISel/Localizer.h"
46 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
47 #include "llvm/CodeGen/MIRParser/MIParser.h"
48 #include "llvm/CodeGen/Passes.h"
49 #include "llvm/CodeGen/RegAllocRegistry.h"
50 #include "llvm/CodeGen/TargetPassConfig.h"
51 #include "llvm/IR/IntrinsicsAMDGPU.h"
52 #include "llvm/IR/PassManager.h"
53 #include "llvm/IR/PatternMatch.h"
54 #include "llvm/InitializePasses.h"
55 #include "llvm/MC/TargetRegistry.h"
56 #include "llvm/Passes/PassBuilder.h"
57 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
58 #include "llvm/Transforms/IPO.h"
59 #include "llvm/Transforms/IPO/AlwaysInliner.h"
60 #include "llvm/Transforms/IPO/ExpandVariadics.h"
61 #include "llvm/Transforms/IPO/GlobalDCE.h"
62 #include "llvm/Transforms/IPO/Internalize.h"
63 #include "llvm/Transforms/Scalar.h"
64 #include "llvm/Transforms/Scalar/GVN.h"
65 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
66 #include "llvm/Transforms/Utils.h"
67 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
68 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
69 #include <optional>
70
71 using namespace llvm;
72 using namespace llvm::PatternMatch;
73
74 namespace {
75 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
76 public:
SGPRRegisterRegAlloc(const char * N,const char * D,FunctionPassCtor C)77 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
78 : RegisterRegAllocBase(N, D, C) {}
79 };
80
81 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
82 public:
VGPRRegisterRegAlloc(const char * N,const char * D,FunctionPassCtor C)83 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
84 : RegisterRegAllocBase(N, D, C) {}
85 };
86
onlyAllocateSGPRs(const TargetRegisterInfo & TRI,const MachineRegisterInfo & MRI,const Register Reg)87 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
88 const MachineRegisterInfo &MRI,
89 const Register Reg) {
90 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
91 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
92 }
93
onlyAllocateVGPRs(const TargetRegisterInfo & TRI,const MachineRegisterInfo & MRI,const Register Reg)94 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
95 const MachineRegisterInfo &MRI,
96 const Register Reg) {
97 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
98 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
99 }
100
101 /// -{sgpr|vgpr}-regalloc=... command line option.
useDefaultRegisterAllocator()102 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
103
104 /// A dummy default pass factory indicates whether the register allocator is
105 /// overridden on the command line.
106 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
107 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
108
109 static SGPRRegisterRegAlloc
110 defaultSGPRRegAlloc("default",
111 "pick SGPR register allocator based on -O option",
112 useDefaultRegisterAllocator);
113
114 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
115 RegisterPassParser<SGPRRegisterRegAlloc>>
116 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
117 cl::desc("Register allocator to use for SGPRs"));
118
119 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
120 RegisterPassParser<VGPRRegisterRegAlloc>>
121 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
122 cl::desc("Register allocator to use for VGPRs"));
123
124
initializeDefaultSGPRRegisterAllocatorOnce()125 static void initializeDefaultSGPRRegisterAllocatorOnce() {
126 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
127
128 if (!Ctor) {
129 Ctor = SGPRRegAlloc;
130 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
131 }
132 }
133
initializeDefaultVGPRRegisterAllocatorOnce()134 static void initializeDefaultVGPRRegisterAllocatorOnce() {
135 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
136
137 if (!Ctor) {
138 Ctor = VGPRRegAlloc;
139 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
140 }
141 }
142
createBasicSGPRRegisterAllocator()143 static FunctionPass *createBasicSGPRRegisterAllocator() {
144 return createBasicRegisterAllocator(onlyAllocateSGPRs);
145 }
146
createGreedySGPRRegisterAllocator()147 static FunctionPass *createGreedySGPRRegisterAllocator() {
148 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
149 }
150
createFastSGPRRegisterAllocator()151 static FunctionPass *createFastSGPRRegisterAllocator() {
152 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
153 }
154
createBasicVGPRRegisterAllocator()155 static FunctionPass *createBasicVGPRRegisterAllocator() {
156 return createBasicRegisterAllocator(onlyAllocateVGPRs);
157 }
158
createGreedyVGPRRegisterAllocator()159 static FunctionPass *createGreedyVGPRRegisterAllocator() {
160 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
161 }
162
createFastVGPRRegisterAllocator()163 static FunctionPass *createFastVGPRRegisterAllocator() {
164 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
165 }
166
167 static SGPRRegisterRegAlloc basicRegAllocSGPR(
168 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
169 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
170 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
171
172 static SGPRRegisterRegAlloc fastRegAllocSGPR(
173 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
174
175
176 static VGPRRegisterRegAlloc basicRegAllocVGPR(
177 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
178 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
179 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
180
181 static VGPRRegisterRegAlloc fastRegAllocVGPR(
182 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
183 } // anonymous namespace
184
185 static cl::opt<bool>
186 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
187 cl::desc("Run early if-conversion"),
188 cl::init(false));
189
190 static cl::opt<bool>
191 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
192 cl::desc("Run pre-RA exec mask optimizations"),
193 cl::init(true));
194
195 static cl::opt<bool>
196 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
197 cl::desc("Lower GPU ctor / dtors to globals on the device."),
198 cl::init(true), cl::Hidden);
199
200 // Option to disable vectorizer for tests.
201 static cl::opt<bool> EnableLoadStoreVectorizer(
202 "amdgpu-load-store-vectorizer",
203 cl::desc("Enable load store vectorizer"),
204 cl::init(true),
205 cl::Hidden);
206
207 // Option to control global loads scalarization
208 static cl::opt<bool> ScalarizeGlobal(
209 "amdgpu-scalarize-global-loads",
210 cl::desc("Enable global load scalarization"),
211 cl::init(true),
212 cl::Hidden);
213
214 // Option to run internalize pass.
215 static cl::opt<bool> InternalizeSymbols(
216 "amdgpu-internalize-symbols",
217 cl::desc("Enable elimination of non-kernel functions and unused globals"),
218 cl::init(false),
219 cl::Hidden);
220
221 // Option to inline all early.
222 static cl::opt<bool> EarlyInlineAll(
223 "amdgpu-early-inline-all",
224 cl::desc("Inline all functions early"),
225 cl::init(false),
226 cl::Hidden);
227
228 static cl::opt<bool> RemoveIncompatibleFunctions(
229 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
230 cl::desc("Enable removal of functions when they"
231 "use features not supported by the target GPU"),
232 cl::init(true));
233
234 static cl::opt<bool> EnableSDWAPeephole(
235 "amdgpu-sdwa-peephole",
236 cl::desc("Enable SDWA peepholer"),
237 cl::init(true));
238
239 static cl::opt<bool> EnableDPPCombine(
240 "amdgpu-dpp-combine",
241 cl::desc("Enable DPP combiner"),
242 cl::init(true));
243
244 // Enable address space based alias analysis
245 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
246 cl::desc("Enable AMDGPU Alias Analysis"),
247 cl::init(true));
248
249 // Option to run late CFG structurizer
250 static cl::opt<bool, true> LateCFGStructurize(
251 "amdgpu-late-structurize",
252 cl::desc("Enable late CFG structurization"),
253 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
254 cl::Hidden);
255
256 // Disable structurizer-based control-flow lowering in order to test convergence
257 // control tokens. This should eventually be replaced by the wave-transform.
258 static cl::opt<bool, true> DisableStructurizer(
259 "amdgpu-disable-structurizer",
260 cl::desc("Disable structurizer for experiments; produces unusable code"),
261 cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
262
263 // Enable lib calls simplifications
264 static cl::opt<bool> EnableLibCallSimplify(
265 "amdgpu-simplify-libcall",
266 cl::desc("Enable amdgpu library simplifications"),
267 cl::init(true),
268 cl::Hidden);
269
270 static cl::opt<bool> EnableLowerKernelArguments(
271 "amdgpu-ir-lower-kernel-arguments",
272 cl::desc("Lower kernel argument loads in IR pass"),
273 cl::init(true),
274 cl::Hidden);
275
276 static cl::opt<bool> EnableRegReassign(
277 "amdgpu-reassign-regs",
278 cl::desc("Enable register reassign optimizations on gfx10+"),
279 cl::init(true),
280 cl::Hidden);
281
282 static cl::opt<bool> OptVGPRLiveRange(
283 "amdgpu-opt-vgpr-liverange",
284 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
285 cl::init(true), cl::Hidden);
286
287 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
288 "amdgpu-atomic-optimizer-strategy",
289 cl::desc("Select DPP or Iterative strategy for scan"),
290 cl::init(ScanOptions::Iterative),
291 cl::values(
292 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
293 clEnumValN(ScanOptions::Iterative, "Iterative",
294 "Use Iterative approach for scan"),
295 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
296
297 // Enable Mode register optimization
298 static cl::opt<bool> EnableSIModeRegisterPass(
299 "amdgpu-mode-register",
300 cl::desc("Enable mode register pass"),
301 cl::init(true),
302 cl::Hidden);
303
304 // Enable GFX11.5+ s_singleuse_vdst insertion
305 static cl::opt<bool>
306 EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
307 cl::desc("Enable s_singleuse_vdst insertion"),
308 cl::init(false), cl::Hidden);
309
310 // Enable GFX11+ s_delay_alu insertion
311 static cl::opt<bool>
312 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
313 cl::desc("Enable s_delay_alu insertion"),
314 cl::init(true), cl::Hidden);
315
316 // Enable GFX11+ VOPD
317 static cl::opt<bool>
318 EnableVOPD("amdgpu-enable-vopd",
319 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
320 cl::init(true), cl::Hidden);
321
322 // Option is used in lit tests to prevent deadcoding of patterns inspected.
323 static cl::opt<bool>
324 EnableDCEInRA("amdgpu-dce-in-ra",
325 cl::init(true), cl::Hidden,
326 cl::desc("Enable machine DCE inside regalloc"));
327
328 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
329 cl::desc("Adjust wave priority"),
330 cl::init(false), cl::Hidden);
331
332 static cl::opt<bool> EnableScalarIRPasses(
333 "amdgpu-scalar-ir-passes",
334 cl::desc("Enable scalar IR passes"),
335 cl::init(true),
336 cl::Hidden);
337
338 static cl::opt<bool> EnableStructurizerWorkarounds(
339 "amdgpu-enable-structurizer-workarounds",
340 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
341 cl::Hidden);
342
343 static cl::opt<bool, true> EnableLowerModuleLDS(
344 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
345 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
346 cl::Hidden);
347
348 static cl::opt<bool> EnablePreRAOptimizations(
349 "amdgpu-enable-pre-ra-optimizations",
350 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
351 cl::Hidden);
352
353 static cl::opt<bool> EnablePromoteKernelArguments(
354 "amdgpu-enable-promote-kernel-arguments",
355 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
356 cl::Hidden, cl::init(true));
357
358 static cl::opt<bool> EnableImageIntrinsicOptimizer(
359 "amdgpu-enable-image-intrinsic-optimizer",
360 cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
361 cl::Hidden);
362
363 static cl::opt<bool>
364 EnableLoopPrefetch("amdgpu-loop-prefetch",
365 cl::desc("Enable loop data prefetch on AMDGPU"),
366 cl::Hidden, cl::init(false));
367
368 static cl::opt<bool> EnableMaxIlpSchedStrategy(
369 "amdgpu-enable-max-ilp-scheduling-strategy",
370 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
371 cl::Hidden, cl::init(false));
372
373 static cl::opt<bool> EnableRewritePartialRegUses(
374 "amdgpu-enable-rewrite-partial-reg-uses",
375 cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
376 cl::Hidden);
377
378 static cl::opt<bool> EnableHipStdPar(
379 "amdgpu-enable-hipstdpar",
380 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
381 cl::Hidden);
382
LLVMInitializeAMDGPUTarget()383 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
384 // Register the target
385 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
386 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
387
388 PassRegistry *PR = PassRegistry::getPassRegistry();
389 initializeR600ClauseMergePassPass(*PR);
390 initializeR600ControlFlowFinalizerPass(*PR);
391 initializeR600PacketizerPass(*PR);
392 initializeR600ExpandSpecialInstrsPassPass(*PR);
393 initializeR600VectorRegMergerPass(*PR);
394 initializeGlobalISel(*PR);
395 initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
396 initializeGCNDPPCombinePass(*PR);
397 initializeSILowerI1CopiesPass(*PR);
398 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
399 initializeSILowerWWMCopiesPass(*PR);
400 initializeAMDGPUMarkLastScratchLoadPass(*PR);
401 initializeSILowerSGPRSpillsPass(*PR);
402 initializeSIFixSGPRCopiesPass(*PR);
403 initializeSIFixVGPRCopiesPass(*PR);
404 initializeSIFoldOperandsPass(*PR);
405 initializeSIPeepholeSDWAPass(*PR);
406 initializeSIShrinkInstructionsPass(*PR);
407 initializeSIOptimizeExecMaskingPreRAPass(*PR);
408 initializeSIOptimizeVGPRLiveRangePass(*PR);
409 initializeSILoadStoreOptimizerPass(*PR);
410 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
411 initializeAMDGPUAlwaysInlinePass(*PR);
412 initializeAMDGPUAttributorLegacyPass(*PR);
413 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
414 initializeAMDGPUAnnotateUniformValuesPass(*PR);
415 initializeAMDGPUArgumentUsageInfoPass(*PR);
416 initializeAMDGPUAtomicOptimizerPass(*PR);
417 initializeAMDGPULowerKernelArgumentsPass(*PR);
418 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
419 initializeAMDGPULowerKernelAttributesPass(*PR);
420 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
421 initializeAMDGPUPostLegalizerCombinerPass(*PR);
422 initializeAMDGPUPreLegalizerCombinerPass(*PR);
423 initializeAMDGPURegBankCombinerPass(*PR);
424 initializeAMDGPURegBankSelectPass(*PR);
425 initializeAMDGPUPromoteAllocaPass(*PR);
426 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
427 initializeAMDGPUCodeGenPreparePass(*PR);
428 initializeAMDGPULateCodeGenPreparePass(*PR);
429 initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
430 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
431 initializeAMDGPULowerBufferFatPointersPass(*PR);
432 initializeAMDGPURewriteOutArgumentsPass(*PR);
433 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
434 initializeAMDGPUUnifyMetadataPass(*PR);
435 initializeSIAnnotateControlFlowPass(*PR);
436 initializeAMDGPUInsertSingleUseVDSTPass(*PR);
437 initializeAMDGPUInsertDelayAluPass(*PR);
438 initializeSIInsertHardClausesPass(*PR);
439 initializeSIInsertWaitcntsPass(*PR);
440 initializeSIModeRegisterPass(*PR);
441 initializeSIWholeQuadModePass(*PR);
442 initializeSILowerControlFlowPass(*PR);
443 initializeSIPreEmitPeepholePass(*PR);
444 initializeSILateBranchLoweringPass(*PR);
445 initializeSIMemoryLegalizerPass(*PR);
446 initializeSIOptimizeExecMaskingPass(*PR);
447 initializeSIPreAllocateWWMRegsPass(*PR);
448 initializeSIFormMemoryClausesPass(*PR);
449 initializeSIPostRABundlerPass(*PR);
450 initializeGCNCreateVOPDPass(*PR);
451 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
452 initializeAMDGPUAAWrapperPassPass(*PR);
453 initializeAMDGPUExternalAAWrapperPass(*PR);
454 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
455 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
456 initializeAMDGPUResourceUsageAnalysisPass(*PR);
457 initializeGCNNSAReassignPass(*PR);
458 initializeGCNPreRAOptimizationsPass(*PR);
459 initializeGCNPreRALongBranchRegPass(*PR);
460 initializeGCNRewritePartialRegUsesPass(*PR);
461 initializeGCNRegPressurePrinterPass(*PR);
462 }
463
createTLOF(const Triple & TT)464 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
465 return std::make_unique<AMDGPUTargetObjectFile>();
466 }
467
createSIMachineScheduler(MachineSchedContext * C)468 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
469 return new SIScheduleDAGMI(C);
470 }
471
472 static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext * C)473 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
474 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
475 ScheduleDAGMILive *DAG =
476 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
477 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
478 if (ST.shouldClusterStores())
479 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
480 DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
481 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
482 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
483 return DAG;
484 }
485
486 static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext * C)487 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
488 ScheduleDAGMILive *DAG =
489 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
490 DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
491 return DAG;
492 }
493
494 static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext * C)495 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
496 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
497 auto DAG = new GCNIterativeScheduler(C,
498 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
499 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
500 if (ST.shouldClusterStores())
501 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
502 return DAG;
503 }
504
createMinRegScheduler(MachineSchedContext * C)505 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
506 return new GCNIterativeScheduler(C,
507 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
508 }
509
510 static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext * C)511 createIterativeILPMachineScheduler(MachineSchedContext *C) {
512 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
513 auto DAG = new GCNIterativeScheduler(C,
514 GCNIterativeScheduler::SCHEDULE_ILP);
515 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
516 if (ST.shouldClusterStores())
517 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
518 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
519 return DAG;
520 }
521
522 static MachineSchedRegistry
523 SISchedRegistry("si", "Run SI's custom scheduler",
524 createSIMachineScheduler);
525
526 static MachineSchedRegistry
527 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
528 "Run GCN scheduler to maximize occupancy",
529 createGCNMaxOccupancyMachineScheduler);
530
531 static MachineSchedRegistry
532 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
533 createGCNMaxILPMachineScheduler);
534
535 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
536 "gcn-iterative-max-occupancy-experimental",
537 "Run GCN scheduler to maximize occupancy (experimental)",
538 createIterativeGCNMaxOccupancyMachineScheduler);
539
540 static MachineSchedRegistry GCNMinRegSchedRegistry(
541 "gcn-iterative-minreg",
542 "Run GCN iterative scheduler for minimal register usage (experimental)",
543 createMinRegScheduler);
544
545 static MachineSchedRegistry GCNILPSchedRegistry(
546 "gcn-iterative-ilp",
547 "Run GCN iterative scheduler for ILP scheduling (experimental)",
548 createIterativeILPMachineScheduler);
549
computeDataLayout(const Triple & TT)550 static StringRef computeDataLayout(const Triple &TT) {
551 if (TT.getArch() == Triple::r600) {
552 // 32-bit pointers.
553 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
554 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
555 }
556
557 // 32-bit private, local, and region pointers. 64-bit global, constant and
558 // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
559 // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
560 // (address space 7), and 128-bit non-integral buffer resourcees (address
561 // space 8) which cannot be non-trivilally accessed by LLVM memory operations
562 // like getelementptr.
563 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
564 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
565 "v32:32-v48:64-v96:"
566 "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
567 "G1-ni:7:8:9";
568 }
569
570 LLVM_READNONE
getGPUOrDefault(const Triple & TT,StringRef GPU)571 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
572 if (!GPU.empty())
573 return GPU;
574
575 // Need to default to a target with flat support for HSA.
576 if (TT.getArch() == Triple::amdgcn)
577 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
578
579 return "r600";
580 }
581
getEffectiveRelocModel(std::optional<Reloc::Model> RM)582 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
583 // The AMDGPU toolchain only supports generating shared objects, so we
584 // must always use PIC.
585 return Reloc::PIC_;
586 }
587
AMDGPUTargetMachine(const Target & T,const Triple & TT,StringRef CPU,StringRef FS,const TargetOptions & Options,std::optional<Reloc::Model> RM,std::optional<CodeModel::Model> CM,CodeGenOptLevel OptLevel)588 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
589 StringRef CPU, StringRef FS,
590 const TargetOptions &Options,
591 std::optional<Reloc::Model> RM,
592 std::optional<CodeModel::Model> CM,
593 CodeGenOptLevel OptLevel)
594 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
595 FS, Options, getEffectiveRelocModel(RM),
596 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
597 TLOF(createTLOF(getTargetTriple())) {
598 initAsmInfo();
599 if (TT.getArch() == Triple::amdgcn) {
600 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
601 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
602 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
603 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
604 }
605 }
606
607 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
608 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
609 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
610 bool AMDGPUTargetMachine::DisableStructurizer = false;
611
612 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
613
getGPUName(const Function & F) const614 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
615 Attribute GPUAttr = F.getFnAttribute("target-cpu");
616 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
617 }
618
getFeatureString(const Function & F) const619 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
620 Attribute FSAttr = F.getFnAttribute("target-features");
621
622 return FSAttr.isValid() ? FSAttr.getValueAsString()
623 : getTargetFeatureString();
624 }
625
626 /// Predicate for Internalize pass.
mustPreserveGV(const GlobalValue & GV)627 static bool mustPreserveGV(const GlobalValue &GV) {
628 if (const Function *F = dyn_cast<Function>(&GV))
629 return F->isDeclaration() || F->getName().starts_with("__asan_") ||
630 F->getName().starts_with("__sanitizer_") ||
631 AMDGPU::isEntryFunctionCC(F->getCallingConv());
632
633 GV.removeDeadConstantUsers();
634 return !GV.use_empty();
635 }
636
registerDefaultAliasAnalyses(AAManager & AAM)637 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
638 AAM.registerFunctionAnalysis<AMDGPUAA>();
639 }
640
641 static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params)642 parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
643 if (Params.empty())
644 return ScanOptions::Iterative;
645 Params.consume_front("strategy=");
646 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
647 .Case("dpp", ScanOptions::DPP)
648 .Cases("iterative", "", ScanOptions::Iterative)
649 .Case("none", ScanOptions::None)
650 .Default(std::nullopt);
651 if (Result)
652 return *Result;
653 return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
654 }
655
buildCodeGenPipeline(ModulePassManager & MPM,raw_pwrite_stream & Out,raw_pwrite_stream * DwoOut,CodeGenFileType FileType,const CGPassBuilderOption & Opts,PassInstrumentationCallbacks * PIC)656 Error AMDGPUTargetMachine::buildCodeGenPipeline(
657 ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
658 CodeGenFileType FileType, const CGPassBuilderOption &Opts,
659 PassInstrumentationCallbacks *PIC) {
660 AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
661 return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
662 }
663
registerPassBuilderCallbacks(PassBuilder & PB)664 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
665
666 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
667 #include "llvm/Passes/TargetPassRegistry.inc"
668
669 PB.registerPipelineStartEPCallback(
670 [](ModulePassManager &PM, OptimizationLevel Level) {
671 FunctionPassManager FPM;
672 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
673 if (EnableHipStdPar)
674 PM.addPass(HipStdParAcceleratorCodeSelectionPass());
675 });
676
677 PB.registerPipelineEarlySimplificationEPCallback(
678 [](ModulePassManager &PM, OptimizationLevel Level) {
679 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
680
681 if (Level == OptimizationLevel::O0)
682 return;
683
684 PM.addPass(AMDGPUUnifyMetadataPass());
685
686 if (InternalizeSymbols) {
687 PM.addPass(InternalizePass(mustPreserveGV));
688 PM.addPass(GlobalDCEPass());
689 }
690
691 if (EarlyInlineAll && !EnableFunctionCalls)
692 PM.addPass(AMDGPUAlwaysInlinePass());
693 });
694
695 PB.registerPeepholeEPCallback(
696 [](FunctionPassManager &FPM, OptimizationLevel Level) {
697 if (Level == OptimizationLevel::O0)
698 return;
699
700 FPM.addPass(AMDGPUUseNativeCallsPass());
701 if (EnableLibCallSimplify)
702 FPM.addPass(AMDGPUSimplifyLibCallsPass());
703 });
704
705 PB.registerCGSCCOptimizerLateEPCallback(
706 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
707 if (Level == OptimizationLevel::O0)
708 return;
709
710 FunctionPassManager FPM;
711
712 // Add promote kernel arguments pass to the opt pipeline right before
713 // infer address spaces which is needed to do actual address space
714 // rewriting.
715 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
716 EnablePromoteKernelArguments)
717 FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
718
719 // Add infer address spaces pass to the opt pipeline after inlining
720 // but before SROA to increase SROA opportunities.
721 FPM.addPass(InferAddressSpacesPass());
722
723 // This should run after inlining to have any chance of doing
724 // anything, and before other cleanup optimizations.
725 FPM.addPass(AMDGPULowerKernelAttributesPass());
726
727 if (Level != OptimizationLevel::O0) {
728 // Promote alloca to vector before SROA and loop unroll. If we
729 // manage to eliminate allocas before unroll we may choose to unroll
730 // less.
731 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
732 }
733
734 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
735 });
736
737 // FIXME: Why is AMDGPUAttributor not in CGSCC?
738 PB.registerOptimizerLastEPCallback(
739 [this](ModulePassManager &MPM, OptimizationLevel Level) {
740 if (Level != OptimizationLevel::O0) {
741 MPM.addPass(AMDGPUAttributorPass(*this));
742 }
743 });
744
745 PB.registerFullLinkTimeOptimizationLastEPCallback(
746 [this](ModulePassManager &PM, OptimizationLevel Level) {
747 // We want to support the -lto-partitions=N option as "best effort".
748 // For that, we need to lower LDS earlier in the pipeline before the
749 // module is partitioned for codegen.
750 if (EnableLowerModuleLDS)
751 PM.addPass(AMDGPULowerModuleLDSPass(*this));
752 });
753
754 PB.registerRegClassFilterParsingCallback(
755 [](StringRef FilterName) -> RegAllocFilterFunc {
756 if (FilterName == "sgpr")
757 return onlyAllocateSGPRs;
758 if (FilterName == "vgpr")
759 return onlyAllocateVGPRs;
760 return nullptr;
761 });
762 }
763
getNullPointerValue(unsigned AddrSpace)764 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
765 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
766 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
767 AddrSpace == AMDGPUAS::REGION_ADDRESS)
768 ? -1
769 : 0;
770 }
771
isNoopAddrSpaceCast(unsigned SrcAS,unsigned DestAS) const772 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
773 unsigned DestAS) const {
774 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
775 AMDGPU::isFlatGlobalAddrSpace(DestAS);
776 }
777
getAssumedAddrSpace(const Value * V) const778 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
779 const auto *LD = dyn_cast<LoadInst>(V);
780 if (!LD)
781 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
782
783 // It must be a generic pointer loaded.
784 assert(V->getType()->isPointerTy() &&
785 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
786
787 const auto *Ptr = LD->getPointerOperand();
788 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
789 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
790 // For a generic pointer loaded from the constant memory, it could be assumed
791 // as a global pointer since the constant memory is only populated on the
792 // host side. As implied by the offload programming model, only global
793 // pointers could be referenced on the host side.
794 return AMDGPUAS::GLOBAL_ADDRESS;
795 }
796
797 std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value * V) const798 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
799 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
800 switch (II->getIntrinsicID()) {
801 case Intrinsic::amdgcn_is_shared:
802 return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
803 case Intrinsic::amdgcn_is_private:
804 return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
805 default:
806 break;
807 }
808 return std::pair(nullptr, -1);
809 }
810 // Check the global pointer predication based on
811 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
812 // the order of 'is_shared' and 'is_private' is not significant.
813 Value *Ptr;
814 if (match(
815 const_cast<Value *>(V),
816 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
817 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
818 m_Deferred(Ptr))))))
819 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
820
821 return std::pair(nullptr, -1);
822 }
823
824 unsigned
getAddressSpaceForPseudoSourceKind(unsigned Kind) const825 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
826 switch (Kind) {
827 case PseudoSourceValue::Stack:
828 case PseudoSourceValue::FixedStack:
829 return AMDGPUAS::PRIVATE_ADDRESS;
830 case PseudoSourceValue::ConstantPool:
831 case PseudoSourceValue::GOT:
832 case PseudoSourceValue::JumpTable:
833 case PseudoSourceValue::GlobalValueCallEntry:
834 case PseudoSourceValue::ExternalSymbolCallEntry:
835 return AMDGPUAS::CONSTANT_ADDRESS;
836 }
837 return AMDGPUAS::FLAT_ADDRESS;
838 }
839
splitModule(Module & M,unsigned NumParts,function_ref<void (std::unique_ptr<Module> MPart)> ModuleCallback)840 bool AMDGPUTargetMachine::splitModule(
841 Module &M, unsigned NumParts,
842 function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
843 // FIXME(?): Would be better to use an already existing Analysis/PassManager,
844 // but all current users of this API don't have one ready and would need to
845 // create one anyway. Let's hide the boilerplate for now to keep it simple.
846
847 LoopAnalysisManager LAM;
848 FunctionAnalysisManager FAM;
849 CGSCCAnalysisManager CGAM;
850 ModuleAnalysisManager MAM;
851
852 PassBuilder PB(this);
853 PB.registerModuleAnalyses(MAM);
854 PB.registerFunctionAnalyses(FAM);
855 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
856
857 ModulePassManager MPM;
858 MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
859 MPM.run(M, MAM);
860 return true;
861 }
862
863 //===----------------------------------------------------------------------===//
864 // GCN Target Machine (SI+)
865 //===----------------------------------------------------------------------===//
866
GCNTargetMachine(const Target & T,const Triple & TT,StringRef CPU,StringRef FS,const TargetOptions & Options,std::optional<Reloc::Model> RM,std::optional<CodeModel::Model> CM,CodeGenOptLevel OL,bool JIT)867 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
868 StringRef CPU, StringRef FS,
869 const TargetOptions &Options,
870 std::optional<Reloc::Model> RM,
871 std::optional<CodeModel::Model> CM,
872 CodeGenOptLevel OL, bool JIT)
873 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
874
875 const TargetSubtargetInfo *
getSubtargetImpl(const Function & F) const876 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
877 StringRef GPU = getGPUName(F);
878 StringRef FS = getFeatureString(F);
879
880 SmallString<128> SubtargetKey(GPU);
881 SubtargetKey.append(FS);
882
883 auto &I = SubtargetMap[SubtargetKey];
884 if (!I) {
885 // This needs to be done before we create a new subtarget since any
886 // creation will depend on the TM and the code generation flags on the
887 // function that reside in TargetOptions.
888 resetTargetOptions(F);
889 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
890 }
891
892 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
893
894 return I.get();
895 }
896
897 TargetTransformInfo
getTargetTransformInfo(const Function & F) const898 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
899 return TargetTransformInfo(GCNTTIImpl(this, F));
900 }
901
902 //===----------------------------------------------------------------------===//
903 // AMDGPU Pass Setup
904 //===----------------------------------------------------------------------===//
905
getCSEConfig() const906 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
907 return getStandardCSEConfigForOpt(TM->getOptLevel());
908 }
909
910 namespace {
911
912 class GCNPassConfig final : public AMDGPUPassConfig {
913 public:
GCNPassConfig(LLVMTargetMachine & TM,PassManagerBase & PM)914 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
915 : AMDGPUPassConfig(TM, PM) {
916 // It is necessary to know the register usage of the entire call graph. We
917 // allow calls without EnableAMDGPUFunctionCalls if they are marked
918 // noinline, so this is always required.
919 setRequiresCodeGenSCCOrder(true);
920 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
921 }
922
getGCNTargetMachine() const923 GCNTargetMachine &getGCNTargetMachine() const {
924 return getTM<GCNTargetMachine>();
925 }
926
927 ScheduleDAGInstrs *
928 createMachineScheduler(MachineSchedContext *C) const override;
929
930 ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext * C) const931 createPostMachineScheduler(MachineSchedContext *C) const override {
932 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
933 C, std::make_unique<PostGenericScheduler>(C),
934 /*RemoveKillFlags=*/true);
935 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
936 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
937 if (ST.shouldClusterStores())
938 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
939 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
940 DAG->addMutation(
941 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
942 if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
943 DAG->addMutation(createVOPDPairingMutation());
944 return DAG;
945 }
946
947 bool addPreISel() override;
948 void addMachineSSAOptimization() override;
949 bool addILPOpts() override;
950 bool addInstSelector() override;
951 bool addIRTranslator() override;
952 void addPreLegalizeMachineIR() override;
953 bool addLegalizeMachineIR() override;
954 void addPreRegBankSelect() override;
955 bool addRegBankSelect() override;
956 void addPreGlobalInstructionSelect() override;
957 bool addGlobalInstructionSelect() override;
958 void addFastRegAlloc() override;
959 void addOptimizedRegAlloc() override;
960
961 FunctionPass *createSGPRAllocPass(bool Optimized);
962 FunctionPass *createVGPRAllocPass(bool Optimized);
963 FunctionPass *createRegAllocPass(bool Optimized) override;
964
965 bool addRegAssignAndRewriteFast() override;
966 bool addRegAssignAndRewriteOptimized() override;
967
968 void addPreRegAlloc() override;
969 bool addPreRewrite() override;
970 void addPostRegAlloc() override;
971 void addPreSched2() override;
972 void addPreEmitPass() override;
973 };
974
975 } // end anonymous namespace
976
AMDGPUPassConfig(LLVMTargetMachine & TM,PassManagerBase & PM)977 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
978 : TargetPassConfig(TM, PM) {
979 // Exceptions and StackMaps are not supported, so these passes will never do
980 // anything.
981 disablePass(&StackMapLivenessID);
982 disablePass(&FuncletLayoutID);
983 // Garbage collection is not supported.
984 disablePass(&GCLoweringID);
985 disablePass(&ShadowStackGCLoweringID);
986 }
987
addEarlyCSEOrGVNPass()988 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
989 if (getOptLevel() == CodeGenOptLevel::Aggressive)
990 addPass(createGVNPass());
991 else
992 addPass(createEarlyCSEPass());
993 }
994
addStraightLineScalarOptimizationPasses()995 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
996 if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
997 addPass(createLoopDataPrefetchPass());
998 addPass(createSeparateConstOffsetFromGEPPass());
999 // ReassociateGEPs exposes more opportunities for SLSR. See
1000 // the example in reassociate-geps-and-slsr.ll.
1001 addPass(createStraightLineStrengthReducePass());
1002 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
1003 // EarlyCSE can reuse.
1004 addEarlyCSEOrGVNPass();
1005 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1006 addPass(createNaryReassociatePass());
1007 // NaryReassociate on GEPs creates redundant common expressions, so run
1008 // EarlyCSE after it.
1009 addPass(createEarlyCSEPass());
1010 }
1011
addIRPasses()1012 void AMDGPUPassConfig::addIRPasses() {
1013 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1014
1015 Triple::ArchType Arch = TM.getTargetTriple().getArch();
1016 if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
1017 addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1018
1019 // There is no reason to run these.
1020 disablePass(&StackMapLivenessID);
1021 disablePass(&FuncletLayoutID);
1022 disablePass(&PatchableFunctionID);
1023
1024 addPass(createAMDGPUPrintfRuntimeBinding());
1025 if (LowerCtorDtor)
1026 addPass(createAMDGPUCtorDtorLoweringLegacyPass());
1027
1028 if (isPassEnabled(EnableImageIntrinsicOptimizer))
1029 addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
1030
1031 // This can be disabled by passing ::Disable here or on the command line
1032 // with --expand-variadics-override=disable.
1033 addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1034
1035 // Function calls are not supported, so make sure we inline everything.
1036 addPass(createAMDGPUAlwaysInlinePass());
1037 addPass(createAlwaysInlinerLegacyPass());
1038
1039 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1040 if (Arch == Triple::r600)
1041 addPass(createR600OpenCLImageTypeLoweringPass());
1042
1043 // Replace OpenCL enqueued block function pointers with global variables.
1044 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1045
1046 // Runs before PromoteAlloca so the latter can account for function uses
1047 if (EnableLowerModuleLDS) {
1048 addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
1049 }
1050
1051 if (TM.getOptLevel() > CodeGenOptLevel::None)
1052 addPass(createInferAddressSpacesPass());
1053
1054 // Run atomic optimizer before Atomic Expand
1055 if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1056 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1057 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1058 addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1059 }
1060
1061 addPass(createAtomicExpandLegacyPass());
1062
1063 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1064 addPass(createAMDGPUPromoteAlloca());
1065
1066 if (isPassEnabled(EnableScalarIRPasses))
1067 addStraightLineScalarOptimizationPasses();
1068
1069 if (EnableAMDGPUAliasAnalysis) {
1070 addPass(createAMDGPUAAWrapperPass());
1071 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1072 AAResults &AAR) {
1073 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1074 AAR.addAAResult(WrapperPass->getResult());
1075 }));
1076 }
1077
1078 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1079 // TODO: May want to move later or split into an early and late one.
1080 addPass(createAMDGPUCodeGenPreparePass());
1081 }
1082
1083 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1084 // have expanded.
1085 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1086 addPass(createLICMPass());
1087 }
1088
1089 TargetPassConfig::addIRPasses();
1090
1091 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1092 // example, GVN can combine
1093 //
1094 // %0 = add %a, %b
1095 // %1 = add %b, %a
1096 //
1097 // and
1098 //
1099 // %0 = shl nsw %a, 2
1100 // %1 = shl %a, 2
1101 //
1102 // but EarlyCSE can do neither of them.
1103 if (isPassEnabled(EnableScalarIRPasses))
1104 addEarlyCSEOrGVNPass();
1105 }
1106
addCodeGenPrepare()1107 void AMDGPUPassConfig::addCodeGenPrepare() {
1108 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1109 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1110 // analysis, and should be removed.
1111 addPass(createAMDGPUAnnotateKernelFeaturesPass());
1112 }
1113
1114 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1115 EnableLowerKernelArguments)
1116 addPass(createAMDGPULowerKernelArgumentsPass());
1117
1118 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1119 // This lowering has been placed after codegenprepare to take advantage of
1120 // address mode matching (which is why it isn't put with the LDS lowerings).
1121 // It could be placed anywhere before uniformity annotations (an analysis
1122 // that it changes by splitting up fat pointers into their components)
1123 // but has been put before switch lowering and CFG flattening so that those
1124 // passes can run on the more optimized control flow this pass creates in
1125 // many cases.
1126 //
1127 // FIXME: This should ideally be put after the LoadStoreVectorizer.
1128 // However, due to some annoying facts about ResourceUsageAnalysis,
1129 // (especially as exercised in the resource-usage-dead-function test),
1130 // we need all the function passes codegenprepare all the way through
1131 // said resource usage analysis to run on the call graph produced
1132 // before codegenprepare runs (because codegenprepare will knock some
1133 // nodes out of the graph, which leads to function-level passes not
1134 // being run on them, which causes crashes in the resource usage analysis).
1135 addPass(createAMDGPULowerBufferFatPointersPass());
1136 // In accordance with the above FIXME, manually force all the
1137 // function-level passes into a CGSCCPassManager.
1138 addPass(new DummyCGSCCPass());
1139 }
1140
1141 TargetPassConfig::addCodeGenPrepare();
1142
1143 if (isPassEnabled(EnableLoadStoreVectorizer))
1144 addPass(createLoadStoreVectorizerPass());
1145
1146 // LowerSwitch pass may introduce unreachable blocks that can
1147 // cause unexpected behavior for subsequent passes. Placing it
1148 // here seems better that these blocks would get cleaned up by
1149 // UnreachableBlockElim inserted next in the pass flow.
1150 addPass(createLowerSwitchPass());
1151 }
1152
addPreISel()1153 bool AMDGPUPassConfig::addPreISel() {
1154 if (TM->getOptLevel() > CodeGenOptLevel::None)
1155 addPass(createFlattenCFGPass());
1156 return false;
1157 }
1158
addInstSelector()1159 bool AMDGPUPassConfig::addInstSelector() {
1160 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1161 return false;
1162 }
1163
addGCPasses()1164 bool AMDGPUPassConfig::addGCPasses() {
1165 // Do nothing. GC is not supported.
1166 return false;
1167 }
1168
1169 llvm::ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext * C) const1170 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1171 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1172 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1173 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1174 if (ST.shouldClusterStores())
1175 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1176 return DAG;
1177 }
1178
createMachineFunctionInfo(BumpPtrAllocator & Allocator,const Function & F,const TargetSubtargetInfo * STI) const1179 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1180 BumpPtrAllocator &Allocator, const Function &F,
1181 const TargetSubtargetInfo *STI) const {
1182 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1183 Allocator, F, static_cast<const R600Subtarget *>(STI));
1184 }
1185
1186 //===----------------------------------------------------------------------===//
1187 // GCN Pass Setup
1188 //===----------------------------------------------------------------------===//
1189
createMachineScheduler(MachineSchedContext * C) const1190 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1191 MachineSchedContext *C) const {
1192 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1193 if (ST.enableSIScheduler())
1194 return createSIMachineScheduler(C);
1195
1196 if (EnableMaxIlpSchedStrategy)
1197 return createGCNMaxILPMachineScheduler(C);
1198
1199 return createGCNMaxOccupancyMachineScheduler(C);
1200 }
1201
addPreISel()1202 bool GCNPassConfig::addPreISel() {
1203 AMDGPUPassConfig::addPreISel();
1204
1205 if (TM->getOptLevel() > CodeGenOptLevel::None)
1206 addPass(createSinkingPass());
1207
1208 if (TM->getOptLevel() > CodeGenOptLevel::None)
1209 addPass(createAMDGPULateCodeGenPreparePass());
1210
1211 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1212 // regions formed by them.
1213 addPass(&AMDGPUUnifyDivergentExitNodesID);
1214 if (!LateCFGStructurize && !DisableStructurizer) {
1215 if (EnableStructurizerWorkarounds) {
1216 addPass(createFixIrreduciblePass());
1217 addPass(createUnifyLoopExitsPass());
1218 }
1219 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1220 }
1221 addPass(createAMDGPUAnnotateUniformValues());
1222 if (!LateCFGStructurize && !DisableStructurizer) {
1223 addPass(createSIAnnotateControlFlowPass());
1224 // TODO: Move this right after structurizeCFG to avoid extra divergence
1225 // analysis. This depends on stopping SIAnnotateControlFlow from making
1226 // control flow modifications.
1227 addPass(createAMDGPURewriteUndefForPHILegacyPass());
1228 }
1229 addPass(createLCSSAPass());
1230
1231 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1232 addPass(&AMDGPUPerfHintAnalysisID);
1233
1234 return false;
1235 }
1236
addMachineSSAOptimization()1237 void GCNPassConfig::addMachineSSAOptimization() {
1238 TargetPassConfig::addMachineSSAOptimization();
1239
1240 // We want to fold operands after PeepholeOptimizer has run (or as part of
1241 // it), because it will eliminate extra copies making it easier to fold the
1242 // real source operand. We want to eliminate dead instructions after, so that
1243 // we see fewer uses of the copies. We then need to clean up the dead
1244 // instructions leftover after the operands are folded as well.
1245 //
1246 // XXX - Can we get away without running DeadMachineInstructionElim again?
1247 addPass(&SIFoldOperandsID);
1248 if (EnableDPPCombine)
1249 addPass(&GCNDPPCombineID);
1250 addPass(&SILoadStoreOptimizerID);
1251 if (isPassEnabled(EnableSDWAPeephole)) {
1252 addPass(&SIPeepholeSDWAID);
1253 addPass(&EarlyMachineLICMID);
1254 addPass(&MachineCSEID);
1255 addPass(&SIFoldOperandsID);
1256 }
1257 addPass(&DeadMachineInstructionElimID);
1258 addPass(createSIShrinkInstructionsPass());
1259 }
1260
addILPOpts()1261 bool GCNPassConfig::addILPOpts() {
1262 if (EnableEarlyIfConversion)
1263 addPass(&EarlyIfConverterID);
1264
1265 TargetPassConfig::addILPOpts();
1266 return false;
1267 }
1268
addInstSelector()1269 bool GCNPassConfig::addInstSelector() {
1270 AMDGPUPassConfig::addInstSelector();
1271 addPass(&SIFixSGPRCopiesID);
1272 addPass(createSILowerI1CopiesPass());
1273 return false;
1274 }
1275
addIRTranslator()1276 bool GCNPassConfig::addIRTranslator() {
1277 addPass(new IRTranslator(getOptLevel()));
1278 return false;
1279 }
1280
addPreLegalizeMachineIR()1281 void GCNPassConfig::addPreLegalizeMachineIR() {
1282 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1283 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1284 addPass(new Localizer());
1285 }
1286
addLegalizeMachineIR()1287 bool GCNPassConfig::addLegalizeMachineIR() {
1288 addPass(new Legalizer());
1289 return false;
1290 }
1291
addPreRegBankSelect()1292 void GCNPassConfig::addPreRegBankSelect() {
1293 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1294 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1295 addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
1296 }
1297
addRegBankSelect()1298 bool GCNPassConfig::addRegBankSelect() {
1299 addPass(new AMDGPURegBankSelect());
1300 return false;
1301 }
1302
addPreGlobalInstructionSelect()1303 void GCNPassConfig::addPreGlobalInstructionSelect() {
1304 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1305 addPass(createAMDGPURegBankCombiner(IsOptNone));
1306 }
1307
addGlobalInstructionSelect()1308 bool GCNPassConfig::addGlobalInstructionSelect() {
1309 addPass(new InstructionSelect(getOptLevel()));
1310 return false;
1311 }
1312
addPreRegAlloc()1313 void GCNPassConfig::addPreRegAlloc() {
1314 if (LateCFGStructurize) {
1315 addPass(createAMDGPUMachineCFGStructurizerPass());
1316 }
1317 }
1318
addFastRegAlloc()1319 void GCNPassConfig::addFastRegAlloc() {
1320 // FIXME: We have to disable the verifier here because of PHIElimination +
1321 // TwoAddressInstructions disabling it.
1322
1323 // This must be run immediately after phi elimination and before
1324 // TwoAddressInstructions, otherwise the processing of the tied operand of
1325 // SI_ELSE will introduce a copy of the tied operand source after the else.
1326 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1327
1328 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1329
1330 TargetPassConfig::addFastRegAlloc();
1331 }
1332
addOptimizedRegAlloc()1333 void GCNPassConfig::addOptimizedRegAlloc() {
1334 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1335 // instructions that cause scheduling barriers.
1336 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1337
1338 if (OptExecMaskPreRA)
1339 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1340
1341 if (EnableRewritePartialRegUses)
1342 insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1343
1344 if (isPassEnabled(EnablePreRAOptimizations))
1345 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1346
1347 // This is not an essential optimization and it has a noticeable impact on
1348 // compilation time, so we only enable it from O2.
1349 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1350 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1351
1352 // FIXME: when an instruction has a Killed operand, and the instruction is
1353 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1354 // the register in LiveVariables, this would trigger a failure in verifier,
1355 // we should fix it and enable the verifier.
1356 if (OptVGPRLiveRange)
1357 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1358 // This must be run immediately after phi elimination and before
1359 // TwoAddressInstructions, otherwise the processing of the tied operand of
1360 // SI_ELSE will introduce a copy of the tied operand source after the else.
1361 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1362
1363 if (EnableDCEInRA)
1364 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1365
1366 TargetPassConfig::addOptimizedRegAlloc();
1367 }
1368
addPreRewrite()1369 bool GCNPassConfig::addPreRewrite() {
1370 addPass(&SILowerWWMCopiesID);
1371 if (EnableRegReassign)
1372 addPass(&GCNNSAReassignID);
1373 return true;
1374 }
1375
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

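// The generic single-allocator hook is intentionally unreachable; AMDGPU
// always uses the split SGPR/VGPR allocation pipeline above.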
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

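// Both register-assignment flavors allocate SGPRs first, lower SGPR spills,
// pre-allocate WWM registers, and only then allocate VGPRs, so that VGPRs
// consumed by SGPR spilling and WWM are known before the VGPR allocator runs.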
bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

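// Final machine-level lowering before MC emission: memory legalization,
// waitcnt and mode-register insertion, hazard handling, and branch relaxation.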
void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee that it can handle all hazards correctly. This is because, if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

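// Register SIMachineFunctionInfo as a MachineRegisterInfo delegate so it is
// notified of virtual-register events (e.g. creation and cloning) and can
// keep its per-virtual-register state in sync.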
void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

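// The hooks below serialize SIMachineFunctionInfo to and from MIR YAML,
// primarily so that machine-function state survives MIR round trips in tests.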
yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

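  // Translate an optional YAML SIArgument into an ArgDescriptor (register or
  // stack slot, with an optional mask) and account for the user/system SGPRs
  // it consumes.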
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

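  // Map the boolean denormal flags from the YAML form onto DenormalMode:
  // IEEE when the flag is set, PreserveSign otherwise.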
  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}
