xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware-specific
11 /// information needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCtorDtorLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUIGroupLP.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPURegBankSelect.h"
23 #include "AMDGPUTargetObjectFile.h"
24 #include "AMDGPUTargetTransformInfo.h"
25 #include "AMDGPUUnifyDivergentExitNodes.h"
26 #include "GCNIterativeScheduler.h"
27 #include "GCNSchedStrategy.h"
28 #include "GCNVOPDUtils.h"
29 #include "R600.h"
30 #include "R600MachineFunctionInfo.h"
31 #include "R600TargetMachine.h"
32 #include "SIMachineFunctionInfo.h"
33 #include "SIMachineScheduler.h"
34 #include "TargetInfo/AMDGPUTargetInfo.h"
35 #include "Utils/AMDGPUBaseInfo.h"
36 #include "llvm/Analysis/CGSCCPassManager.h"
37 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
41 #include "llvm/CodeGen/GlobalISel/Localizer.h"
42 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43 #include "llvm/CodeGen/MIRParser/MIParser.h"
44 #include "llvm/CodeGen/Passes.h"
45 #include "llvm/CodeGen/RegAllocRegistry.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/IR/IntrinsicsAMDGPU.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/IR/PatternMatch.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/MC/TargetRegistry.h"
52 #include "llvm/Passes/PassBuilder.h"
53 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
54 #include "llvm/Transforms/IPO.h"
55 #include "llvm/Transforms/IPO/AlwaysInliner.h"
56 #include "llvm/Transforms/IPO/GlobalDCE.h"
57 #include "llvm/Transforms/IPO/Internalize.h"
58 #include "llvm/Transforms/Scalar.h"
59 #include "llvm/Transforms/Scalar/GVN.h"
60 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
61 #include "llvm/Transforms/Utils.h"
62 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
63 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
64 #include <optional>
65 
66 using namespace llvm;
67 using namespace llvm::PatternMatch;
68 
69 namespace {
70 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
71 public:
72   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
73     : RegisterRegAllocBase(N, D, C) {}
74 };
75 
76 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
77 public:
78   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
79     : RegisterRegAllocBase(N, D, C) {}
80 };
81 
82 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
83                               const TargetRegisterClass &RC) {
84   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
85 }
86 
87 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
88                               const TargetRegisterClass &RC) {
89   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
90 }
91 
92 
93 /// -{sgpr|vgpr}-regalloc=... command line option.
94 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
95 
96 /// A dummy default pass factory indicates whether the register allocator is
97 /// overridden on the command line.
98 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
99 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
100 
101 static SGPRRegisterRegAlloc
102 defaultSGPRRegAlloc("default",
103                     "pick SGPR register allocator based on -O option",
104                     useDefaultRegisterAllocator);
105 
106 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
107                RegisterPassParser<SGPRRegisterRegAlloc>>
108 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
109              cl::desc("Register allocator to use for SGPRs"));
110 
111 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
112                RegisterPassParser<VGPRRegisterRegAlloc>>
113 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
114              cl::desc("Register allocator to use for VGPRs"));
115 
116 
117 static void initializeDefaultSGPRRegisterAllocatorOnce() {
118   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
119 
120   if (!Ctor) {
121     Ctor = SGPRRegAlloc;
122     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
123   }
124 }
125 
126 static void initializeDefaultVGPRRegisterAllocatorOnce() {
127   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
128 
129   if (!Ctor) {
130     Ctor = VGPRRegAlloc;
131     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
132   }
133 }
134 
135 static FunctionPass *createBasicSGPRRegisterAllocator() {
136   return createBasicRegisterAllocator(onlyAllocateSGPRs);
137 }
138 
139 static FunctionPass *createGreedySGPRRegisterAllocator() {
140   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
141 }
142 
143 static FunctionPass *createFastSGPRRegisterAllocator() {
144   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
145 }
146 
147 static FunctionPass *createBasicVGPRRegisterAllocator() {
148   return createBasicRegisterAllocator(onlyAllocateVGPRs);
149 }
150 
151 static FunctionPass *createGreedyVGPRRegisterAllocator() {
152   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
153 }
154 
155 static FunctionPass *createFastVGPRRegisterAllocator() {
156   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
157 }
158 
159 static SGPRRegisterRegAlloc basicRegAllocSGPR(
160   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
161 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
162   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
163 
164 static SGPRRegisterRegAlloc fastRegAllocSGPR(
165   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
166 
167 
168 static VGPRRegisterRegAlloc basicRegAllocVGPR(
169   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
170 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
171   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
172 
173 static VGPRRegisterRegAlloc fastRegAllocVGPR(
174   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
175 }
176 
177 static cl::opt<bool>
178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
179                         cl::desc("Run early if-conversion"),
180                         cl::init(false));
181 
182 static cl::opt<bool>
183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
184             cl::desc("Run pre-RA exec mask optimizations"),
185             cl::init(true));
186 
187 static cl::opt<bool>
188     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
189                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
190                   cl::init(true), cl::Hidden);
191 
192 // Option to disable vectorizer for tests.
193 static cl::opt<bool> EnableLoadStoreVectorizer(
194   "amdgpu-load-store-vectorizer",
195   cl::desc("Enable load store vectorizer"),
196   cl::init(true),
197   cl::Hidden);
198 
199 // Option to control global loads scalarization
200 static cl::opt<bool> ScalarizeGlobal(
201   "amdgpu-scalarize-global-loads",
202   cl::desc("Enable global load scalarization"),
203   cl::init(true),
204   cl::Hidden);
205 
206 // Option to run internalize pass.
207 static cl::opt<bool> InternalizeSymbols(
208   "amdgpu-internalize-symbols",
209   cl::desc("Enable elimination of non-kernel functions and unused globals"),
210   cl::init(false),
211   cl::Hidden);
212 
213 // Option to inline all early.
214 static cl::opt<bool> EarlyInlineAll(
215   "amdgpu-early-inline-all",
216   cl::desc("Inline all functions early"),
217   cl::init(false),
218   cl::Hidden);
219 
220 static cl::opt<bool> RemoveIncompatibleFunctions(
221     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
222     cl::desc("Enable removal of functions when they"
223              "use features not supported by the target GPU"),
224     cl::init(true));
225 
226 static cl::opt<bool> EnableSDWAPeephole(
227   "amdgpu-sdwa-peephole",
228   cl::desc("Enable SDWA peepholer"),
229   cl::init(true));
230 
231 static cl::opt<bool> EnableDPPCombine(
232   "amdgpu-dpp-combine",
233   cl::desc("Enable DPP combiner"),
234   cl::init(true));
235 
236 // Enable address space based alias analysis
237 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
238   cl::desc("Enable AMDGPU Alias Analysis"),
239   cl::init(true));
240 
241 // Option to run late CFG structurizer
242 static cl::opt<bool, true> LateCFGStructurize(
243   "amdgpu-late-structurize",
244   cl::desc("Enable late CFG structurization"),
245   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
246   cl::Hidden);
247 
248 // Enable lib calls simplifications
249 static cl::opt<bool> EnableLibCallSimplify(
250   "amdgpu-simplify-libcall",
251   cl::desc("Enable amdgpu library simplifications"),
252   cl::init(true),
253   cl::Hidden);
254 
255 static cl::opt<bool> EnableLowerKernelArguments(
256   "amdgpu-ir-lower-kernel-arguments",
257   cl::desc("Lower kernel argument loads in IR pass"),
258   cl::init(true),
259   cl::Hidden);
260 
261 static cl::opt<bool> EnableRegReassign(
262   "amdgpu-reassign-regs",
263   cl::desc("Enable register reassign optimizations on gfx10+"),
264   cl::init(true),
265   cl::Hidden);
266 
267 static cl::opt<bool> OptVGPRLiveRange(
268     "amdgpu-opt-vgpr-liverange",
269     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
270     cl::init(true), cl::Hidden);
271 
272 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
273     "amdgpu-atomic-optimizer-strategy",
274     cl::desc("Select DPP or Iterative strategy for scan"),
275     cl::init(ScanOptions::Iterative),
276     cl::values(
277         clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
278         clEnumValN(ScanOptions::Iterative, "Iterative",
279                    "Use Iterative approach for scan"),
280         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
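// For illustration, the scan strategy can be overridden on the command line
// (assumed llc invocation; the input file name is hypothetical):
//   llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-atomic-optimizer-strategy=DPP foo.ll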
281 
282 // Enable Mode register optimization
283 static cl::opt<bool> EnableSIModeRegisterPass(
284   "amdgpu-mode-register",
285   cl::desc("Enable mode register pass"),
286   cl::init(true),
287   cl::Hidden);
288 
289 // Enable GFX11.5+ s_singleuse_vdst insertion
290 static cl::opt<bool>
291     EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
292                               cl::desc("Enable s_singleuse_vdst insertion"),
293                               cl::init(false), cl::Hidden);
294 
295 // Enable GFX11+ s_delay_alu insertion
296 static cl::opt<bool>
297     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
298                          cl::desc("Enable s_delay_alu insertion"),
299                          cl::init(true), cl::Hidden);
300 
301 // Enable GFX11+ VOPD
302 static cl::opt<bool>
303     EnableVOPD("amdgpu-enable-vopd",
304                cl::desc("Enable VOPD, dual issue of VALU in wave32"),
305                cl::init(true), cl::Hidden);
306 
307 // Option used in lit tests to prevent dead-code elimination of the inspected patterns.
308 static cl::opt<bool>
309 EnableDCEInRA("amdgpu-dce-in-ra",
310     cl::init(true), cl::Hidden,
311     cl::desc("Enable machine DCE inside regalloc"));
312 
313 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
314                                            cl::desc("Adjust wave priority"),
315                                            cl::init(false), cl::Hidden);
316 
317 static cl::opt<bool> EnableScalarIRPasses(
318   "amdgpu-scalar-ir-passes",
319   cl::desc("Enable scalar IR passes"),
320   cl::init(true),
321   cl::Hidden);
322 
323 static cl::opt<bool> EnableStructurizerWorkarounds(
324     "amdgpu-enable-structurizer-workarounds",
325     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
326     cl::Hidden);
327 
328 static cl::opt<bool, true> EnableLowerModuleLDS(
329     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
330     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
331     cl::Hidden);
332 
333 static cl::opt<bool> EnablePreRAOptimizations(
334     "amdgpu-enable-pre-ra-optimizations",
335     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
336     cl::Hidden);
337 
338 static cl::opt<bool> EnablePromoteKernelArguments(
339     "amdgpu-enable-promote-kernel-arguments",
340     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
341     cl::Hidden, cl::init(true));
342 
343 static cl::opt<bool> EnableImageIntrinsicOptimizer(
344     "amdgpu-enable-image-intrinsic-optimizer",
345     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
346     cl::Hidden);
347 
348 static cl::opt<bool>
349     EnableLoopPrefetch("amdgpu-loop-prefetch",
350                        cl::desc("Enable loop data prefetch on AMDGPU"),
351                        cl::Hidden, cl::init(false));
352 
353 static cl::opt<bool> EnableMaxIlpSchedStrategy(
354     "amdgpu-enable-max-ilp-scheduling-strategy",
355     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
356     cl::Hidden, cl::init(false));
357 
358 static cl::opt<bool> EnableRewritePartialRegUses(
359     "amdgpu-enable-rewrite-partial-reg-uses",
360     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
361     cl::Hidden);
362 
363 static cl::opt<bool> EnableHipStdPar(
364   "amdgpu-enable-hipstdpar",
365   cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
366   cl::Hidden);
367 
368 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
369   // Register the target
370   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
371   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
372 
373   PassRegistry *PR = PassRegistry::getPassRegistry();
374   initializeR600ClauseMergePassPass(*PR);
375   initializeR600ControlFlowFinalizerPass(*PR);
376   initializeR600PacketizerPass(*PR);
377   initializeR600ExpandSpecialInstrsPassPass(*PR);
378   initializeR600VectorRegMergerPass(*PR);
379   initializeGlobalISel(*PR);
380   initializeAMDGPUDAGToDAGISelPass(*PR);
381   initializeGCNDPPCombinePass(*PR);
382   initializeSILowerI1CopiesPass(*PR);
383   initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
384   initializeSILowerWWMCopiesPass(*PR);
385   initializeSILowerSGPRSpillsPass(*PR);
386   initializeSIFixSGPRCopiesPass(*PR);
387   initializeSIFixVGPRCopiesPass(*PR);
388   initializeSIFoldOperandsPass(*PR);
389   initializeSIPeepholeSDWAPass(*PR);
390   initializeSIShrinkInstructionsPass(*PR);
391   initializeSIOptimizeExecMaskingPreRAPass(*PR);
392   initializeSIOptimizeVGPRLiveRangePass(*PR);
393   initializeSILoadStoreOptimizerPass(*PR);
394   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
395   initializeAMDGPUAlwaysInlinePass(*PR);
396   initializeAMDGPUAttributorLegacyPass(*PR);
397   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
398   initializeAMDGPUAnnotateUniformValuesPass(*PR);
399   initializeAMDGPUArgumentUsageInfoPass(*PR);
400   initializeAMDGPUAtomicOptimizerPass(*PR);
401   initializeAMDGPULowerKernelArgumentsPass(*PR);
402   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
403   initializeAMDGPULowerKernelAttributesPass(*PR);
404   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
405   initializeAMDGPUPostLegalizerCombinerPass(*PR);
406   initializeAMDGPUPreLegalizerCombinerPass(*PR);
407   initializeAMDGPURegBankCombinerPass(*PR);
408   initializeAMDGPURegBankSelectPass(*PR);
409   initializeAMDGPUPromoteAllocaPass(*PR);
410   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
411   initializeAMDGPUCodeGenPreparePass(*PR);
412   initializeAMDGPULateCodeGenPreparePass(*PR);
413   initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
414   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
415   initializeAMDGPURewriteOutArgumentsPass(*PR);
416   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
417   initializeAMDGPUUnifyMetadataPass(*PR);
418   initializeSIAnnotateControlFlowPass(*PR);
419   initializeAMDGPUInsertSingleUseVDSTPass(*PR);
420   initializeAMDGPUInsertDelayAluPass(*PR);
421   initializeSIInsertHardClausesPass(*PR);
422   initializeSIInsertWaitcntsPass(*PR);
423   initializeSIModeRegisterPass(*PR);
424   initializeSIWholeQuadModePass(*PR);
425   initializeSILowerControlFlowPass(*PR);
426   initializeSIPreEmitPeepholePass(*PR);
427   initializeSILateBranchLoweringPass(*PR);
428   initializeSIMemoryLegalizerPass(*PR);
429   initializeSIOptimizeExecMaskingPass(*PR);
430   initializeSIPreAllocateWWMRegsPass(*PR);
431   initializeSIFormMemoryClausesPass(*PR);
432   initializeSIPostRABundlerPass(*PR);
433   initializeGCNCreateVOPDPass(*PR);
434   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
435   initializeAMDGPUAAWrapperPassPass(*PR);
436   initializeAMDGPUExternalAAWrapperPass(*PR);
437   initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
438   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
439   initializeAMDGPUResourceUsageAnalysisPass(*PR);
440   initializeGCNNSAReassignPass(*PR);
441   initializeGCNPreRAOptimizationsPass(*PR);
442   initializeGCNPreRALongBranchRegPass(*PR);
443   initializeGCNRewritePartialRegUsesPass(*PR);
444   initializeGCNRegPressurePrinterPass(*PR);
445 }
446 
447 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
448   return std::make_unique<AMDGPUTargetObjectFile>();
449 }
450 
451 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
452   return new SIScheduleDAGMI(C);
453 }
454 
455 static ScheduleDAGInstrs *
456 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
457   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
458   ScheduleDAGMILive *DAG =
459     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
460   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
461   if (ST.shouldClusterStores())
462     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
463   DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
464   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
465   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
466   return DAG;
467 }
468 
469 static ScheduleDAGInstrs *
470 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
471   ScheduleDAGMILive *DAG =
472       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
473   DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
474   return DAG;
475 }
476 
477 static ScheduleDAGInstrs *
478 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
479   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
480   auto DAG = new GCNIterativeScheduler(C,
481     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
482   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
483   if (ST.shouldClusterStores())
484     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
485   return DAG;
486 }
487 
488 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
489   return new GCNIterativeScheduler(C,
490     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
491 }
492 
493 static ScheduleDAGInstrs *
494 createIterativeILPMachineScheduler(MachineSchedContext *C) {
495   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
496   auto DAG = new GCNIterativeScheduler(C,
497     GCNIterativeScheduler::SCHEDULE_ILP);
498   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
499   if (ST.shouldClusterStores())
500     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
501   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
502   return DAG;
503 }
504 
505 static MachineSchedRegistry
506 SISchedRegistry("si", "Run SI's custom scheduler",
507                 createSIMachineScheduler);
508 
509 static MachineSchedRegistry
510 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
511                              "Run GCN scheduler to maximize occupancy",
512                              createGCNMaxOccupancyMachineScheduler);
513 
514 static MachineSchedRegistry
515     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ILP",
516                            createGCNMaxILPMachineScheduler);
517 
518 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
519     "gcn-iterative-max-occupancy-experimental",
520     "Run GCN scheduler to maximize occupancy (experimental)",
521     createIterativeGCNMaxOccupancyMachineScheduler);
522 
523 static MachineSchedRegistry GCNMinRegSchedRegistry(
524     "gcn-iterative-minreg",
525     "Run GCN iterative scheduler for minimal register usage (experimental)",
526     createMinRegScheduler);
527 
528 static MachineSchedRegistry GCNILPSchedRegistry(
529     "gcn-iterative-ilp",
530     "Run GCN iterative scheduler for ILP scheduling (experimental)",
531     createIterativeILPMachineScheduler);
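// For illustration, the registries above make these strategies selectable by
// name via -misched (assumed invocation; the input file name is hypothetical):
//   llc -mtriple=amdgcn -mcpu=gfx90a -misched=gcn-iterative-ilp foo.ll
// When no scheduler is named, GCNPassConfig::createMachineScheduler below picks
// the default strategy.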
532 
533 static StringRef computeDataLayout(const Triple &TT) {
534   if (TT.getArch() == Triple::r600) {
535     // 32-bit pointers.
536     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
537            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
538   }
539 
540   // 32-bit private, local, and region pointers. 64-bit global, constant and
541   // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
542   // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
543   // (address space 7), and 128-bit non-integral buffer resources (address
544   // space 8) which cannot be non-trivially accessed by LLVM memory operations
545   // like getelementptr.
546   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
547          "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
548          "v32:32-v48:64-v96:"
549          "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
550          "G1-ni:7:8:9";
551 }
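// For reference, a data layout pointer entry reads p<AS>:<size>:<abi>[:<pref>[:<idx>]]
// (all widths in bits). For example, "p7:160:256:256:32" above declares 160-bit
// fat buffer pointers in address space 7 with 256-bit ABI/preferred alignment
// and a 32-bit index width, and "ni:7:8:9" marks those address spaces as
// non-integral.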
552 
553 LLVM_READNONE
554 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
555   if (!GPU.empty())
556     return GPU;
557 
558   // Need to default to a target with flat support for HSA.
559   if (TT.getArch() == Triple::amdgcn)
560     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
561 
562   return "r600";
563 }
564 
565 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
566   // The AMDGPU toolchain only supports generating shared objects, so we
567   // must always use PIC.
568   return Reloc::PIC_;
569 }
570 
571 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
572                                          StringRef CPU, StringRef FS,
573                                          TargetOptions Options,
574                                          std::optional<Reloc::Model> RM,
575                                          std::optional<CodeModel::Model> CM,
576                                          CodeGenOptLevel OptLevel)
577     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
578                         FS, Options, getEffectiveRelocModel(RM),
579                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
580       TLOF(createTLOF(getTargetTriple())) {
581   initAsmInfo();
582   if (TT.getArch() == Triple::amdgcn) {
583     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
584       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
585     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
586       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
587   }
588 }
589 
590 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
591 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
592 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
593 
594 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
595 
596 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
597   Attribute GPUAttr = F.getFnAttribute("target-cpu");
598   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
599 }
600 
601 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
602   Attribute FSAttr = F.getFnAttribute("target-features");
603 
604   return FSAttr.isValid() ? FSAttr.getValueAsString()
605                           : getTargetFeatureString();
606 }
607 
608 /// Predicate for Internalize pass.
609 static bool mustPreserveGV(const GlobalValue &GV) {
610   if (const Function *F = dyn_cast<Function>(&GV))
611     return F->isDeclaration() || F->getName().starts_with("__asan_") ||
612            F->getName().starts_with("__sanitizer_") ||
613            AMDGPU::isEntryFunctionCC(F->getCallingConv());
614 
615   GV.removeDeadConstantUsers();
616   return !GV.use_empty();
617 }
618 
619 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
620   AAM.registerFunctionAnalysis<AMDGPUAA>();
621 }
622 
623 void AMDGPUTargetMachine::registerPassBuilderCallbacks(
624     PassBuilder &PB, bool PopulateClassToPassNames) {
625   PB.registerPipelineParsingCallback(
626       [this](StringRef PassName, ModulePassManager &PM,
627              ArrayRef<PassBuilder::PipelineElement>) {
628         if (PassName == "amdgpu-attributor") {
629           PM.addPass(AMDGPUAttributorPass(*this));
630           return true;
631         }
632         if (PassName == "amdgpu-unify-metadata") {
633           PM.addPass(AMDGPUUnifyMetadataPass());
634           return true;
635         }
636         if (PassName == "amdgpu-printf-runtime-binding") {
637           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
638           return true;
639         }
640         if (PassName == "amdgpu-always-inline") {
641           PM.addPass(AMDGPUAlwaysInlinePass());
642           return true;
643         }
644         if (PassName == "amdgpu-lower-module-lds") {
645           PM.addPass(AMDGPULowerModuleLDSPass(*this));
646           return true;
647         }
648         if (PassName == "amdgpu-lower-ctor-dtor") {
649           PM.addPass(AMDGPUCtorDtorLoweringPass());
650           return true;
651         }
652         return false;
653       });
654   PB.registerPipelineParsingCallback(
655       [this](StringRef PassName, FunctionPassManager &PM,
656              ArrayRef<PassBuilder::PipelineElement>) {
657         if (PassName == "amdgpu-simplifylib") {
658           PM.addPass(AMDGPUSimplifyLibCallsPass());
659           return true;
660         }
661         if (PassName == "amdgpu-image-intrinsic-opt") {
662           PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this));
663           return true;
664         }
665         if (PassName == "amdgpu-usenative") {
666           PM.addPass(AMDGPUUseNativeCallsPass());
667           return true;
668         }
669         if (PassName == "amdgpu-promote-alloca") {
670           PM.addPass(AMDGPUPromoteAllocaPass(*this));
671           return true;
672         }
673         if (PassName == "amdgpu-promote-alloca-to-vector") {
674           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
675           return true;
676         }
677         if (PassName == "amdgpu-lower-kernel-attributes") {
678           PM.addPass(AMDGPULowerKernelAttributesPass());
679           return true;
680         }
681         if (PassName == "amdgpu-promote-kernel-arguments") {
682           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
683           return true;
684         }
685         if (PassName == "amdgpu-unify-divergent-exit-nodes") {
686           PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
687           return true;
688         }
689         if (PassName == "amdgpu-atomic-optimizer") {
690           PM.addPass(
691               AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
692           return true;
693         }
694         if (PassName == "amdgpu-codegenprepare") {
695           PM.addPass(AMDGPUCodeGenPreparePass(*this));
696           return true;
697         }
698         if (PassName == "amdgpu-lower-kernel-arguments") {
699           PM.addPass(AMDGPULowerKernelArgumentsPass(*this));
700           return true;
701         }
702         if (PassName == "amdgpu-rewrite-undef-for-phi") {
703           PM.addPass(AMDGPURewriteUndefForPHIPass());
704           return true;
705         }
706         return false;
707       });
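  // For illustration, the pass names registered above can be invoked by name
  // from the new pass manager (assumed opt invocation; file name hypothetical):
  //   opt -passes='amdgpu-promote-alloca,amdgpu-lower-kernel-attributes' -S foo.ll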
708 
709   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
710     FAM.registerPass([&] { return AMDGPUAA(); });
711   });
712 
713   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
714     if (AAName == "amdgpu-aa") {
715       AAM.registerFunctionAnalysis<AMDGPUAA>();
716       return true;
717     }
718     return false;
719   });
720 
721   PB.registerPipelineStartEPCallback(
722       [](ModulePassManager &PM, OptimizationLevel Level) {
723         FunctionPassManager FPM;
724         FPM.addPass(AMDGPUUseNativeCallsPass());
725         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
726           FPM.addPass(AMDGPUSimplifyLibCallsPass());
727         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
728         if (EnableHipStdPar)
729           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
730       });
731 
732   PB.registerPipelineEarlySimplificationEPCallback(
733       [](ModulePassManager &PM, OptimizationLevel Level) {
734         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
735 
736         if (Level == OptimizationLevel::O0)
737           return;
738 
739         PM.addPass(AMDGPUUnifyMetadataPass());
740 
741         if (InternalizeSymbols) {
742           PM.addPass(InternalizePass(mustPreserveGV));
743           PM.addPass(GlobalDCEPass());
744         }
745 
746         if (EarlyInlineAll && !EnableFunctionCalls)
747           PM.addPass(AMDGPUAlwaysInlinePass());
748       });
749 
750   PB.registerCGSCCOptimizerLateEPCallback(
751       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
752         if (Level == OptimizationLevel::O0)
753           return;
754 
755         FunctionPassManager FPM;
756 
757         // Add the promote kernel arguments pass to the opt pipeline right
758         // before the infer address spaces pass, which is needed to do the
759         // actual address space rewriting.
760         if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
761             EnablePromoteKernelArguments)
762           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
763 
764         // Add infer address spaces pass to the opt pipeline after inlining
765         // but before SROA to increase SROA opportunities.
766         FPM.addPass(InferAddressSpacesPass());
767 
768         // This should run after inlining to have any chance of doing
769         // anything, and before other cleanup optimizations.
770         FPM.addPass(AMDGPULowerKernelAttributesPass());
771 
772         if (Level != OptimizationLevel::O0) {
773           // Promote alloca to vector before SROA and loop unroll. If we
774           // manage to eliminate allocas before unroll we may choose to unroll
775           // less.
776           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
777         }
778 
779         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
780       });
781 }
782 
783 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
784   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
785           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
786           AddrSpace == AMDGPUAS::REGION_ADDRESS)
787              ? -1
788              : 0;
789 }
790 
791 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
792                                               unsigned DestAS) const {
793   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
794          AMDGPU::isFlatGlobalAddrSpace(DestAS);
795 }
796 
797 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
798   const auto *LD = dyn_cast<LoadInst>(V);
799   if (!LD)
800     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
801 
802   // It must be a load of a generic pointer.
803   assert(V->getType()->isPointerTy() &&
804          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
805 
806   const auto *Ptr = LD->getPointerOperand();
807   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
808     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
809   // A generic pointer loaded from constant memory can be assumed to be a
810   // global pointer, since constant memory is only populated on the host
811   // side and, as implied by the offload programming model, only global
812   // pointers can be referenced on the host side.
813   return AMDGPUAS::GLOBAL_ADDRESS;
814 }
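// For illustration, an IR shape (assumed, simplified) that the heuristic above
// applies to:
//   %p = load ptr, ptr addrspace(4) %q   ; flat pointer loaded from constant memory
// The loaded %p is then assumed to address global memory (addrspace(1)).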
815 
816 std::pair<const Value *, unsigned>
817 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
818   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
819     switch (II->getIntrinsicID()) {
820     case Intrinsic::amdgcn_is_shared:
821       return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
822     case Intrinsic::amdgcn_is_private:
823       return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
824     default:
825       break;
826     }
827     return std::pair(nullptr, -1);
828   }
829   // Check the global pointer predication based on
830   // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
831   // and the order of 'is_shared' and 'is_private' is not significant.
832   Value *Ptr;
833   if (match(
834           const_cast<Value *>(V),
835           m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
836                   m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
837                       m_Deferred(Ptr))))))
838     return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
839 
840   return std::pair(nullptr, -1);
841 }
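// For illustration, an IR shape (assumed, simplified) matched by the pattern
// above:
//   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %pr = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %ns = xor i1 %s, true
//   %np = xor i1 %pr, true
//   %g  = and i1 %ns, %np   ; here %p can be treated as a global pointer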
842 
843 unsigned
844 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
845   switch (Kind) {
846   case PseudoSourceValue::Stack:
847   case PseudoSourceValue::FixedStack:
848     return AMDGPUAS::PRIVATE_ADDRESS;
849   case PseudoSourceValue::ConstantPool:
850   case PseudoSourceValue::GOT:
851   case PseudoSourceValue::JumpTable:
852   case PseudoSourceValue::GlobalValueCallEntry:
853   case PseudoSourceValue::ExternalSymbolCallEntry:
854     return AMDGPUAS::CONSTANT_ADDRESS;
855   }
856   return AMDGPUAS::FLAT_ADDRESS;
857 }
858 
859 //===----------------------------------------------------------------------===//
860 // GCN Target Machine (SI+)
861 //===----------------------------------------------------------------------===//
862 
863 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
864                                    StringRef CPU, StringRef FS,
865                                    TargetOptions Options,
866                                    std::optional<Reloc::Model> RM,
867                                    std::optional<CodeModel::Model> CM,
868                                    CodeGenOptLevel OL, bool JIT)
869     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
870 
871 const TargetSubtargetInfo *
872 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
873   StringRef GPU = getGPUName(F);
874   StringRef FS = getFeatureString(F);
875 
876   SmallString<128> SubtargetKey(GPU);
877   SubtargetKey.append(FS);
878 
879   auto &I = SubtargetMap[SubtargetKey];
880   if (!I) {
881     // This needs to be done before we create a new subtarget since any
882     // creation will depend on the TM and the code generation flags on the
883     // function that reside in TargetOptions.
884     resetTargetOptions(F);
885     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
886   }
887 
888   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
889 
890   return I.get();
891 }
892 
893 TargetTransformInfo
894 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
895   return TargetTransformInfo(GCNTTIImpl(this, F));
896 }
897 
898 //===----------------------------------------------------------------------===//
899 // AMDGPU Pass Setup
900 //===----------------------------------------------------------------------===//
901 
902 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
903   return getStandardCSEConfigForOpt(TM->getOptLevel());
904 }
905 
906 namespace {
907 
908 class GCNPassConfig final : public AMDGPUPassConfig {
909 public:
910   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
911     : AMDGPUPassConfig(TM, PM) {
912     // It is necessary to know the register usage of the entire call graph.  We
913     // allow calls without EnableAMDGPUFunctionCalls if they are marked
914     // noinline, so this is always required.
915     setRequiresCodeGenSCCOrder(true);
916     substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
917   }
918 
919   GCNTargetMachine &getGCNTargetMachine() const {
920     return getTM<GCNTargetMachine>();
921   }
922 
923   ScheduleDAGInstrs *
924   createMachineScheduler(MachineSchedContext *C) const override;
925 
926   ScheduleDAGInstrs *
927   createPostMachineScheduler(MachineSchedContext *C) const override {
928     ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
929         C, std::make_unique<PostGenericScheduler>(C),
930         /*RemoveKillFlags=*/true);
931     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
932     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
933     if (ST.shouldClusterStores())
934       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
935     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
936     DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
937     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
938       DAG->addMutation(createVOPDPairingMutation());
939     return DAG;
940   }
941 
942   bool addPreISel() override;
943   void addMachineSSAOptimization() override;
944   bool addILPOpts() override;
945   bool addInstSelector() override;
946   bool addIRTranslator() override;
947   void addPreLegalizeMachineIR() override;
948   bool addLegalizeMachineIR() override;
949   void addPreRegBankSelect() override;
950   bool addRegBankSelect() override;
951   void addPreGlobalInstructionSelect() override;
952   bool addGlobalInstructionSelect() override;
953   void addFastRegAlloc() override;
954   void addOptimizedRegAlloc() override;
955 
956   FunctionPass *createSGPRAllocPass(bool Optimized);
957   FunctionPass *createVGPRAllocPass(bool Optimized);
958   FunctionPass *createRegAllocPass(bool Optimized) override;
959 
960   bool addRegAssignAndRewriteFast() override;
961   bool addRegAssignAndRewriteOptimized() override;
962 
963   void addPreRegAlloc() override;
964   bool addPreRewrite() override;
965   void addPostRegAlloc() override;
966   void addPreSched2() override;
967   void addPreEmitPass() override;
968 };
969 
970 } // end anonymous namespace
971 
972 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
973     : TargetPassConfig(TM, PM) {
974   // Exceptions and StackMaps are not supported, so these passes will never do
975   // anything.
976   disablePass(&StackMapLivenessID);
977   disablePass(&FuncletLayoutID);
978   // Garbage collection is not supported.
979   disablePass(&GCLoweringID);
980   disablePass(&ShadowStackGCLoweringID);
981 }
982 
983 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
984   if (getOptLevel() == CodeGenOptLevel::Aggressive)
985     addPass(createGVNPass());
986   else
987     addPass(createEarlyCSEPass());
988 }
989 
990 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
991   if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
992     addPass(createLoopDataPrefetchPass());
993   addPass(createSeparateConstOffsetFromGEPPass());
994   // ReassociateGEPs exposes more opportunities for SLSR. See
995   // the example in reassociate-geps-and-slsr.ll.
996   addPass(createStraightLineStrengthReducePass());
997   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
998   // EarlyCSE can reuse.
999   addEarlyCSEOrGVNPass();
1000   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1001   addPass(createNaryReassociatePass());
1002   // NaryReassociate on GEPs creates redundant common expressions, so run
1003   // EarlyCSE after it.
1004   addPass(createEarlyCSEPass());
1005 }
1006 
1007 void AMDGPUPassConfig::addIRPasses() {
1008   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1009 
1010   Triple::ArchType Arch = TM.getTargetTriple().getArch();
1011   if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
1012     addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1013 
1014   // There is no reason to run these.
1015   disablePass(&StackMapLivenessID);
1016   disablePass(&FuncletLayoutID);
1017   disablePass(&PatchableFunctionID);
1018 
1019   addPass(createAMDGPUPrintfRuntimeBinding());
1020   if (LowerCtorDtor)
1021     addPass(createAMDGPUCtorDtorLoweringLegacyPass());
1022 
1023   if (isPassEnabled(EnableImageIntrinsicOptimizer))
1024     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
1025 
1026   // Function calls are not supported, so make sure we inline everything.
1027   addPass(createAMDGPUAlwaysInlinePass());
1028   addPass(createAlwaysInlinerLegacyPass());
1029 
1030   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1031   if (Arch == Triple::r600)
1032     addPass(createR600OpenCLImageTypeLoweringPass());
1033 
1034   // Replace OpenCL enqueued block function pointers with global variables.
1035   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1036 
1037   // Runs before PromoteAlloca so the latter can account for function uses
1038   if (EnableLowerModuleLDS) {
1039     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
1040   }
1041 
1042   // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
1043   // after their introduction
1044   if (TM.getOptLevel() > CodeGenOptLevel::None)
1045     addPass(createAMDGPUAttributorLegacyPass());
1046 
1047   if (TM.getOptLevel() > CodeGenOptLevel::None)
1048     addPass(createInferAddressSpacesPass());
1049 
1050   // Run atomic optimizer before Atomic Expand
1051   if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1052       (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1053       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1054     addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1055   }
1056 
1057   addPass(createAtomicExpandPass());
1058 
1059   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1060     addPass(createAMDGPUPromoteAlloca());
1061 
1062     if (isPassEnabled(EnableScalarIRPasses))
1063       addStraightLineScalarOptimizationPasses();
1064 
1065     if (EnableAMDGPUAliasAnalysis) {
1066       addPass(createAMDGPUAAWrapperPass());
1067       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1068                                              AAResults &AAR) {
1069         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1070           AAR.addAAResult(WrapperPass->getResult());
1071         }));
1072     }
1073 
1074     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1075       // TODO: May want to move later or split into an early and late one.
1076       addPass(createAMDGPUCodeGenPreparePass());
1077     }
1078 
1079     // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1080     // have expanded.
1081     if (TM.getOptLevel() > CodeGenOptLevel::Less)
1082       addPass(createLICMPass());
1083   }
1084 
1085   TargetPassConfig::addIRPasses();
1086 
1087   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1088   // example, GVN can combine
1089   //
1090   //   %0 = add %a, %b
1091   //   %1 = add %b, %a
1092   //
1093   // and
1094   //
1095   //   %0 = shl nsw %a, 2
1096   //   %1 = shl %a, 2
1097   //
1098   // but EarlyCSE can do neither of them.
1099   if (isPassEnabled(EnableScalarIRPasses))
1100     addEarlyCSEOrGVNPass();
1101 }
1102 
1103 void AMDGPUPassConfig::addCodeGenPrepare() {
1104   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1105     // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1106     // analysis, and should be removed.
1107     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1108   }
1109 
1110   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1111       EnableLowerKernelArguments)
1112     addPass(createAMDGPULowerKernelArgumentsPass());
1113 
1114   TargetPassConfig::addCodeGenPrepare();
1115 
1116   if (isPassEnabled(EnableLoadStoreVectorizer))
1117     addPass(createLoadStoreVectorizerPass());
1118 
1119   // The LowerSwitch pass may introduce unreachable blocks that can cause
1120   // unexpected behavior for subsequent passes. Placing it here seems better,
1121   // as these blocks will get cleaned up by UnreachableBlockElim, which is
1122   // inserted next in the pass flow.
1123   addPass(createLowerSwitchPass());
1124 }
1125 
1126 bool AMDGPUPassConfig::addPreISel() {
1127   if (TM->getOptLevel() > CodeGenOptLevel::None)
1128     addPass(createFlattenCFGPass());
1129   return false;
1130 }
1131 
1132 bool AMDGPUPassConfig::addInstSelector() {
1133   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1134   return false;
1135 }
1136 
1137 bool AMDGPUPassConfig::addGCPasses() {
1138   // Do nothing. GC is not supported.
1139   return false;
1140 }
1141 
1142 llvm::ScheduleDAGInstrs *
1143 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1144   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1145   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1146   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1147   if (ST.shouldClusterStores())
1148     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1149   return DAG;
1150 }
1151 
1152 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1153     BumpPtrAllocator &Allocator, const Function &F,
1154     const TargetSubtargetInfo *STI) const {
1155   return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1156       Allocator, F, static_cast<const R600Subtarget *>(STI));
1157 }
1158 
1159 //===----------------------------------------------------------------------===//
1160 // GCN Pass Setup
1161 //===----------------------------------------------------------------------===//
1162 
1163 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1164   MachineSchedContext *C) const {
1165   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1166   if (ST.enableSIScheduler())
1167     return createSIMachineScheduler(C);
1168 
1169   if (EnableMaxIlpSchedStrategy)
1170     return createGCNMaxILPMachineScheduler(C);
1171 
1172   return createGCNMaxOccupancyMachineScheduler(C);
1173 }
1174 
1175 bool GCNPassConfig::addPreISel() {
1176   AMDGPUPassConfig::addPreISel();
1177 
1178   if (TM->getOptLevel() > CodeGenOptLevel::None)
1179     addPass(createAMDGPULateCodeGenPreparePass());
1180 
1181   if (TM->getOptLevel() > CodeGenOptLevel::None)
1182     addPass(createSinkingPass());
1183 
1184   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1185   // regions formed by them.
1186   addPass(&AMDGPUUnifyDivergentExitNodesID);
1187   if (!LateCFGStructurize) {
1188     if (EnableStructurizerWorkarounds) {
1189       addPass(createFixIrreduciblePass());
1190       addPass(createUnifyLoopExitsPass());
1191     }
1192     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1193   }
1194   addPass(createAMDGPUAnnotateUniformValues());
1195   if (!LateCFGStructurize) {
1196     addPass(createSIAnnotateControlFlowPass());
1197     // TODO: Move this right after structurizeCFG to avoid extra divergence
1198     // analysis. This depends on stopping SIAnnotateControlFlow from making
1199     // control flow modifications.
1200     addPass(createAMDGPURewriteUndefForPHILegacyPass());
1201   }
1202   addPass(createLCSSAPass());
1203 
1204   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1205     addPass(&AMDGPUPerfHintAnalysisID);
1206 
1207   return false;
1208 }
1209 
1210 void GCNPassConfig::addMachineSSAOptimization() {
1211   TargetPassConfig::addMachineSSAOptimization();
1212 
1213   // We want to fold operands after PeepholeOptimizer has run (or as part of
1214   // it), because it will eliminate extra copies making it easier to fold the
1215   // real source operand. We want to eliminate dead instructions after, so that
1216   // we see fewer uses of the copies. We then need to clean up the dead
1217   // instructions leftover after the operands are folded as well.
1218   //
1219   // XXX - Can we get away without running DeadMachineInstructionElim again?
1220   addPass(&SIFoldOperandsID);
1221   if (EnableDPPCombine)
1222     addPass(&GCNDPPCombineID);
1223   addPass(&SILoadStoreOptimizerID);
1224   if (isPassEnabled(EnableSDWAPeephole)) {
1225     addPass(&SIPeepholeSDWAID);
1226     addPass(&EarlyMachineLICMID);
1227     addPass(&MachineCSEID);
1228     addPass(&SIFoldOperandsID);
1229   }
1230   addPass(&DeadMachineInstructionElimID);
1231   addPass(createSIShrinkInstructionsPass());
1232 }
1233 
1234 bool GCNPassConfig::addILPOpts() {
1235   if (EnableEarlyIfConversion)
1236     addPass(&EarlyIfConverterID);
1237 
1238   TargetPassConfig::addILPOpts();
1239   return false;
1240 }
1241 
1242 bool GCNPassConfig::addInstSelector() {
1243   AMDGPUPassConfig::addInstSelector();
1244   addPass(&SIFixSGPRCopiesID);
1245   addPass(createSILowerI1CopiesPass());
1246   return false;
1247 }
1248 
1249 bool GCNPassConfig::addIRTranslator() {
1250   addPass(new IRTranslator(getOptLevel()));
1251   return false;
1252 }
1253 
1254 void GCNPassConfig::addPreLegalizeMachineIR() {
1255   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1256   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1257   addPass(new Localizer());
1258 }
1259 
1260 bool GCNPassConfig::addLegalizeMachineIR() {
1261   addPass(new Legalizer());
1262   return false;
1263 }
1264 
1265 void GCNPassConfig::addPreRegBankSelect() {
1266   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1267   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1268   addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
1269 }
1270 
1271 bool GCNPassConfig::addRegBankSelect() {
1272   addPass(new AMDGPURegBankSelect());
1273   return false;
1274 }
1275 
1276 void GCNPassConfig::addPreGlobalInstructionSelect() {
1277   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1278   addPass(createAMDGPURegBankCombiner(IsOptNone));
1279 }
1280 
1281 bool GCNPassConfig::addGlobalInstructionSelect() {
1282   addPass(new InstructionSelect(getOptLevel()));
1283   return false;
1284 }
1285 
1286 void GCNPassConfig::addPreRegAlloc() {
1287   if (LateCFGStructurize) {
1288     addPass(createAMDGPUMachineCFGStructurizerPass());
1289   }
1290 }
1291 
1292 void GCNPassConfig::addFastRegAlloc() {
1293   // FIXME: We have to disable the verifier here because of PHIElimination +
1294   // TwoAddressInstructions disabling it.
1295 
1296   // This must be run immediately after phi elimination and before
1297   // TwoAddressInstructions, otherwise the processing of the tied operand of
1298   // SI_ELSE will introduce a copy of the tied operand source after the else.
1299   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1300 
1301   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1302 
1303   TargetPassConfig::addFastRegAlloc();
1304 }
1305 
1306 void GCNPassConfig::addOptimizedRegAlloc() {
1307   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1308   // instructions that cause scheduling barriers.
1309   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1310 
1311   if (OptExecMaskPreRA)
1312     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1313 
1314   if (EnableRewritePartialRegUses)
1315     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1316 
1317   if (isPassEnabled(EnablePreRAOptimizations))
1318     insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1319 
1320   // This is not an essential optimization and it has a noticeable impact on
1321   // compilation time, so we only enable it from O2.
1322   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1323     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1324 
1325   // FIXME: When an instruction has a killed operand and the instruction is
1326   // inside a bundle, it seems only the BUNDLE instruction appears as the kill
1327   // of the register in LiveVariables. This triggers a verifier failure, so we
1328   // should fix it and enable the verifier.
1329   if (OptVGPRLiveRange)
1330     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1331   // This must be run immediately after phi elimination and before
1332   // TwoAddressInstructions, otherwise the processing of the tied operand of
1333   // SI_ELSE will introduce a copy of the tied operand source after the else.
1334   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1335 
1336   if (EnableDCEInRA)
1337     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1338 
1339   TargetPassConfig::addOptimizedRegAlloc();
1340 }
1341 
1342 bool GCNPassConfig::addPreRewrite() {
1343   addPass(&SILowerWWMCopiesID);
1344   if (EnableRegReassign)
1345     addPass(&GCNNSAReassignID);
1346   return true;
1347 }
1348 
1349 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1350   // Initialize the global default.
1351   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1352                   initializeDefaultSGPRRegisterAllocatorOnce);
1353 
1354   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1355   if (Ctor != useDefaultRegisterAllocator)
1356     return Ctor();
1357 
1358   if (Optimized)
1359     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1360 
1361   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1362 }
1363 
1364 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1365   // Initialize the global default.
1366   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1367                   initializeDefaultVGPRRegisterAllocatorOnce);
1368 
1369   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1370   if (Ctor != useDefaultRegisterAllocator)
1371     return Ctor();
1372 
1373   if (Optimized)
1374     return createGreedyVGPRRegisterAllocator();
1375 
1376   return createFastVGPRRegisterAllocator();
1377 }
1378 
1379 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1380   llvm_unreachable("should not be used");
1381 }
1382 
1383 static const char RegAllocOptNotSupportedMessage[] =
1384   "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1385 
1386 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1387   if (!usingDefaultRegAlloc())
1388     report_fatal_error(RegAllocOptNotSupportedMessage);
1389 
1390   addPass(&GCNPreRALongBranchRegID);
1391 
1392   addPass(createSGPRAllocPass(false));
1393 
1394   // Equivalent of PEI for SGPRs.
1395   addPass(&SILowerSGPRSpillsID);
1396   addPass(&SIPreAllocateWWMRegsID);
1397 
1398   addPass(createVGPRAllocPass(false));
1399 
1400   addPass(&SILowerWWMCopiesID);
1401   return true;
1402 }
1403 
1404 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1405   if (!usingDefaultRegAlloc())
1406     report_fatal_error(RegAllocOptNotSupportedMessage);
1407 
1408   addPass(&GCNPreRALongBranchRegID);
1409 
1410   addPass(createSGPRAllocPass(true));
1411 
1412   // Commit allocated register changes. This is mostly necessary because too
1413   // many things rely on the use lists of the physical registers, such as the
1414   // verifier. This is only necessary with allocators which use LiveIntervals,
1415   // since FastRegAlloc does the replacements itself.
1416   addPass(createVirtRegRewriter(false));
1417 
1418   // Equivalent of PEI for SGPRs.
1419   addPass(&SILowerSGPRSpillsID);
1420   addPass(&SIPreAllocateWWMRegsID);
1421 
1422   addPass(createVGPRAllocPass(true));
1423 
1424   addPreRewrite();
1425   addPass(&VirtRegRewriterID);
1426 
1427   return true;
1428 }
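// Descriptive note (derived from the code above, not an upstream comment):
// both register-assignment paths allocate SGPRs first, run SILowerSGPRSpills
// as the SGPR equivalent of prologue/epilogue insertion, pre-allocate
// whole-wave-mode registers with SIPreAllocateWWMRegs, and only then allocate
// VGPRs. In the optimized path the WWM copy lowering is added via
// addPreRewrite() before the final VirtRegRewriter; the fast path adds
// SILowerWWMCopies directly after VGPR allocation.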
1429 
1430 void GCNPassConfig::addPostRegAlloc() {
1431   addPass(&SIFixVGPRCopiesID);
1432   if (getOptLevel() > CodeGenOptLevel::None)
1433     addPass(&SIOptimizeExecMaskingID);
1434   TargetPassConfig::addPostRegAlloc();
1435 }
1436 
1437 void GCNPassConfig::addPreSched2() {
1438   if (TM->getOptLevel() > CodeGenOptLevel::None)
1439     addPass(createSIShrinkInstructionsPass());
1440   addPass(&SIPostRABundlerID);
1441 }
1442 
1443 void GCNPassConfig::addPreEmitPass() {
1444   if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1445     addPass(&GCNCreateVOPDID);
1446   addPass(createSIMemoryLegalizerPass());
1447   addPass(createSIInsertWaitcntsPass());
1448 
1449   addPass(createSIModeRegisterPass());
1450 
1451   if (getOptLevel() > CodeGenOptLevel::None)
1452     addPass(&SIInsertHardClausesID);
1453 
1454   addPass(&SILateBranchLoweringPassID);
1455   if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
1456     addPass(createAMDGPUSetWavePriorityPass());
1457   if (getOptLevel() > CodeGenOptLevel::None)
1458     addPass(&SIPreEmitPeepholeID);
1459   // The hazard recognizer that runs as part of the post-RA scheduler is not
1460   // guaranteed to handle all hazards correctly. This is because when there
1461   // are multiple scheduling regions in a basic block, the regions are scheduled
1462   // bottom up, so when we begin to schedule a region we do not know which
1463   // instructions were emitted directly before it.
1464   //
1465   // Here we add a stand-alone hazard recognizer pass which can handle all
1466   // cases.
1467   addPass(&PostRAHazardRecognizerID);
1468 
1469   if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
1470     addPass(&AMDGPUInsertSingleUseVDSTID);
1471 
1472   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
1473     addPass(&AMDGPUInsertDelayAluID);
1474 
1475   addPass(&BranchRelaxationPassID);
1476 }
1477 
1478 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1479   return new GCNPassConfig(*this, PM);
1480 }
1481 
1482 void GCNTargetMachine::registerMachineRegisterInfoCallback(
1483     MachineFunction &MF) const {
1484   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1485   MF.getRegInfo().addDelegate(MFI);
1486 }
1487 
1488 MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1489     BumpPtrAllocator &Allocator, const Function &F,
1490     const TargetSubtargetInfo *STI) const {
1491   return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1492       Allocator, F, static_cast<const GCNSubtarget *>(STI));
1493 }
1494 
1495 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1496   return new yaml::SIMachineFunctionInfo();
1497 }
1498 
1499 yaml::MachineFunctionInfo *
1500 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1501   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1502   return new yaml::SIMachineFunctionInfo(
1503       *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1504 }
1505 
1506 bool GCNTargetMachine::parseMachineFunctionInfo(
1507     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1508     SMDiagnostic &Error, SMRange &SourceRange) const {
1509   const yaml::SIMachineFunctionInfo &YamlMFI =
1510       static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1511   MachineFunction &MF = PFS.MF;
1512   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1513   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1514 
1515   if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1516     return true;
1517 
1518   if (MFI->Occupancy == 0) {
1519     // Fix up the subtarget-dependent default value.
1520     MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1521   }
1522 
1523   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1524     Register TempReg;
1525     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1526       SourceRange = RegName.SourceRange;
1527       return true;
1528     }
1529     RegVal = TempReg;
1530 
1531     return false;
1532   };
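// Illustrative input (assumed syntax): parseNamedRegisterReference() consumes
// MIR-style register names, so a field handled by parseRegister() above would
// typically appear in the YAML as, e.g.,
//
//   stackPtrOffsetReg: '$sgpr32'
//
// Field and register names are examples only; the authoritative keys are
// defined by yaml::SIMachineFunctionInfo.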
1533 
1534   auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1535                                    Register &RegVal) {
1536     return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1537   };
1538 
1539   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1540     return true;
1541 
1542   if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1543     return true;
1544 
1545   if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1546                             MFI->LongBranchReservedReg))
1547     return true;
1548 
1549   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1550     // Create a diagnostic for the register string literal.
1551     const MemoryBuffer &Buffer =
1552         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1553     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1554                          RegName.Value.size(), SourceMgr::DK_Error,
1555                          "incorrect register class for field", RegName.Value,
1556                          std::nullopt, std::nullopt);
1557     SourceRange = RegName.SourceRange;
1558     return true;
1559   };
1560 
1561   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1562       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1563       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1564     return true;
1565 
1566   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1567       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1568     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1569   }
1570 
1571   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1572       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1573     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1574   }
1575 
1576   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1577       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1578     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1579   }
1580 
1581   for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1582     Register ParsedReg;
1583     if (parseRegister(YamlReg, ParsedReg))
1584       return true;
1585 
1586     MFI->reserveWWMRegister(ParsedReg);
1587   }
1588 
1589   auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1590                                    const TargetRegisterClass &RC,
1591                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1592                                    unsigned SystemSGPRs) {
1593     // Skip parsing if it's not present.
1594     if (!A)
1595       return false;
1596 
1597     if (A->IsRegister) {
1598       Register Reg;
1599       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1600         SourceRange = A->RegisterName.SourceRange;
1601         return true;
1602       }
1603       if (!RC.contains(Reg))
1604         return diagnoseRegisterClass(A->RegisterName);
1605       Arg = ArgDescriptor::createRegister(Reg);
1606     } else
1607       Arg = ArgDescriptor::createStack(A->StackOffset);
1608     // Check and apply the optional mask.
1609     if (A->Mask)
1610       Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1611 
1612     MFI->NumUserSGPRs += UserSGPRs;
1613     MFI->NumSystemSGPRs += SystemSGPRs;
1614     return false;
1615   };
1616 
1617   if (YamlMFI.ArgInfo &&
1618       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1619                              AMDGPU::SGPR_128RegClass,
1620                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1621        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1622                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1623                              2, 0) ||
1624        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1625                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1626        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1627                              AMDGPU::SReg_64RegClass,
1628                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1629        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1630                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1631                              2, 0) ||
1632        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1633                              AMDGPU::SReg_64RegClass,
1634                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1635        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1636                              AMDGPU::SGPR_32RegClass,
1637                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1638        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1639                              AMDGPU::SGPR_32RegClass,
1640                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
1641        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1642                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1643                              0, 1) ||
1644        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1645                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1646                              0, 1) ||
1647        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1648                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1649                              0, 1) ||
1650        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1651                              AMDGPU::SGPR_32RegClass,
1652                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1653        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1654                              AMDGPU::SGPR_32RegClass,
1655                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1656        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1657                              AMDGPU::SReg_64RegClass,
1658                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1659        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1660                              AMDGPU::SReg_64RegClass,
1661                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1662        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1663                              AMDGPU::VGPR_32RegClass,
1664                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1665        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1666                              AMDGPU::VGPR_32RegClass,
1667                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1668        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1669                              AMDGPU::VGPR_32RegClass,
1670                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1671     return true;
1672 
1673   if (ST.hasIEEEMode())
1674     MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1675   if (ST.hasDX10ClampMode())
1676     MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1677 
1678   // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1679   MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1680                                       ? DenormalMode::IEEE
1681                                       : DenormalMode::PreserveSign;
1682   MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1683                                        ? DenormalMode::IEEE
1684                                        : DenormalMode::PreserveSign;
1685 
1686   MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1687                                           ? DenormalMode::IEEE
1688                                           : DenormalMode::PreserveSign;
1689   MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1690                                            ? DenormalMode::IEEE
1691                                            : DenormalMode::PreserveSign;
1692 
1693   return false;
1694 }
1695
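// Illustrative MIR input (assumed field names, for orientation only): the
// parser above consumes the machineFunctionInfo block of a .mir file, roughly
// of the form
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//     argumentInfo:
//       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
//       workItemIDX:          { reg: '$vgpr0' }
//     mode:
//       ieee: true
//
// Exact key names are defined by yaml::SIMachineFunctionInfo and may differ;
// treat this as a sketch rather than a schema.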