1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware-specific
11 /// information needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCtorDtorLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUIGroupLP.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPUTargetObjectFile.h"
23 #include "AMDGPUTargetTransformInfo.h"
24 #include "GCNIterativeScheduler.h"
25 #include "GCNSchedStrategy.h"
26 #include "GCNVOPDUtils.h"
27 #include "R600.h"
28 #include "R600MachineFunctionInfo.h"
29 #include "R600TargetMachine.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIMachineScheduler.h"
32 #include "TargetInfo/AMDGPUTargetInfo.h"
33 #include "Utils/AMDGPUBaseInfo.h"
34 #include "llvm/Analysis/CGSCCPassManager.h"
35 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
36 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
37 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
38 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
39 #include "llvm/CodeGen/GlobalISel/Localizer.h"
40 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
41 #include "llvm/CodeGen/MIRParser/MIParser.h"
42 #include "llvm/CodeGen/Passes.h"
43 #include "llvm/CodeGen/RegAllocRegistry.h"
44 #include "llvm/CodeGen/TargetPassConfig.h"
45 #include "llvm/IR/IntrinsicsAMDGPU.h"
46 #include "llvm/IR/LegacyPassManager.h"
47 #include "llvm/IR/PassManager.h"
48 #include "llvm/IR/PatternMatch.h"
49 #include "llvm/InitializePasses.h"
50 #include "llvm/MC/TargetRegistry.h"
51 #include "llvm/Passes/PassBuilder.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/IPO/AlwaysInliner.h"
54 #include "llvm/Transforms/IPO/GlobalDCE.h"
55 #include "llvm/Transforms/IPO/Internalize.h"
56 #include "llvm/Transforms/Scalar.h"
57 #include "llvm/Transforms/Scalar/GVN.h"
58 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
59 #include "llvm/Transforms/Utils.h"
60 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
61 #include "llvm/Transforms/Vectorize.h"
62 #include <optional>
63 
64 using namespace llvm;
65 using namespace llvm::PatternMatch;
66 
67 namespace {
68 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
69 public:
70   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
71     : RegisterRegAllocBase(N, D, C) {}
72 };
73 
74 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
75 public:
76   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
77     : RegisterRegAllocBase(N, D, C) {}
78 };
79 
80 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
81                               const TargetRegisterClass &RC) {
82   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
83 }
84 
85 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
86                               const TargetRegisterClass &RC) {
87   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
88 }
89 
90 
91 /// -{sgpr|vgpr}-regalloc=... command line option.
92 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
93 
94 /// A dummy default pass factory indicates whether the register allocator is
95 /// overridden on the command line.
96 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
97 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
98 
99 static SGPRRegisterRegAlloc
100 defaultSGPRRegAlloc("default",
101                     "pick SGPR register allocator based on -O option",
102                     useDefaultRegisterAllocator);
103 
104 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
105                RegisterPassParser<SGPRRegisterRegAlloc>>
106 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
107              cl::desc("Register allocator to use for SGPRs"));
108 
109 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
110                RegisterPassParser<VGPRRegisterRegAlloc>>
111 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
112              cl::desc("Register allocator to use for VGPRs"));
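// Illustrative usage (hedged; the exact invocation depends on the build): the
// two allocators can be selected independently via the options declared above,
// e.g. llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast in.ll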
113 
114 
115 static void initializeDefaultSGPRRegisterAllocatorOnce() {
116   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
117 
118   if (!Ctor) {
119     Ctor = SGPRRegAlloc;
120     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
121   }
122 }
123 
124 static void initializeDefaultVGPRRegisterAllocatorOnce() {
125   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
126 
127   if (!Ctor) {
128     Ctor = VGPRRegAlloc;
129     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
130   }
131 }
132 
133 static FunctionPass *createBasicSGPRRegisterAllocator() {
134   return createBasicRegisterAllocator(onlyAllocateSGPRs);
135 }
136 
137 static FunctionPass *createGreedySGPRRegisterAllocator() {
138   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
139 }
140 
141 static FunctionPass *createFastSGPRRegisterAllocator() {
142   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
143 }
144 
145 static FunctionPass *createBasicVGPRRegisterAllocator() {
146   return createBasicRegisterAllocator(onlyAllocateVGPRs);
147 }
148 
149 static FunctionPass *createGreedyVGPRRegisterAllocator() {
150   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
151 }
152 
153 static FunctionPass *createFastVGPRRegisterAllocator() {
154   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
155 }
156 
157 static SGPRRegisterRegAlloc basicRegAllocSGPR(
158   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
159 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
160   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
161 
162 static SGPRRegisterRegAlloc fastRegAllocSGPR(
163   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
164 
165 
166 static VGPRRegisterRegAlloc basicRegAllocVGPR(
167   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
168 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
169   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
170 
171 static VGPRRegisterRegAlloc fastRegAllocVGPR(
172   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
173 }
174 
175 static cl::opt<bool> EnableSROA(
176   "amdgpu-sroa",
177   cl::desc("Run SROA after promote alloca pass"),
178   cl::ReallyHidden,
179   cl::init(true));
180 
181 static cl::opt<bool>
182 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
183                         cl::desc("Run early if-conversion"),
184                         cl::init(false));
185 
186 static cl::opt<bool>
187 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
188             cl::desc("Run pre-RA exec mask optimizations"),
189             cl::init(true));
190 
191 // Option to disable vectorizer for tests.
192 static cl::opt<bool> EnableLoadStoreVectorizer(
193   "amdgpu-load-store-vectorizer",
194   cl::desc("Enable load store vectorizer"),
195   cl::init(true),
196   cl::Hidden);
197 
198 // Option to control scalarization of global loads
199 static cl::opt<bool> ScalarizeGlobal(
200   "amdgpu-scalarize-global-loads",
201   cl::desc("Enable global load scalarization"),
202   cl::init(true),
203   cl::Hidden);
204 
205 // Option to run internalize pass.
206 static cl::opt<bool> InternalizeSymbols(
207   "amdgpu-internalize-symbols",
208   cl::desc("Enable elimination of non-kernel functions and unused globals"),
209   cl::init(false),
210   cl::Hidden);
211 
212 // Option to inline all early.
213 static cl::opt<bool> EarlyInlineAll(
214   "amdgpu-early-inline-all",
215   cl::desc("Inline all functions early"),
216   cl::init(false),
217   cl::Hidden);
218 
219 static cl::opt<bool> EnableSDWAPeephole(
220   "amdgpu-sdwa-peephole",
221   cl::desc("Enable SDWA peepholer"),
222   cl::init(true));
223 
224 static cl::opt<bool> EnableDPPCombine(
225   "amdgpu-dpp-combine",
226   cl::desc("Enable DPP combiner"),
227   cl::init(true));
228 
229 // Enable address space based alias analysis
230 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
231   cl::desc("Enable AMDGPU Alias Analysis"),
232   cl::init(true));
233 
234 // Option to run late CFG structurizer
235 static cl::opt<bool, true> LateCFGStructurize(
236   "amdgpu-late-structurize",
237   cl::desc("Enable late CFG structurization"),
238   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
239   cl::Hidden);
240 
241 // Enable libcall simplifications
242 static cl::opt<bool> EnableLibCallSimplify(
243   "amdgpu-simplify-libcall",
244   cl::desc("Enable amdgpu library simplifications"),
245   cl::init(true),
246   cl::Hidden);
247 
248 static cl::opt<bool> EnableLowerKernelArguments(
249   "amdgpu-ir-lower-kernel-arguments",
250   cl::desc("Lower kernel argument loads in IR pass"),
251   cl::init(true),
252   cl::Hidden);
253 
254 static cl::opt<bool> EnableRegReassign(
255   "amdgpu-reassign-regs",
256   cl::desc("Enable register reassign optimizations on gfx10+"),
257   cl::init(true),
258   cl::Hidden);
259 
260 static cl::opt<bool> OptVGPRLiveRange(
261     "amdgpu-opt-vgpr-liverange",
262     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
263     cl::init(true), cl::Hidden);
264 
265 // Enable atomic optimization
266 static cl::opt<bool> EnableAtomicOptimizations(
267   "amdgpu-atomic-optimizations",
268   cl::desc("Enable atomic optimizations"),
269   cl::init(false),
270   cl::Hidden);
271 
272 // Enable Mode register optimization
273 static cl::opt<bool> EnableSIModeRegisterPass(
274   "amdgpu-mode-register",
275   cl::desc("Enable mode register pass"),
276   cl::init(true),
277   cl::Hidden);
278 
279 // Enable GFX11+ s_delay_alu insertion
280 static cl::opt<bool>
281     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
282                          cl::desc("Enable s_delay_alu insertion"),
283                          cl::init(true), cl::Hidden);
284 
285 // Enable GFX11+ VOPD
286 static cl::opt<bool>
287     EnableVOPD("amdgpu-enable-vopd",
288                cl::desc("Enable VOPD, dual issue of VALU in wave32"),
289                cl::init(true), cl::Hidden);
290 
291 // Option used in lit tests to prevent dead-code elimination of the inspected patterns.
292 static cl::opt<bool>
293 EnableDCEInRA("amdgpu-dce-in-ra",
294     cl::init(true), cl::Hidden,
295     cl::desc("Enable machine DCE inside regalloc"));
296 
297 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
298                                            cl::desc("Adjust wave priority"),
299                                            cl::init(false), cl::Hidden);
300 
301 static cl::opt<bool> EnableScalarIRPasses(
302   "amdgpu-scalar-ir-passes",
303   cl::desc("Enable scalar IR passes"),
304   cl::init(true),
305   cl::Hidden);
306 
307 static cl::opt<bool> EnableStructurizerWorkarounds(
308     "amdgpu-enable-structurizer-workarounds",
309     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
310     cl::Hidden);
311 
312 static cl::opt<bool> EnableLDSReplaceWithPointer(
313     "amdgpu-enable-lds-replace-with-pointer",
314     cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
315     cl::Hidden);
316 
317 static cl::opt<bool, true> EnableLowerModuleLDS(
318     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
319     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
320     cl::Hidden);
321 
322 static cl::opt<bool> EnablePreRAOptimizations(
323     "amdgpu-enable-pre-ra-optimizations",
324     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
325     cl::Hidden);
326 
327 static cl::opt<bool> EnablePromoteKernelArguments(
328     "amdgpu-enable-promote-kernel-arguments",
329     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
330     cl::Hidden, cl::init(true));
331 
332 static cl::opt<bool> EnableMaxIlpSchedStrategy(
333     "amdgpu-enable-max-ilp-scheduling-strategy",
334     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
335     cl::Hidden, cl::init(false));
336 
337 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
338   // Register the target
339   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
340   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
341 
342   PassRegistry *PR = PassRegistry::getPassRegistry();
343   initializeR600ClauseMergePassPass(*PR);
344   initializeR600ControlFlowFinalizerPass(*PR);
345   initializeR600PacketizerPass(*PR);
346   initializeR600ExpandSpecialInstrsPassPass(*PR);
347   initializeR600VectorRegMergerPass(*PR);
348   initializeGlobalISel(*PR);
349   initializeAMDGPUDAGToDAGISelPass(*PR);
350   initializeGCNDPPCombinePass(*PR);
351   initializeSILowerI1CopiesPass(*PR);
352   initializeSILowerSGPRSpillsPass(*PR);
353   initializeSIFixSGPRCopiesPass(*PR);
354   initializeSIFixVGPRCopiesPass(*PR);
355   initializeSIFoldOperandsPass(*PR);
356   initializeSIPeepholeSDWAPass(*PR);
357   initializeSIShrinkInstructionsPass(*PR);
358   initializeSIOptimizeExecMaskingPreRAPass(*PR);
359   initializeSIOptimizeVGPRLiveRangePass(*PR);
360   initializeSILoadStoreOptimizerPass(*PR);
361   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
362   initializeAMDGPUAlwaysInlinePass(*PR);
363   initializeAMDGPUAttributorPass(*PR);
364   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
365   initializeAMDGPUAnnotateUniformValuesPass(*PR);
366   initializeAMDGPUArgumentUsageInfoPass(*PR);
367   initializeAMDGPUAtomicOptimizerPass(*PR);
368   initializeAMDGPULowerKernelArgumentsPass(*PR);
369   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
370   initializeAMDGPULowerKernelAttributesPass(*PR);
371   initializeAMDGPULowerIntrinsicsPass(*PR);
372   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
373   initializeAMDGPUPostLegalizerCombinerPass(*PR);
374   initializeAMDGPUPreLegalizerCombinerPass(*PR);
375   initializeAMDGPURegBankCombinerPass(*PR);
376   initializeAMDGPUPromoteAllocaPass(*PR);
377   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
378   initializeAMDGPUCodeGenPreparePass(*PR);
379   initializeAMDGPULateCodeGenPreparePass(*PR);
380   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
381   initializeAMDGPUPropagateAttributesLatePass(*PR);
382   initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
383   initializeAMDGPULowerModuleLDSPass(*PR);
384   initializeAMDGPURewriteOutArgumentsPass(*PR);
385   initializeAMDGPURewriteUndefForPHIPass(*PR);
386   initializeAMDGPUUnifyMetadataPass(*PR);
387   initializeSIAnnotateControlFlowPass(*PR);
388   initializeAMDGPUReleaseVGPRsPass(*PR);
389   initializeAMDGPUInsertDelayAluPass(*PR);
390   initializeSIInsertHardClausesPass(*PR);
391   initializeSIInsertWaitcntsPass(*PR);
392   initializeSIModeRegisterPass(*PR);
393   initializeSIWholeQuadModePass(*PR);
394   initializeSILowerControlFlowPass(*PR);
395   initializeSIPreEmitPeepholePass(*PR);
396   initializeSILateBranchLoweringPass(*PR);
397   initializeSIMemoryLegalizerPass(*PR);
398   initializeSIOptimizeExecMaskingPass(*PR);
399   initializeSIPreAllocateWWMRegsPass(*PR);
400   initializeSIFormMemoryClausesPass(*PR);
401   initializeSIPostRABundlerPass(*PR);
402   initializeGCNCreateVOPDPass(*PR);
403   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
404   initializeAMDGPUAAWrapperPassPass(*PR);
405   initializeAMDGPUExternalAAWrapperPass(*PR);
406   initializeAMDGPUUseNativeCallsPass(*PR);
407   initializeAMDGPUSimplifyLibCallsPass(*PR);
408   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
409   initializeAMDGPUResourceUsageAnalysisPass(*PR);
410   initializeGCNNSAReassignPass(*PR);
411   initializeGCNPreRAOptimizationsPass(*PR);
412 }
413 
414 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
415   return std::make_unique<AMDGPUTargetObjectFile>();
416 }
417 
418 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
419   return new SIScheduleDAGMI(C);
420 }
421 
422 static ScheduleDAGInstrs *
423 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
424   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
425   ScheduleDAGMILive *DAG =
426     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
427   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
428   if (ST.shouldClusterStores())
429     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
430   DAG->addMutation(createIGroupLPDAGMutation());
431   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
432   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
433   return DAG;
434 }
435 
436 static ScheduleDAGInstrs *
437 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
438   ScheduleDAGMILive *DAG =
439       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
440   DAG->addMutation(createIGroupLPDAGMutation());
441   return DAG;
442 }
443 
444 static ScheduleDAGInstrs *
445 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
446   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
447   auto DAG = new GCNIterativeScheduler(C,
448     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
449   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
450   if (ST.shouldClusterStores())
451     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
452   return DAG;
453 }
454 
455 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
456   return new GCNIterativeScheduler(C,
457     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
458 }
459 
460 static ScheduleDAGInstrs *
461 createIterativeILPMachineScheduler(MachineSchedContext *C) {
462   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
463   auto DAG = new GCNIterativeScheduler(C,
464     GCNIterativeScheduler::SCHEDULE_ILP);
465   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
466   if (ST.shouldClusterStores())
467     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
468   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
469   return DAG;
470 }
471 
472 static MachineSchedRegistry
473 SISchedRegistry("si", "Run SI's custom scheduler",
474                 createSIMachineScheduler);
475 
476 static MachineSchedRegistry
477 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
478                              "Run GCN scheduler to maximize occupancy",
479                              createGCNMaxOccupancyMachineScheduler);
480 
481 static MachineSchedRegistry
482     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
483                            createGCNMaxILPMachineScheduler);
484 
485 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
486     "gcn-iterative-max-occupancy-experimental",
487     "Run GCN scheduler to maximize occupancy (experimental)",
488     createIterativeGCNMaxOccupancyMachineScheduler);
489 
490 static MachineSchedRegistry GCNMinRegSchedRegistry(
491     "gcn-iterative-minreg",
492     "Run GCN iterative scheduler for minimal register usage (experimental)",
493     createMinRegScheduler);
494 
495 static MachineSchedRegistry GCNILPSchedRegistry(
496     "gcn-iterative-ilp",
497     "Run GCN iterative scheduler for ILP scheduling (experimental)",
498     createIterativeILPMachineScheduler);
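// Illustrative usage (hedged): the scheduler names registered above plug into
// the generic machine-scheduler selection option, e.g.
//   llc -mtriple=amdgcn -misched=gcn-max-ilp in.ll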
499 
500 static StringRef computeDataLayout(const Triple &TT) {
501   if (TT.getArch() == Triple::r600) {
502     // 32-bit pointers.
503     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
504            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
505   }
506 
507   // 32-bit private, local, and region pointers. 64-bit global, constant and
508   // flat pointers. Buffer fat pointers are non-integral (ni:7 below).
509   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
510          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
511          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
512          "-ni:7";
513 }
514 
515 LLVM_READNONE
516 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
517   if (!GPU.empty())
518     return GPU;
519 
520   // Need to default to a target with flat support for HSA.
521   if (TT.getArch() == Triple::amdgcn)
522     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
523 
524   return "r600";
525 }
526 
527 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
528   // The AMDGPU toolchain only supports generating shared objects, so we
529   // must always use PIC.
530   return Reloc::PIC_;
531 }
532 
533 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
534                                          StringRef CPU, StringRef FS,
535                                          TargetOptions Options,
536                                          std::optional<Reloc::Model> RM,
537                                          std::optional<CodeModel::Model> CM,
538                                          CodeGenOpt::Level OptLevel)
539     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
540                         FS, Options, getEffectiveRelocModel(RM),
541                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
542       TLOF(createTLOF(getTargetTriple())) {
543   initAsmInfo();
544   if (TT.getArch() == Triple::amdgcn) {
545     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
546       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
547     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
548       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
549   }
550 }
551 
552 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
553 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
554 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
555 
556 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
557 
558 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
559   Attribute GPUAttr = F.getFnAttribute("target-cpu");
560   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
561 }
562 
563 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
564   Attribute FSAttr = F.getFnAttribute("target-features");
565 
566   return FSAttr.isValid() ? FSAttr.getValueAsString()
567                           : getTargetFeatureString();
568 }
569 
570 /// Predicate for Internalize pass.
571 static bool mustPreserveGV(const GlobalValue &GV) {
572   if (const Function *F = dyn_cast<Function>(&GV))
573     return F->isDeclaration() || F->getName().startswith("__asan_") ||
574            F->getName().startswith("__sanitizer_") ||
575            AMDGPU::isEntryFunctionCC(F->getCallingConv());
576 
577   GV.removeDeadConstantUsers();
578   return !GV.use_empty();
579 }
580 
581 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
582   AAM.registerFunctionAnalysis<AMDGPUAA>();
583 }
584 
585 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
586   PB.registerPipelineParsingCallback(
587       [this](StringRef PassName, ModulePassManager &PM,
588              ArrayRef<PassBuilder::PipelineElement>) {
589         if (PassName == "amdgpu-propagate-attributes-late") {
590           PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
591           return true;
592         }
593         if (PassName == "amdgpu-unify-metadata") {
594           PM.addPass(AMDGPUUnifyMetadataPass());
595           return true;
596         }
597         if (PassName == "amdgpu-printf-runtime-binding") {
598           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
599           return true;
600         }
601         if (PassName == "amdgpu-always-inline") {
602           PM.addPass(AMDGPUAlwaysInlinePass());
603           return true;
604         }
605         if (PassName == "amdgpu-replace-lds-use-with-pointer") {
606           PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
607           return true;
608         }
609         if (PassName == "amdgpu-lower-module-lds") {
610           PM.addPass(AMDGPULowerModuleLDSPass());
611           return true;
612         }
613         if (PassName == "amdgpu-lower-ctor-dtor") {
614           PM.addPass(AMDGPUCtorDtorLoweringPass());
615           return true;
616         }
617         return false;
618       });
619   PB.registerPipelineParsingCallback(
620       [this](StringRef PassName, FunctionPassManager &PM,
621              ArrayRef<PassBuilder::PipelineElement>) {
622         if (PassName == "amdgpu-simplifylib") {
623           PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
624           return true;
625         }
626         if (PassName == "amdgpu-usenative") {
627           PM.addPass(AMDGPUUseNativeCallsPass());
628           return true;
629         }
630         if (PassName == "amdgpu-promote-alloca") {
631           PM.addPass(AMDGPUPromoteAllocaPass(*this));
632           return true;
633         }
634         if (PassName == "amdgpu-promote-alloca-to-vector") {
635           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
636           return true;
637         }
638         if (PassName == "amdgpu-lower-kernel-attributes") {
639           PM.addPass(AMDGPULowerKernelAttributesPass());
640           return true;
641         }
642         if (PassName == "amdgpu-propagate-attributes-early") {
643           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
644           return true;
645         }
646         if (PassName == "amdgpu-promote-kernel-arguments") {
647           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
648           return true;
649         }
650         return false;
651       });
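  // Illustrative usage (hedged): the names registered above become available
  // in textual new-PM pipelines, e.g.
  //   opt -passes='amdgpu-promote-alloca,amdgpu-lower-kernel-attributes' in.ll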
652 
653   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
654     FAM.registerPass([&] { return AMDGPUAA(); });
655   });
656 
657   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
658     if (AAName == "amdgpu-aa") {
659       AAM.registerFunctionAnalysis<AMDGPUAA>();
660       return true;
661     }
662     return false;
663   });
664 
665   PB.registerPipelineStartEPCallback(
666       [this](ModulePassManager &PM, OptimizationLevel Level) {
667         FunctionPassManager FPM;
668         FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
669         FPM.addPass(AMDGPUUseNativeCallsPass());
670         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
671           FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
672         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
673       });
674 
675   PB.registerPipelineEarlySimplificationEPCallback(
676       [this](ModulePassManager &PM, OptimizationLevel Level) {
677         if (Level == OptimizationLevel::O0)
678           return;
679 
680         PM.addPass(AMDGPUUnifyMetadataPass());
681         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
682 
683         if (InternalizeSymbols) {
684           PM.addPass(InternalizePass(mustPreserveGV));
685         }
686         PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
687         if (InternalizeSymbols) {
688           PM.addPass(GlobalDCEPass());
689         }
690         if (EarlyInlineAll && !EnableFunctionCalls)
691           PM.addPass(AMDGPUAlwaysInlinePass());
692       });
693 
694   PB.registerCGSCCOptimizerLateEPCallback(
695       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
696         if (Level == OptimizationLevel::O0)
697           return;
698 
699         FunctionPassManager FPM;
700 
701         // Add promote kernel arguments pass to the opt pipeline right before
702         // infer address spaces which is needed to do actual address space
703         // rewriting.
704         if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
705             EnablePromoteKernelArguments)
706           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
707 
708         // Add infer address spaces pass to the opt pipeline after inlining
709         // but before SROA to increase SROA opportunities.
710         FPM.addPass(InferAddressSpacesPass());
711 
712         // This should run after inlining to have any chance of doing
713         // anything, and before other cleanup optimizations.
714         FPM.addPass(AMDGPULowerKernelAttributesPass());
715 
716         if (Level != OptimizationLevel::O0) {
717           // Promote alloca to vector before SROA and loop unroll. If we
718           // manage to eliminate allocas before unroll we may choose to unroll
719           // less.
720           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
721         }
722 
723         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
724       });
725 }
726 
727 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
728   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
729           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
730           AddrSpace == AMDGPUAS::REGION_ADDRESS)
731              ? -1
732              : 0;
733 }
734 
735 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
736                                               unsigned DestAS) const {
737   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
738          AMDGPU::isFlatGlobalAddrSpace(DestAS);
739 }
740 
741 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
742   const auto *LD = dyn_cast<LoadInst>(V);
743   if (!LD)
744     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
745 
746   // It must be a load of a generic (flat) pointer.
747   assert(V->getType()->isPointerTy() &&
748          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
749 
750   const auto *Ptr = LD->getPointerOperand();
751   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
752     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
753   // A generic pointer loaded from constant memory can be assumed to be a
754   // global pointer, since constant memory is only populated on the host side
755   // and, as implied by the offload programming model, only global pointers
756   // can be referenced on the host side.
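  // Illustrative IR shape (hedged; address-space numbers follow AMDGPUAS):
  //   %p = load ptr, ptr addrspace(4) %arg  ; flat pointer loaded from
  //                                         ; constant memory (addrspace 4)
  // Such a %p is assumed to point into global memory (addrspace 1).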
757   return AMDGPUAS::GLOBAL_ADDRESS;
758 }
759 
760 std::pair<const Value *, unsigned>
761 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
762   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
763     switch (II->getIntrinsicID()) {
764     case Intrinsic::amdgcn_is_shared:
765       return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
766     case Intrinsic::amdgcn_is_private:
767       return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
768     default:
769       break;
770     }
771     return std::pair(nullptr, -1);
772   }
773   // Check the global pointer predication based on
774   // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
775   // and the order of 'is_shared' and 'is_private' is not significant.
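  // Illustrative IR for the matched pattern (hedged; value names are made up):
  //   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %ns = xor i1 %s, true
  //   %pv = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %np = xor i1 %pv, true
  //   %g  = and i1 %ns, %np   ; under %g, %p is predicated to be global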
776   Value *Ptr;
777   if (match(
778           const_cast<Value *>(V),
779           m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
780                   m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
781                       m_Deferred(Ptr))))))
782     return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
783 
784   return std::pair(nullptr, -1);
785 }
786 
787 unsigned
788 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
789   switch (Kind) {
790   case PseudoSourceValue::Stack:
791   case PseudoSourceValue::FixedStack:
792     return AMDGPUAS::PRIVATE_ADDRESS;
793   case PseudoSourceValue::ConstantPool:
794   case PseudoSourceValue::GOT:
795   case PseudoSourceValue::JumpTable:
796   case PseudoSourceValue::GlobalValueCallEntry:
797   case PseudoSourceValue::ExternalSymbolCallEntry:
798     return AMDGPUAS::CONSTANT_ADDRESS;
799   }
800   return AMDGPUAS::FLAT_ADDRESS;
801 }
802 
803 //===----------------------------------------------------------------------===//
804 // GCN Target Machine (SI+)
805 //===----------------------------------------------------------------------===//
806 
807 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
808                                    StringRef CPU, StringRef FS,
809                                    TargetOptions Options,
810                                    std::optional<Reloc::Model> RM,
811                                    std::optional<CodeModel::Model> CM,
812                                    CodeGenOpt::Level OL, bool JIT)
813     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
814 
815 const TargetSubtargetInfo *
816 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
817   StringRef GPU = getGPUName(F);
818   StringRef FS = getFeatureString(F);
819 
820   SmallString<128> SubtargetKey(GPU);
821   SubtargetKey.append(FS);
822 
823   auto &I = SubtargetMap[SubtargetKey];
824   if (!I) {
825     // This needs to be done before we create a new subtarget since any
826     // creation will depend on the TM and the code generation flags on the
827     // function that reside in TargetOptions.
828     resetTargetOptions(F);
829     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
830   }
831 
832   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
833 
834   return I.get();
835 }
836 
837 TargetTransformInfo
838 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
839   return TargetTransformInfo(GCNTTIImpl(this, F));
840 }
841 
842 //===----------------------------------------------------------------------===//
843 // AMDGPU Pass Setup
844 //===----------------------------------------------------------------------===//
845 
846 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
847   return getStandardCSEConfigForOpt(TM->getOptLevel());
848 }
849 
850 namespace {
851 
852 class GCNPassConfig final : public AMDGPUPassConfig {
853 public:
854   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
855     : AMDGPUPassConfig(TM, PM) {
856     // It is necessary to know the register usage of the entire call graph.  We
857     // allow calls without EnableAMDGPUFunctionCalls if they are marked
858     // noinline, so this is always required.
859     setRequiresCodeGenSCCOrder(true);
860     substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
861   }
862 
863   GCNTargetMachine &getGCNTargetMachine() const {
864     return getTM<GCNTargetMachine>();
865   }
866 
867   ScheduleDAGInstrs *
868   createMachineScheduler(MachineSchedContext *C) const override;
869 
870   ScheduleDAGInstrs *
871   createPostMachineScheduler(MachineSchedContext *C) const override {
872     ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
873         C, std::make_unique<PostGenericScheduler>(C),
874         /*RemoveKillFlags=*/true);
875     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
876     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
877     if (ST.shouldClusterStores())
878       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
879     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
880     DAG->addMutation(createIGroupLPDAGMutation());
881     if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
882       DAG->addMutation(createVOPDPairingMutation());
883     return DAG;
884   }
885 
886   bool addPreISel() override;
887   void addMachineSSAOptimization() override;
888   bool addILPOpts() override;
889   bool addInstSelector() override;
890   bool addIRTranslator() override;
891   void addPreLegalizeMachineIR() override;
892   bool addLegalizeMachineIR() override;
893   void addPreRegBankSelect() override;
894   bool addRegBankSelect() override;
895   void addPreGlobalInstructionSelect() override;
896   bool addGlobalInstructionSelect() override;
897   void addFastRegAlloc() override;
898   void addOptimizedRegAlloc() override;
899 
900   FunctionPass *createSGPRAllocPass(bool Optimized);
901   FunctionPass *createVGPRAllocPass(bool Optimized);
902   FunctionPass *createRegAllocPass(bool Optimized) override;
903 
904   bool addRegAssignAndRewriteFast() override;
905   bool addRegAssignAndRewriteOptimized() override;
906 
907   void addPreRegAlloc() override;
908   bool addPreRewrite() override;
909   void addPostRegAlloc() override;
910   void addPreSched2() override;
911   void addPreEmitPass() override;
912 };
913 
914 } // end anonymous namespace
915 
916 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
917     : TargetPassConfig(TM, PM) {
918   // Exceptions and StackMaps are not supported, so these passes will never do
919   // anything.
920   disablePass(&StackMapLivenessID);
921   disablePass(&FuncletLayoutID);
922   // Garbage collection is not supported.
923   disablePass(&GCLoweringID);
924   disablePass(&ShadowStackGCLoweringID);
925 }
926 
927 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
928   if (getOptLevel() == CodeGenOpt::Aggressive)
929     addPass(createGVNPass());
930   else
931     addPass(createEarlyCSEPass());
932 }
933 
934 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
935   addPass(createLICMPass());
936   addPass(createSeparateConstOffsetFromGEPPass());
937   // ReassociateGEPs exposes more opportunities for SLSR. See
938   // the example in reassociate-geps-and-slsr.ll.
939   addPass(createStraightLineStrengthReducePass());
940   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
941   // EarlyCSE can reuse.
942   addEarlyCSEOrGVNPass();
943   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
944   addPass(createNaryReassociatePass());
945   // NaryReassociate on GEPs creates redundant common expressions, so run
946   // EarlyCSE after it.
947   addPass(createEarlyCSEPass());
948 }
949 
950 void AMDGPUPassConfig::addIRPasses() {
951   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
952 
953   // There is no reason to run these.
954   disablePass(&StackMapLivenessID);
955   disablePass(&FuncletLayoutID);
956   disablePass(&PatchableFunctionID);
957 
958   addPass(createAMDGPUPrintfRuntimeBinding());
959   addPass(createAMDGPUCtorDtorLoweringLegacyPass());
960 
961   // Run the propagate-attributes pass in the backend in case opt was not run.
962   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
963 
964   addPass(createAMDGPULowerIntrinsicsPass());
965 
966   // Function calls are not supported, so make sure we inline everything.
967   addPass(createAMDGPUAlwaysInlinePass());
968   addPass(createAlwaysInlinerLegacyPass());
969   // We need to add the barrier noop pass, otherwise adding the function
970   // inlining pass will cause all of the PassConfig's passes to be run
971   // one function at a time, which means if we have a module with two
972   // functions, then we will generate code for the first function
973   // without ever running any passes on the second.
974   addPass(createBarrierNoopPass());
975 
976   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
977   if (TM.getTargetTriple().getArch() == Triple::r600)
978     addPass(createR600OpenCLImageTypeLoweringPass());
979 
980   // Replace OpenCL enqueued block function pointers with global variables.
981   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
982 
983   // Can increase the LDS used by a kernel, so this runs before PromoteAlloca.
984   if (EnableLowerModuleLDS) {
985     // The "amdgpu-replace-lds-use-with-pointer" pass needs to run before the
986     // "amdgpu-lower-module-lds" pass, and it should only be run when the
987     // "amdgpu-lower-module-lds" pass is enabled.
988     if (EnableLDSReplaceWithPointer)
989       addPass(createAMDGPUReplaceLDSUseWithPointerPass());
990 
991     addPass(createAMDGPULowerModuleLDSPass());
992   }
993 
994   if (TM.getOptLevel() > CodeGenOpt::None)
995     addPass(createInferAddressSpacesPass());
996 
997   addPass(createAtomicExpandPass());
998 
999   if (TM.getOptLevel() > CodeGenOpt::None) {
1000     addPass(createAMDGPUPromoteAlloca());
1001 
1002     if (EnableSROA)
1003       addPass(createSROAPass());
1004     if (isPassEnabled(EnableScalarIRPasses))
1005       addStraightLineScalarOptimizationPasses();
1006 
1007     if (EnableAMDGPUAliasAnalysis) {
1008       addPass(createAMDGPUAAWrapperPass());
1009       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1010                                              AAResults &AAR) {
1011         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1012           AAR.addAAResult(WrapperPass->getResult());
1013         }));
1014     }
1015 
1016     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1017       // TODO: May want to move later or split into an early and late one.
1018       addPass(createAMDGPUCodeGenPreparePass());
1019     }
1020   }
1021 
1022   TargetPassConfig::addIRPasses();
1023 
1024   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1025   // example, GVN can combine
1026   //
1027   //   %0 = add %a, %b
1028   //   %1 = add %b, %a
1029   //
1030   // and
1031   //
1032   //   %0 = shl nsw %a, 2
1033   //   %1 = shl %a, 2
1034   //
1035   // but EarlyCSE can do neither of them.
1036   if (isPassEnabled(EnableScalarIRPasses))
1037     addEarlyCSEOrGVNPass();
1038 }
1039 
1040 void AMDGPUPassConfig::addCodeGenPrepare() {
1041   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1042     addPass(createAMDGPUAttributorPass());
1043 
1044     // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1045     // analysis, and should be removed.
1046     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1047   }
1048 
1049   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1050       EnableLowerKernelArguments)
1051     addPass(createAMDGPULowerKernelArgumentsPass());
1052 
1053   TargetPassConfig::addCodeGenPrepare();
1054 
1055   if (isPassEnabled(EnableLoadStoreVectorizer))
1056     addPass(createLoadStoreVectorizerPass());
1057 
1058   // The LowerSwitch pass may introduce unreachable blocks that can
1059   // cause unexpected behavior for subsequent passes. Placing it
1060   // here seems better, as these blocks get cleaned up by
1061   // UnreachableBlockElim inserted next in the pass flow.
1062   addPass(createLowerSwitchPass());
1063 }
1064 
1065 bool AMDGPUPassConfig::addPreISel() {
1066   if (TM->getOptLevel() > CodeGenOpt::None)
1067     addPass(createFlattenCFGPass());
1068   return false;
1069 }
1070 
1071 bool AMDGPUPassConfig::addInstSelector() {
1072   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1073   return false;
1074 }
1075 
1076 bool AMDGPUPassConfig::addGCPasses() {
1077   // Do nothing. GC is not supported.
1078   return false;
1079 }
1080 
1081 llvm::ScheduleDAGInstrs *
1082 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1083   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1084   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1085   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1086   if (ST.shouldClusterStores())
1087     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1088   return DAG;
1089 }
1090 
1091 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1092     BumpPtrAllocator &Allocator, const Function &F,
1093     const TargetSubtargetInfo *STI) const {
1094   return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1095       Allocator, F, static_cast<const R600Subtarget *>(STI));
1096 }
1097 
1098 //===----------------------------------------------------------------------===//
1099 // GCN Pass Setup
1100 //===----------------------------------------------------------------------===//
1101 
1102 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1103   MachineSchedContext *C) const {
1104   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1105   if (ST.enableSIScheduler())
1106     return createSIMachineScheduler(C);
1107 
1108   if (EnableMaxIlpSchedStrategy)
1109     return createGCNMaxILPMachineScheduler(C);
1110 
1111   return createGCNMaxOccupancyMachineScheduler(C);
1112 }
1113 
1114 bool GCNPassConfig::addPreISel() {
1115   AMDGPUPassConfig::addPreISel();
1116 
1117   if (TM->getOptLevel() > CodeGenOpt::None)
1118     addPass(createAMDGPULateCodeGenPreparePass());
1119 
1120   if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1121     addPass(createAMDGPUAtomicOptimizerPass());
1122   }
1123 
1124   if (TM->getOptLevel() > CodeGenOpt::None)
1125     addPass(createSinkingPass());
1126 
1127   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1128   // regions formed by them.
1129   addPass(&AMDGPUUnifyDivergentExitNodesID);
1130   if (!LateCFGStructurize) {
1131     if (EnableStructurizerWorkarounds) {
1132       addPass(createFixIrreduciblePass());
1133       addPass(createUnifyLoopExitsPass());
1134     }
1135     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1136   }
1137   addPass(createAMDGPUAnnotateUniformValues());
1138   if (!LateCFGStructurize) {
1139     addPass(createSIAnnotateControlFlowPass());
1140     // TODO: Move this right after structurizeCFG to avoid extra divergence
1141     // analysis. This depends on stopping SIAnnotateControlFlow from making
1142     // control flow modifications.
1143     addPass(createAMDGPURewriteUndefForPHIPass());
1144   }
1145   addPass(createLCSSAPass());
1146 
1147   if (TM->getOptLevel() > CodeGenOpt::Less)
1148     addPass(&AMDGPUPerfHintAnalysisID);
1149 
1150   return false;
1151 }
1152 
1153 void GCNPassConfig::addMachineSSAOptimization() {
1154   TargetPassConfig::addMachineSSAOptimization();
1155 
1156   // We want to fold operands after PeepholeOptimizer has run (or as part of
1157   // it), because it will eliminate extra copies making it easier to fold the
1158   // real source operand. We want to eliminate dead instructions after, so that
1159   // we see fewer uses of the copies. We then need to clean up the dead
1160   // instructions leftover after the operands are folded as well.
1161   //
1162   // XXX - Can we get away without running DeadMachineInstructionElim again?
1163   addPass(&SIFoldOperandsID);
1164   if (EnableDPPCombine)
1165     addPass(&GCNDPPCombineID);
1166   addPass(&SILoadStoreOptimizerID);
1167   if (isPassEnabled(EnableSDWAPeephole)) {
1168     addPass(&SIPeepholeSDWAID);
1169     addPass(&EarlyMachineLICMID);
1170     addPass(&MachineCSEID);
1171     addPass(&SIFoldOperandsID);
1172   }
1173   addPass(&DeadMachineInstructionElimID);
1174   addPass(createSIShrinkInstructionsPass());
1175 }
1176 
1177 bool GCNPassConfig::addILPOpts() {
1178   if (EnableEarlyIfConversion)
1179     addPass(&EarlyIfConverterID);
1180 
1181   TargetPassConfig::addILPOpts();
1182   return false;
1183 }
1184 
1185 bool GCNPassConfig::addInstSelector() {
1186   AMDGPUPassConfig::addInstSelector();
1187   addPass(&SIFixSGPRCopiesID);
1188   addPass(createSILowerI1CopiesPass());
1189   return false;
1190 }
1191 
1192 bool GCNPassConfig::addIRTranslator() {
1193   addPass(new IRTranslator(getOptLevel()));
1194   return false;
1195 }
1196 
1197 void GCNPassConfig::addPreLegalizeMachineIR() {
1198   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1199   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1200   addPass(new Localizer());
1201 }
1202 
1203 bool GCNPassConfig::addLegalizeMachineIR() {
1204   addPass(new Legalizer());
1205   return false;
1206 }
1207 
1208 void GCNPassConfig::addPreRegBankSelect() {
1209   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1210   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1211 }
1212 
1213 bool GCNPassConfig::addRegBankSelect() {
1214   addPass(new RegBankSelect());
1215   return false;
1216 }
1217 
1218 void GCNPassConfig::addPreGlobalInstructionSelect() {
1219   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1220   addPass(createAMDGPURegBankCombiner(IsOptNone));
1221 }
1222 
1223 bool GCNPassConfig::addGlobalInstructionSelect() {
1224   addPass(new InstructionSelect(getOptLevel()));
1225   return false;
1226 }
1227 
1228 void GCNPassConfig::addPreRegAlloc() {
1229   if (LateCFGStructurize) {
1230     addPass(createAMDGPUMachineCFGStructurizerPass());
1231   }
1232 }
1233 
1234 void GCNPassConfig::addFastRegAlloc() {
1235   // FIXME: We have to disable the verifier here because of PHIElimination +
1236   // TwoAddressInstructions disabling it.
1237 
1238   // This must be run immediately after phi elimination and before
1239   // TwoAddressInstructions, otherwise the processing of the tied operand of
1240   // SI_ELSE will introduce a copy of the tied operand source after the else.
1241   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1242 
1243   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1244   insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1245 
1246   TargetPassConfig::addFastRegAlloc();
1247 }
1248 
1249 void GCNPassConfig::addOptimizedRegAlloc() {
1250   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1251   // instructions that cause scheduling barriers.
1252   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1253   insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1254 
1255   if (OptExecMaskPreRA)
1256     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1257 
1258   if (isPassEnabled(EnablePreRAOptimizations))
1259     insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1260 
1261   // This is not an essential optimization and it has a noticeable impact on
1262   // compilation time, so we only enable it from O2.
1263   if (TM->getOptLevel() > CodeGenOpt::Less)
1264     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1265 
1266   // FIXME: When an instruction has a killed operand and the instruction is
1267   // inside a bundle, it seems that only the BUNDLE instruction appears as the
1268   // kill of the register in LiveVariables. This would trigger a verifier
1269   // failure; we should fix it and enable the verifier.
1270   if (OptVGPRLiveRange)
1271     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1272   // This must be run immediately after phi elimination and before
1273   // TwoAddressInstructions, otherwise the processing of the tied operand of
1274   // SI_ELSE will introduce a copy of the tied operand source after the else.
1275   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1276 
1277   if (EnableDCEInRA)
1278     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1279 
1280   TargetPassConfig::addOptimizedRegAlloc();
1281 }
1282 
1283 bool GCNPassConfig::addPreRewrite() {
1284   if (EnableRegReassign)
1285     addPass(&GCNNSAReassignID);
1286   return true;
1287 }
1288 
1289 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1290   // Initialize the global default.
1291   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1292                   initializeDefaultSGPRRegisterAllocatorOnce);
1293 
1294   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1295   if (Ctor != useDefaultRegisterAllocator)
1296     return Ctor();
1297 
1298   if (Optimized)
1299     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1300 
1301   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1302 }
1303 
1304 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1305   // Initialize the global default.
1306   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1307                   initializeDefaultVGPRRegisterAllocatorOnce);
1308 
1309   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1310   if (Ctor != useDefaultRegisterAllocator)
1311     return Ctor();
1312 
1313   if (Optimized)
1314     return createGreedyVGPRRegisterAllocator();
1315 
1316   return createFastVGPRRegisterAllocator();
1317 }
1318 
1319 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1320   llvm_unreachable("should not be used");
1321 }
1322 
1323 static const char RegAllocOptNotSupportedMessage[] =
1324   "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1325 
1326 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1327   if (!usingDefaultRegAlloc())
1328     report_fatal_error(RegAllocOptNotSupportedMessage);
1329 
1330   addPass(createSGPRAllocPass(false));
1331 
1332   // Equivalent of PEI for SGPRs.
1333   addPass(&SILowerSGPRSpillsID);
1334 
1335   addPass(createVGPRAllocPass(false));
1336   return true;
1337 }
1338 
1339 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1340   if (!usingDefaultRegAlloc())
1341     report_fatal_error(RegAllocOptNotSupportedMessage);
1342 
1343   addPass(createSGPRAllocPass(true));
1344 
1345   // Commit allocated register changes. This is mostly necessary because too
1346   // many things rely on the use lists of the physical registers, such as the
1347   // verifier. This is only necessary with allocators which use LiveIntervals,
1348   // since FastRegAlloc does the replacements itself.
1349   addPass(createVirtRegRewriter(false));
1350 
1351   // Equivalent of PEI for SGPRs.
1352   addPass(&SILowerSGPRSpillsID);
1353 
1354   addPass(createVGPRAllocPass(true));
1355 
1356   addPreRewrite();
1357   addPass(&VirtRegRewriterID);
1358 
1359   return true;
1360 }
1361 
1362 void GCNPassConfig::addPostRegAlloc() {
1363   addPass(&SIFixVGPRCopiesID);
1364   if (getOptLevel() > CodeGenOpt::None)
1365     addPass(&SIOptimizeExecMaskingID);
1366   TargetPassConfig::addPostRegAlloc();
1367 }
1368 
1369 void GCNPassConfig::addPreSched2() {
1370   if (TM->getOptLevel() > CodeGenOpt::None)
1371     addPass(createSIShrinkInstructionsPass());
1372   addPass(&SIPostRABundlerID);
1373 }
1374 
1375 void GCNPassConfig::addPreEmitPass() {
1376   if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
1377     addPass(&GCNCreateVOPDID);
1378   addPass(createSIMemoryLegalizerPass());
1379   addPass(createSIInsertWaitcntsPass());
1380 
1381   addPass(createSIModeRegisterPass());
1382 
1383   if (getOptLevel() > CodeGenOpt::None)
1384     addPass(&SIInsertHardClausesID);
1385 
1386   addPass(&SILateBranchLoweringPassID);
1387   if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1388     addPass(createAMDGPUSetWavePriorityPass());
1389   if (getOptLevel() > CodeGenOpt::None)
1390     addPass(&SIPreEmitPeepholeID);
1391   // The hazard recognizer that runs as part of the post-ra scheduler does not
1392   // guarantee to be able to handle all hazards correctly. This is because if there
1393   // are multiple scheduling regions in a basic block, the regions are scheduled
1394   // bottom up, so when we begin to schedule a region we don't know what
1395   // instructions were emitted directly before it.
1396   //
1397   // Here we add a stand-alone hazard recognizer pass which can handle all
1398   // cases.
1399   addPass(&PostRAHazardRecognizerID);
1400 
1401   if (getOptLevel() > CodeGenOpt::Less)
1402     addPass(&AMDGPUReleaseVGPRsID);
1403 
1404   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
1405     addPass(&AMDGPUInsertDelayAluID);
1406 
1407   addPass(&BranchRelaxationPassID);
1408 }
1409 
1410 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1411   return new GCNPassConfig(*this, PM);
1412 }
1413 
1414 MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1415     BumpPtrAllocator &Allocator, const Function &F,
1416     const TargetSubtargetInfo *STI) const {
1417   return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1418       Allocator, F, static_cast<const GCNSubtarget *>(STI));
1419 }
1420 
1421 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1422   return new yaml::SIMachineFunctionInfo();
1423 }
1424 
1425 yaml::MachineFunctionInfo *
1426 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1427   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1428   return new yaml::SIMachineFunctionInfo(
1429       *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1430 }
1431 
1432 bool GCNTargetMachine::parseMachineFunctionInfo(
1433     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1434     SMDiagnostic &Error, SMRange &SourceRange) const {
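       // Rebuild the target-specific function state from the MIR
       // 'machineFunctionInfo' block. A minimal sketch of the kind of YAML this
       // consumes (register names are arbitrary examples; field spellings are
       // recalled from the MIR format and meant for illustration only):
       //
       //   machineFunctionInfo:
       //     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
       //     frameOffsetReg:    '$sgpr33'
       //     stackPtrOffsetReg: '$sgpr32'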
1435   const yaml::SIMachineFunctionInfo &YamlMFI =
1436       static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1437   MachineFunction &MF = PFS.MF;
1438   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1439 
1440   if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1441     return true;
1442 
1443   if (MFI->Occupancy == 0) {
1444     // Fix up the subtarget-dependent default value.
1445     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1446     MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1447   }
1448 
1449   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1450     Register TempReg;
1451     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1452       SourceRange = RegName.SourceRange;
1453       return true;
1454     }
1455     RegVal = TempReg;
1456 
1457     return false;
1458   };
1459 
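       // An empty string means the field was not specified in the YAML; only
       // parse names that are actually present.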
1460   auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1461                                    Register &RegVal) {
1462     return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1463   };
1464 
1465   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1466     return true;
1467 
1468   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1469     // Create a diagnostic for the register string literal.
1470     const MemoryBuffer &Buffer =
1471         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1472     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1473                          RegName.Value.size(), SourceMgr::DK_Error,
1474                          "incorrect register class for field", RegName.Value,
1475                          std::nullopt, std::nullopt);
1476     SourceRange = RegName.SourceRange;
1477     return true;
1478   };
1479 
1480   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1481       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1482       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1483     return true;
1484 
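       // The pseudo registers PRIVATE_RSRC_REG, FP_REG and SP_REG act as
       // "not yet assigned" placeholders; any real register must belong to the
       // expected class.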
1485   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1486       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1487     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1488   }
1489 
1490   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1491       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1492     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1493   }
1494 
1495   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1496       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1497     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1498   }
1499 
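       // Reserve the registers listed for whole wave mode (WWM) use.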
1500   for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1501     Register ParsedReg;
1502     if (parseRegister(YamlReg, ParsedReg))
1503       return true;
1504 
1505     MFI->reserveWWMRegister(ParsedReg);
1506   }
1507 
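       // Parse a single optional argument descriptor: either a named register
       // (validated against the expected register class) or a stack offset,
       // with an optional mask, and account for the user/system SGPRs it
       // occupies.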
1508   auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1509                                    const TargetRegisterClass &RC,
1510                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1511                                    unsigned SystemSGPRs) {
1512     // Skip parsing if it's not present.
1513     if (!A)
1514       return false;
1515 
1516     if (A->IsRegister) {
1517       Register Reg;
1518       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1519         SourceRange = A->RegisterName.SourceRange;
1520         return true;
1521       }
1522       if (!RC.contains(Reg))
1523         return diagnoseRegisterClass(A->RegisterName);
1524       Arg = ArgDescriptor::createRegister(Reg);
1525     } else
1526       Arg = ArgDescriptor::createStack(A->StackOffset);
1527     // Check and apply the optional mask.
1528     if (A->Mask)
1529       Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1530 
1531     MFI->NumUserSGPRs += UserSGPRs;
1532     MFI->NumSystemSGPRs += SystemSGPRs;
1533     return false;
1534   };
1535 
1536   if (YamlMFI.ArgInfo &&
1537       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1538                              AMDGPU::SGPR_128RegClass,
1539                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1540        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1541                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1542                              2, 0) ||
1543        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1544                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1545        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1546                              AMDGPU::SReg_64RegClass,
1547                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1548        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1549                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1550                              2, 0) ||
1551        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1552                              AMDGPU::SReg_64RegClass,
1553                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1554        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1555                              AMDGPU::SGPR_32RegClass,
1556                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1557        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1558                              AMDGPU::SGPR_32RegClass,
1559                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
1560        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1561                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1562                              0, 1) ||
1563        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1564                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1565                              0, 1) ||
1566        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1567                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1568                              0, 1) ||
1569        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1570                              AMDGPU::SGPR_32RegClass,
1571                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1572        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1573                              AMDGPU::SGPR_32RegClass,
1574                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1575        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1576                              AMDGPU::SReg_64RegClass,
1577                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1578        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1579                              AMDGPU::SReg_64RegClass,
1580                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1581        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1582                              AMDGPU::VGPR_32RegClass,
1583                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1584        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1585                              AMDGPU::VGPR_32RegClass,
1586                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1587        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1588                              AMDGPU::VGPR_32RegClass,
1589                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1590     return true;
1591 
1592   MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1593   MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1594 
1595   // FIXME: Move proper support for denormal-fp-math into base MachineFunction
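       // The YAML stores each denormal control as a boolean: true maps to full
       // IEEE denormal handling, false to flush-to-zero with the sign
       // preserved.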
1596   MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1597                                       ? DenormalMode::IEEE
1598                                       : DenormalMode::PreserveSign;
1599   MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1600                                        ? DenormalMode::IEEE
1601                                        : DenormalMode::PreserveSign;
1602 
1603   MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1604                                           ? DenormalMode::IEEE
1605                                           : DenormalMode::PreserveSign;
1606   MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1607                                            ? DenormalMode::IEEE
1608                                            : DenormalMode::PreserveSign;
1609 
1610   return false;
1611 }
1612