xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (revision 069ac18495ad8fde2748bc94b0f80a50250bb01d)
1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
13 /// an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17 
18 #include "AMDGPUAsmPrinter.h"
19 #include "AMDGPU.h"
20 #include "AMDGPUHSAMetadataStreamer.h"
21 #include "AMDGPUResourceUsageAnalysis.h"
22 #include "AMDKernelCodeT.h"
23 #include "GCNSubtarget.h"
24 #include "MCTargetDesc/AMDGPUInstPrinter.h"
25 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
26 #include "R600AsmPrinter.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "TargetInfo/AMDGPUTargetInfo.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
31 #include "llvm/BinaryFormat/ELF.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/MC/MCAssembler.h"
36 #include "llvm/MC/MCContext.h"
37 #include "llvm/MC/MCSectionELF.h"
38 #include "llvm/MC/MCStreamer.h"
39 #include "llvm/MC/TargetRegistry.h"
40 #include "llvm/Support/AMDHSAKernelDescriptor.h"
41 #include "llvm/Target/TargetLoweringObjectFile.h"
42 #include "llvm/Target/TargetMachine.h"
43 #include "llvm/TargetParser/TargetParser.h"
44 
45 using namespace llvm;
46 using namespace llvm::AMDGPU;
47 
48 // This should get the default rounding mode from the kernel. We just set the
49 // default here, but this could change if the OpenCL rounding mode pragmas are
50 // used.
51 //
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55 //
56 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57 // precision, and leaves single precision to flush all and does not report
58 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59 // CL_FP_DENORM for both.
60 //
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).
64 
65 // We want to use these instructions, and using fp32 denormals also causes
66 // instructions to run at the double precision rate for the device so it's
67 // probably best to just report no single precision denormals.
68 static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
69   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
70          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
71          FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72          FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73 }
74 
75 static AsmPrinter *
76 createAMDGPUAsmPrinterPass(TargetMachine &tm,
77                            std::unique_ptr<MCStreamer> &&Streamer) {
78   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79 }
80 
81 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
82   TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
83                                      llvm::createR600AsmPrinterPass);
84   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
85                                      createAMDGPUAsmPrinterPass);
86 }
87 
88 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
89                                    std::unique_ptr<MCStreamer> Streamer)
90     : AsmPrinter(TM, std::move(Streamer)) {
91   assert(OutStreamer && "AsmPrinter constructed without streamer");
92 }
93 
94 StringRef AMDGPUAsmPrinter::getPassName() const {
95   return "AMDGPU Assembly Printer";
96 }
97 
98 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
99   return TM.getMCSubtargetInfo();
100 }
101 
102 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
103   if (!OutStreamer)
104     return nullptr;
105   return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106 }
107 
108 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
109   IsTargetStreamerInitialized = false;
110 }
111 
112 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
113   IsTargetStreamerInitialized = true;
114 
115   // TODO: Which one is called first, emitStartOfAsmFile or
116   // emitFunctionBodyStart?
117   if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118     initializeTargetID(M);
119 
120   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
121       TM.getTargetTriple().getOS() != Triple::AMDPAL)
122     return;
123 
124   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
125     getTargetStreamer()->EmitDirectiveAMDGCNTarget();
126 
127   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
128     HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
129 
130   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
131     getTargetStreamer()->getPALMetadata()->readFromIR(M);
132 
133   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
134     return;
135 
136   // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
137   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
138     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
139 
140   // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
141   IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
142   getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2(
143       Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
144 }
145 
146 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
147   // Init target streamer if it has not yet happened
148   if (!IsTargetStreamerInitialized)
149     initTargetStreamer(M);
150 
151   if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
152       CodeObjectVersion == AMDGPU::AMDHSA_COV2)
153     getTargetStreamer()->EmitISAVersion();
154 
155   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
156   // Emit HSA Metadata (NT_AMD_HSA_METADATA).
157   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
158     HSAMetadataStream->end();
159     bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
160     (void)Success;
161     assert(Success && "Malformed HSA Metadata");
162   }
163 }
164 
165 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
166   const MachineBasicBlock *MBB) const {
167   if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
168     return false;
169 
170   if (MBB->empty())
171     return true;
172 
173   // If this is a block implementing a long branch, an expression relative to
174   // the start of the block is needed.  to the start of the block.
175   // XXX - Is there a smarter way to check this?
176   return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
177 }
178 
179 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
180   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
181   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
182   const Function &F = MF->getFunction();
183 
184   // TODO: Which one is called first, emitStartOfAsmFile or
185   // emitFunctionBodyStart?
186   if (!getTargetStreamer()->getTargetID())
187     initializeTargetID(*F.getParent());
188 
189   const auto &FunctionTargetID = STM.getTargetID();
190   // Make sure function's xnack settings are compatible with module's
191   // xnack settings.
192   if (FunctionTargetID.isXnackSupported() &&
193       FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
194       FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
195     OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
196                            "' function does not match module xnack setting");
197     return;
198   }
199   // Make sure function's sramecc settings are compatible with module's
200   // sramecc settings.
201   if (FunctionTargetID.isSramEccSupported() &&
202       FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
203       FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
204     OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
205                            "' function does not match module sramecc setting");
206     return;
207   }
208 
209   if (!MFI.isEntryFunction())
210     return;
211 
212   if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) &&
213       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
214        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
215     amd_kernel_code_t KernelCode;
216     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
217     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
218   }
219 
220   if (STM.isAmdHsaOS())
221     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
222 }
223 
224 void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
225   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
226   if (!MFI.isEntryFunction())
227     return;
228 
229   if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
230       CodeObjectVersion == AMDGPU::AMDHSA_COV2)
231     return;
232 
233   auto &Streamer = getTargetStreamer()->getStreamer();
234   auto &Context = Streamer.getContext();
235   auto &ObjectFileInfo = *Context.getObjectFileInfo();
236   auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
237 
238   Streamer.pushSection();
239   Streamer.switchSection(&ReadOnlySection);
240 
241   // CP microcode requires the kernel descriptor to be allocated on 64 byte
242   // alignment.
243   Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
244   ReadOnlySection.ensureMinAlignment(Align(64));
245 
246   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
247 
248   SmallString<128> KernelName;
249   getNameWithPrefix(KernelName, &MF->getFunction());
250   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
251       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
252       CurrentProgramInfo.NumVGPRsForWavesPerEU,
253       CurrentProgramInfo.NumSGPRsForWavesPerEU -
254           IsaInfo::getNumExtraSGPRs(
255               &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
256               getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
257       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
258       CodeObjectVersion);
259 
260   Streamer.popSection();
261 }
262 
263 void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
264   if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
265       CodeObjectVersion >= AMDGPU::AMDHSA_COV3) {
266     AsmPrinter::emitFunctionEntryLabel();
267     return;
268   }
269 
270   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
271   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
272   if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
273     SmallString<128> SymbolName;
274     getNameWithPrefix(SymbolName, &MF->getFunction()),
275     getTargetStreamer()->EmitAMDGPUSymbolType(
276         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
277   }
278   if (DumpCodeInstEmitter) {
279     // Disassemble function name label to text.
280     DisasmLines.push_back(MF->getName().str() + ":");
281     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
282     HexLines.push_back("");
283   }
284 
285   AsmPrinter::emitFunctionEntryLabel();
286 }
287 
288 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
289   if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
290     // Write a line for the basic block label if it is not only fallthrough.
291     DisasmLines.push_back(
292         (Twine("BB") + Twine(getFunctionNumber())
293          + "_" + Twine(MBB.getNumber()) + ":").str());
294     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
295     HexLines.push_back("");
296   }
297   AsmPrinter::emitBasicBlockStart(MBB);
298 }
299 
300 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
301   if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
302     if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
303       OutContext.reportError({},
304                              Twine(GV->getName()) +
305                                  ": unsupported initializer for address space");
306       return;
307     }
308 
309     // LDS variables aren't emitted in HSA or PAL yet.
310     const Triple::OSType OS = TM.getTargetTriple().getOS();
311     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
312       return;
313 
314     MCSymbol *GVSym = getSymbol(GV);
315 
316     GVSym->redefineIfPossible();
317     if (GVSym->isDefined() || GVSym->isVariable())
318       report_fatal_error("symbol '" + Twine(GVSym->getName()) +
319                          "' is already defined");
320 
321     const DataLayout &DL = GV->getParent()->getDataLayout();
322     uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
323     Align Alignment = GV->getAlign().value_or(Align(4));
324 
325     emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
326     emitLinkage(GV, GVSym);
327     auto TS = getTargetStreamer();
328     TS->emitAMDGPULDS(GVSym, Size, Alignment);
329     return;
330   }
331 
332   AsmPrinter::emitGlobalVariable(GV);
333 }
334 
335 bool AMDGPUAsmPrinter::doInitialization(Module &M) {
336   CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
337 
338   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
339     switch (CodeObjectVersion) {
340     case AMDGPU::AMDHSA_COV2:
341       HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
342       break;
343     case AMDGPU::AMDHSA_COV3:
344       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
345       break;
346     case AMDGPU::AMDHSA_COV4:
347       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
348       break;
349     case AMDGPU::AMDHSA_COV5:
350       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
351       break;
352     default:
353       report_fatal_error("Unexpected code object version");
354     }
355   }
356   return AsmPrinter::doInitialization(M);
357 }
358 
359 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
360   // Pad with s_code_end to help tools and guard against instruction prefetch
361   // causing stale data in caches. Arguably this should be done by the linker,
362   // which is why this isn't done for Mesa.
363   const MCSubtargetInfo &STI = *getGlobalSTI();
364   if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
365       (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
366        STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
367     OutStreamer->switchSection(getObjFileLowering().getTextSection());
368     getTargetStreamer()->EmitCodeEnd(STI);
369   }
370 
371   return AsmPrinter::doFinalization(M);
372 }
373 
374 // Print comments that apply to both callable functions and entry points.
375 void AMDGPUAsmPrinter::emitCommonFunctionComments(
376     uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
377     uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
378     const AMDGPUMachineFunction *MFI) {
379   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
380   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
381   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
382   if (NumAGPR) {
383     OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
384     OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
385                                 false);
386   }
387   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
388   OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
389                               false);
390 }
391 
392 uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
393     const MachineFunction &MF) const {
394   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
395   uint16_t KernelCodeProperties = 0;
396 
397   if (MFI.hasPrivateSegmentBuffer()) {
398     KernelCodeProperties |=
399         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
400   }
401   if (MFI.hasDispatchPtr()) {
402     KernelCodeProperties |=
403         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
404   }
405   if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
406     KernelCodeProperties |=
407         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
408   }
409   if (MFI.hasKernargSegmentPtr()) {
410     KernelCodeProperties |=
411         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
412   }
413   if (MFI.hasDispatchID()) {
414     KernelCodeProperties |=
415         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
416   }
417   if (MFI.hasFlatScratchInit()) {
418     KernelCodeProperties |=
419         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
420   }
421   if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
422     KernelCodeProperties |=
423         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
424   }
425 
426   if (CurrentProgramInfo.DynamicCallStack &&
427       CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
428     KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
429 
430   return KernelCodeProperties;
431 }
432 
433 amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
434     const MachineFunction &MF,
435     const SIProgramInfo &PI) const {
436   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
437   const Function &F = MF.getFunction();
438 
439   amdhsa::kernel_descriptor_t KernelDescriptor;
440   memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
441 
442   assert(isUInt<32>(PI.ScratchSize));
443   assert(isUInt<32>(PI.getComputePGMRSrc1()));
444   assert(isUInt<32>(PI.getComputePGMRSrc2()));
445 
446   KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
447   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
448 
449   Align MaxKernArgAlign;
450   KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
451 
452   KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
453   KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
454   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
455 
456   assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
457   if (STM.hasGFX90AInsts())
458     KernelDescriptor.compute_pgm_rsrc3 =
459       CurrentProgramInfo.ComputePGMRSrc3GFX90A;
460 
461   return KernelDescriptor;
462 }
463 
464 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
465   // Init target streamer lazily on the first function so that previous passes
466   // can set metadata.
467   if (!IsTargetStreamerInitialized)
468     initTargetStreamer(*MF.getFunction().getParent());
469 
470   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
471   CurrentProgramInfo = SIProgramInfo();
472 
473   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
474 
475   // The starting address of all shader programs must be 256 bytes aligned.
476   // Regular functions just need the basic required instruction alignment.
477   MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
478 
479   SetupMachineFunction(MF);
480 
481   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
482   MCContext &Context = getObjFileLowering().getContext();
483   // FIXME: This should be an explicit check for Mesa.
484   if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
485     MCSectionELF *ConfigSection =
486         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
487     OutStreamer->switchSection(ConfigSection);
488   }
489 
490   if (MFI->isModuleEntryFunction()) {
491     getSIProgramInfo(CurrentProgramInfo, MF);
492   }
493 
494   if (STM.isAmdPalOS()) {
495     if (MFI->isEntryFunction())
496       EmitPALMetadata(MF, CurrentProgramInfo);
497     else if (MFI->isModuleEntryFunction())
498       emitPALFunctionMetadata(MF);
499   } else if (!STM.isAmdHsaOS()) {
500     EmitProgramInfoSI(MF, CurrentProgramInfo);
501   }
502 
503   DumpCodeInstEmitter = nullptr;
504   if (STM.dumpCode()) {
505     // For -dumpcode, get the assembler out of the streamer, even if it does
506     // not really want to let us have it. This only works with -filetype=obj.
507     bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
508     OutStreamer->setUseAssemblerInfoForParsing(true);
509     MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
510     OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
511     if (Assembler)
512       DumpCodeInstEmitter = Assembler->getEmitterPtr();
513   }
514 
515   DisasmLines.clear();
516   HexLines.clear();
517   DisasmLineMaxLen = 0;
518 
519   emitFunctionBody();
520 
521   emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
522                            STM.hasMAIInsts());
523 
524   if (isVerbose()) {
525     MCSectionELF *CommentSection =
526         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
527     OutStreamer->switchSection(CommentSection);
528 
529     if (!MFI->isEntryFunction()) {
530       OutStreamer->emitRawComment(" Function info:", false);
531       const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
532           ResourceUsage->getResourceInfo(&MF.getFunction());
533       emitCommonFunctionComments(
534           Info.NumVGPR,
535           STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
536           Info.getTotalNumVGPRs(STM),
537           Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
538           Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
539       return false;
540     }
541 
542     OutStreamer->emitRawComment(" Kernel info:", false);
543     emitCommonFunctionComments(
544         CurrentProgramInfo.NumArchVGPR,
545         STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
546                           : std::optional<uint32_t>(),
547         CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
548         CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
549 
550     OutStreamer->emitRawComment(
551       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
552     OutStreamer->emitRawComment(
553       " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
554     OutStreamer->emitRawComment(
555       " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
556       " bytes/workgroup (compile time only)", false);
557 
558     OutStreamer->emitRawComment(
559       " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
560     OutStreamer->emitRawComment(
561       " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
562 
563     OutStreamer->emitRawComment(
564       " NumSGPRsForWavesPerEU: " +
565       Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
566     OutStreamer->emitRawComment(
567       " NumVGPRsForWavesPerEU: " +
568       Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
569 
570     if (STM.hasGFX90AInsts())
571       OutStreamer->emitRawComment(
572         " AccumOffset: " +
573         Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
574 
575     OutStreamer->emitRawComment(
576       " Occupancy: " +
577       Twine(CurrentProgramInfo.Occupancy), false);
578 
579     OutStreamer->emitRawComment(
580       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
581 
582     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
583                                     Twine(CurrentProgramInfo.ScratchEnable),
584                                 false);
585     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
586                                     Twine(CurrentProgramInfo.UserSGPR),
587                                 false);
588     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
589                                     Twine(CurrentProgramInfo.TrapHandlerEnable),
590                                 false);
591     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
592                                     Twine(CurrentProgramInfo.TGIdXEnable),
593                                 false);
594     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
595                                     Twine(CurrentProgramInfo.TGIdYEnable),
596                                 false);
597     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
598                                     Twine(CurrentProgramInfo.TGIdZEnable),
599                                 false);
600     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
601                                     Twine(CurrentProgramInfo.TIdIGCompCount),
602                                 false);
603 
604     assert(STM.hasGFX90AInsts() ||
605            CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
606     if (STM.hasGFX90AInsts()) {
607       OutStreamer->emitRawComment(
608         " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
609         Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
610                                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
611                                false);
612       OutStreamer->emitRawComment(
613         " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
614         Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
615                                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
616                                false);
617     }
618   }
619 
620   if (DumpCodeInstEmitter) {
621 
622     OutStreamer->switchSection(
623         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
624 
625     for (size_t i = 0; i < DisasmLines.size(); ++i) {
626       std::string Comment = "\n";
627       if (!HexLines[i].empty()) {
628         Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
629         Comment += " ; " + HexLines[i] + "\n";
630       }
631 
632       OutStreamer->emitBytes(StringRef(DisasmLines[i]));
633       OutStreamer->emitBytes(StringRef(Comment));
634     }
635   }
636 
637   return false;
638 }
639 
640 // TODO: Fold this into emitFunctionBodyStart.
641 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
642   // In the beginning all features are either 'Any' or 'NotSupported',
643   // depending on global target features. This will cover empty modules.
644   getTargetStreamer()->initializeTargetID(
645       *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
646 
647   // If module is empty, we are done.
648   if (M.empty())
649     return;
650 
651   // If module is not empty, need to find first 'Off' or 'On' feature
652   // setting per feature from functions in module.
653   for (auto &F : M) {
654     auto &TSTargetID = getTargetStreamer()->getTargetID();
655     if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
656         (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
657       break;
658 
659     const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
660     const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
661     if (TSTargetID->isXnackSupported())
662       if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
663         TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
664     if (TSTargetID->isSramEccSupported())
665       if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
666         TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
667   }
668 }
669 
670 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
671   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
672   const SIInstrInfo *TII = STM.getInstrInfo();
673 
674   uint64_t CodeSize = 0;
675 
676   for (const MachineBasicBlock &MBB : MF) {
677     for (const MachineInstr &MI : MBB) {
678       // TODO: CodeSize should account for multiple functions.
679 
680       // TODO: Should we count size of debug info?
681       if (MI.isDebugInstr())
682         continue;
683 
684       CodeSize += TII->getInstSizeInBytes(MI);
685     }
686   }
687 
688   return CodeSize;
689 }
690 
691 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
692                                         const MachineFunction &MF) {
693   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
694       ResourceUsage->getResourceInfo(&MF.getFunction());
695   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
696 
697   ProgInfo.NumArchVGPR = Info.NumVGPR;
698   ProgInfo.NumAccVGPR = Info.NumAGPR;
699   ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
700   ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
701   ProgInfo.TgSplit = STM.isTgSplitEnabled();
702   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
703   ProgInfo.ScratchSize = Info.PrivateSegmentSize;
704   ProgInfo.VCCUsed = Info.UsesVCC;
705   ProgInfo.FlatUsed = Info.UsesFlatScratch;
706   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
707 
708   const uint64_t MaxScratchPerWorkitem =
709       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
710   if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
711     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
712                                           ProgInfo.ScratchSize,
713                                           MaxScratchPerWorkitem, DS_Error);
714     MF.getFunction().getContext().diagnose(DiagStackSize);
715   }
716 
717   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
718 
719   // The calculations related to SGPR/VGPR blocks are
720   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
721   // unified.
722   unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
723       &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
724       getTargetStreamer()->getTargetID()->isXnackOnOrAny());
725 
726   // Check the addressable register limit before we add ExtraSGPRs.
727   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
728       !STM.hasSGPRInitBug()) {
729     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
730     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
731       // This can happen due to a compiler bug or when using inline asm.
732       LLVMContext &Ctx = MF.getFunction().getContext();
733       DiagnosticInfoResourceLimit Diag(
734           MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
735           MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
736       Ctx.diagnose(Diag);
737       ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
738     }
739   }
740 
741   // Account for extra SGPRs and VGPRs reserved for debugger use.
742   ProgInfo.NumSGPR += ExtraSGPRs;
743 
744   const Function &F = MF.getFunction();
745 
746   // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
747   // dispatch registers are function args.
748   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
749 
750   if (isShader(F.getCallingConv())) {
751     bool IsPixelShader =
752         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
753 
754     // Calculate the number of VGPR registers based on the SPI input registers
755     uint32_t InputEna = 0;
756     uint32_t InputAddr = 0;
757     unsigned LastEna = 0;
758 
759     if (IsPixelShader) {
760       // Note for IsPixelShader:
761       // By this stage, all enabled inputs are tagged in InputAddr as well.
762       // We will use InputAddr to determine whether the input counts against the
763       // vgpr total and only use the InputEnable to determine the last input
764       // that is relevant - if extra arguments are used, then we have to honour
765       // the InputAddr for any intermediate non-enabled inputs.
766       InputEna = MFI->getPSInputEnable();
767       InputAddr = MFI->getPSInputAddr();
768 
769       // We only need to consider input args up to the last used arg.
770       assert((InputEna || InputAddr) &&
771              "PSInputAddr and PSInputEnable should "
772              "never both be 0 for AMDGPU_PS shaders");
773       // There are some rare circumstances where InputAddr is non-zero and
774       // InputEna can be set to 0. In this case we default to setting LastEna
775       // to 1.
776       LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
777     }
778 
779     // FIXME: We should be using the number of registers determined during
780     // calling convention lowering to legalize the types.
781     const DataLayout &DL = F.getParent()->getDataLayout();
782     unsigned PSArgCount = 0;
783     unsigned IntermediateVGPR = 0;
784     for (auto &Arg : F.args()) {
785       unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
786       if (Arg.hasAttribute(Attribute::InReg)) {
787         WaveDispatchNumSGPR += NumRegs;
788       } else {
789         // If this is a PS shader and we're processing the PS Input args (first
790         // 16 VGPR), use the InputEna and InputAddr bits to define how many
791         // VGPRs are actually used.
792         // Any extra VGPR arguments are handled as normal arguments (and
793         // contribute to the VGPR count whether they're used or not).
794         if (IsPixelShader && PSArgCount < 16) {
795           if ((1 << PSArgCount) & InputAddr) {
796             if (PSArgCount < LastEna)
797               WaveDispatchNumVGPR += NumRegs;
798             else
799               IntermediateVGPR += NumRegs;
800           }
801           PSArgCount++;
802         } else {
803           // If there are extra arguments we have to include the allocation for
804           // the non-used (but enabled with InputAddr) input arguments
805           if (IntermediateVGPR) {
806             WaveDispatchNumVGPR += IntermediateVGPR;
807             IntermediateVGPR = 0;
808           }
809           WaveDispatchNumVGPR += NumRegs;
810         }
811       }
812     }
813     ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
814     ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
815     ProgInfo.NumVGPR =
816         Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
817   }
818 
819   // Adjust number of registers used to meet default/requested minimum/maximum
820   // number of waves per execution unit request.
821   ProgInfo.NumSGPRsForWavesPerEU = std::max(
822     std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
823   ProgInfo.NumVGPRsForWavesPerEU = std::max(
824     std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
825 
826   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
827       STM.hasSGPRInitBug()) {
828     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
829     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
830       // This can happen due to a compiler bug or when using inline asm to use
831       // the registers which are usually reserved for vcc etc.
832       LLVMContext &Ctx = MF.getFunction().getContext();
833       DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
834                                        ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
835                                        DS_Error, DK_ResourceLimit);
836       Ctx.diagnose(Diag);
837       ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
838       ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
839     }
840   }
841 
842   if (STM.hasSGPRInitBug()) {
843     ProgInfo.NumSGPR =
844         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
845     ProgInfo.NumSGPRsForWavesPerEU =
846         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
847   }
848 
849   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
850     LLVMContext &Ctx = MF.getFunction().getContext();
851     DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
852                                      MFI->getNumUserSGPRs(),
853                                      STM.getMaxNumUserSGPRs(), DS_Error);
854     Ctx.diagnose(Diag);
855   }
856 
857   if (MFI->getLDSSize() >
858       static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
859     LLVMContext &Ctx = MF.getFunction().getContext();
860     DiagnosticInfoResourceLimit Diag(
861         MF.getFunction(), "local memory", MFI->getLDSSize(),
862         STM.getAddressableLocalMemorySize(), DS_Error);
863     Ctx.diagnose(Diag);
864   }
865 
866   ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
867       &STM, ProgInfo.NumSGPRsForWavesPerEU);
868   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
869       &STM, ProgInfo.NumVGPRsForWavesPerEU);
870 
871   const SIModeRegisterDefaults Mode = MFI->getMode();
872 
873   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
874   // register.
875   ProgInfo.FloatMode = getFPMode(Mode);
876 
877   ProgInfo.IEEEMode = Mode.IEEE;
878 
879   // Make clamp modifier on NaN input returns 0.
880   ProgInfo.DX10Clamp = Mode.DX10Clamp;
881 
882   unsigned LDSAlignShift;
883   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
884     // LDS is allocated in 64 dword blocks.
885     LDSAlignShift = 8;
886   } else {
887     // LDS is allocated in 128 dword blocks.
888     LDSAlignShift = 9;
889   }
890 
891   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
892   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
893 
894   ProgInfo.LDSSize = MFI->getLDSSize();
895   ProgInfo.LDSBlocks =
896       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
897 
898   // Scratch is allocated in 64-dword or 256-dword blocks.
899   unsigned ScratchAlignShift =
900       STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
901   // We need to program the hardware with the amount of scratch memory that
902   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
903   // scratch memory used per thread.
904   ProgInfo.ScratchBlocks = divideCeil(
905       ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
906 
907   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
908     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
909     ProgInfo.MemOrdered = 1;
910   }
911 
912   // 0 = X, 1 = XY, 2 = XYZ
913   unsigned TIDIGCompCnt = 0;
914   if (MFI->hasWorkItemIDZ())
915     TIDIGCompCnt = 2;
916   else if (MFI->hasWorkItemIDY())
917     TIDIGCompCnt = 1;
918 
919   // The private segment wave byte offset is the last of the system SGPRs. We
920   // initially assumed it was allocated, and may have used it. It shouldn't harm
921   // anything to disable it if we know the stack isn't used here. We may still
922   // have emitted code reading it to initialize scratch, but if that's unused
923   // reading garbage should be OK.
924   ProgInfo.ScratchEnable =
925       ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
926   ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
927   // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
928   ProgInfo.TrapHandlerEnable =
929       STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
930   ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
931   ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
932   ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
933   ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
934   ProgInfo.TIdIGCompCount = TIDIGCompCnt;
935   ProgInfo.EXCPEnMSB = 0;
936   // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
937   ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
938   ProgInfo.EXCPEnable = 0;
939 
940   if (STM.hasGFX90AInsts()) {
941     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
942                     amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
943                     ProgInfo.AccumOffset);
944     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
945                     amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
946                     ProgInfo.TgSplit);
947   }
948 
949   ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
950                                             ProgInfo.NumSGPRsForWavesPerEU,
951                                             ProgInfo.NumVGPRsForWavesPerEU);
952 }
953 
954 static unsigned getRsrcReg(CallingConv::ID CallConv) {
955   switch (CallConv) {
956   default: [[fallthrough]];
957   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
958   case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
959   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
960   case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
961   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
962   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
963   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
964   }
965 }
966 
967 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
968                                          const SIProgramInfo &CurrentProgramInfo) {
969   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
970   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
971   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
972 
973   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
974     OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
975 
976     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
977 
978     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
979     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
980 
981     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
982     OutStreamer->emitInt32(
983         STM.getGeneration() >= AMDGPUSubtarget::GFX11
984             ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
985             : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
986 
987     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
988     // 0" comment but I don't see a corresponding field in the register spec.
989   } else {
990     OutStreamer->emitInt32(RsrcReg);
991     OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
992                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
993     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
994     OutStreamer->emitInt32(
995         STM.getGeneration() >= AMDGPUSubtarget::GFX11
996             ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
997             : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
998   }
999 
1000   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1001     OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1002     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1003                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1004                                 : CurrentProgramInfo.LDSBlocks;
1005     OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1006     OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1007     OutStreamer->emitInt32(MFI->getPSInputEnable());
1008     OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1009     OutStreamer->emitInt32(MFI->getPSInputAddr());
1010   }
1011 
1012   OutStreamer->emitInt32(R_SPILLED_SGPRS);
1013   OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1014   OutStreamer->emitInt32(R_SPILLED_VGPRS);
1015   OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1016 }
1017 
1018 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1019 // is AMDPAL.  It stores each compute/SPI register setting and other PAL
1020 // metadata items into the PALMD::Metadata, combining with any provided by the
1021 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1022 // is then written as a single block in the .note section.
1023 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1024        const SIProgramInfo &CurrentProgramInfo) {
1025   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1026   auto CC = MF.getFunction().getCallingConv();
1027   auto MD = getTargetStreamer()->getPALMetadata();
1028 
1029   MD->setEntryPoint(CC, MF.getFunction().getName());
1030   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1031 
1032   // Only set AGPRs for supported devices
1033   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1034   if (STM.hasMAIInsts()) {
1035     MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1036   }
1037 
1038   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1039   if (MD->getPALMajorVersion() < 3) {
1040     MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1041     if (AMDGPU::isCompute(CC)) {
1042       MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1043     } else {
1044       if (CurrentProgramInfo.ScratchBlocks > 0)
1045         MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1046     }
1047   } else {
1048     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1049     MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1050     MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1051     MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1052 
1053     if (AMDGPU::isCompute(CC)) {
1054       MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1055       MD->setHwStage(CC, ".trap_present",
1056                      (bool)CurrentProgramInfo.TrapHandlerEnable);
1057 
1058       // EXCPEnMSB?
1059       const unsigned LdsDwGranularity = 128;
1060       MD->setHwStage(CC, ".lds_size",
1061                      (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1062                                 sizeof(uint32_t)));
1063       MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1064     } else {
1065       MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1066     }
1067   }
1068 
1069   // ScratchSize is in bytes, 16 aligned.
1070   MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
1071   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1072     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1073                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1074                                 : CurrentProgramInfo.LDSBlocks;
1075     if (MD->getPALMajorVersion() < 3) {
1076       MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1077       MD->setSpiPsInputEna(MFI->getPSInputEnable());
1078       MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1079     } else {
1080       // Graphics registers
1081       const unsigned ExtraLdsDwGranularity =
1082           STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1083       MD->setGraphicsRegisters(
1084           ".ps_extra_lds_size",
1085           (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1086 
1087       // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1088       static StringLiteral const PsInputFields[] = {
1089           ".persp_sample_ena",    ".persp_center_ena",
1090           ".persp_centroid_ena",  ".persp_pull_model_ena",
1091           ".linear_sample_ena",   ".linear_center_ena",
1092           ".linear_centroid_ena", ".line_stipple_tex_ena",
1093           ".pos_x_float_ena",     ".pos_y_float_ena",
1094           ".pos_z_float_ena",     ".pos_w_float_ena",
1095           ".front_face_ena",      ".ancillary_ena",
1096           ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1097       unsigned PSInputEna = MFI->getPSInputEnable();
1098       unsigned PSInputAddr = MFI->getPSInputAddr();
1099       for (auto [Idx, Field] : enumerate(PsInputFields)) {
1100         MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1101                                  (bool)((PSInputEna >> Idx) & 1));
1102         MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1103                                  (bool)((PSInputAddr >> Idx) & 1));
1104       }
1105     }
1106   }
1107 
1108   // For version 3 and above the wave front size is already set in the metadata
1109   if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1110     MD->setWave32(MF.getFunction().getCallingConv());
1111 }
1112 
1113 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1114   auto *MD = getTargetStreamer()->getPALMetadata();
1115   const MachineFrameInfo &MFI = MF.getFrameInfo();
1116   MD->setFunctionScratchSize(MF, MFI.getStackSize());
1117 
1118   // Set compute registers
1119   MD->setRsrc1(CallingConv::AMDGPU_CS,
1120                CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1121   MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
1122 
1123   // Set optional info
1124   MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
1125   MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1126   MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1127 }
1128 
1129 // This is supposed to be log2(Size)
1130 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1131   switch (Size) {
1132   case 4:
1133     return AMD_ELEMENT_4_BYTES;
1134   case 8:
1135     return AMD_ELEMENT_8_BYTES;
1136   case 16:
1137     return AMD_ELEMENT_16_BYTES;
1138   default:
1139     llvm_unreachable("invalid private_element_size");
1140   }
1141 }
1142 
1143 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1144                                         const SIProgramInfo &CurrentProgramInfo,
1145                                         const MachineFunction &MF) const {
1146   const Function &F = MF.getFunction();
1147   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1148          F.getCallingConv() == CallingConv::SPIR_KERNEL);
1149 
1150   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1151   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1152 
1153   AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1154 
1155   Out.compute_pgm_resource_registers =
1156       CurrentProgramInfo.getComputePGMRSrc1() |
1157       (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1158   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1159 
1160   if (CurrentProgramInfo.DynamicCallStack)
1161     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1162 
1163   AMD_HSA_BITS_SET(Out.code_properties,
1164                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1165                    getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1166 
1167   if (MFI->hasPrivateSegmentBuffer()) {
1168     Out.code_properties |=
1169       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1170   }
1171 
1172   if (MFI->hasDispatchPtr())
1173     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1174 
1175   if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1176     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1177 
1178   if (MFI->hasKernargSegmentPtr())
1179     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1180 
1181   if (MFI->hasDispatchID())
1182     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1183 
1184   if (MFI->hasFlatScratchInit())
1185     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1186 
1187   if (MFI->hasDispatchPtr())
1188     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1189 
1190   if (STM.isXNACKEnabled())
1191     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1192 
1193   Align MaxKernArgAlign;
1194   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1195   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1196   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1197   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1198   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1199 
1200   // kernarg_segment_alignment is specified as log of the alignment.
1201   // The minimum alignment is 16.
1202   // FIXME: The metadata treats the minimum as 4?
1203   Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1204 }
1205 
1206 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1207                                        const char *ExtraCode, raw_ostream &O) {
1208   // First try the generic code, which knows about modifiers like 'c' and 'n'.
1209   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1210     return false;
1211 
1212   if (ExtraCode && ExtraCode[0]) {
1213     if (ExtraCode[1] != 0)
1214       return true; // Unknown modifier.
1215 
1216     switch (ExtraCode[0]) {
1217     case 'r':
1218       break;
1219     default:
1220       return true;
1221     }
1222   }
1223 
1224   // TODO: Should be able to support other operand types like globals.
1225   const MachineOperand &MO = MI->getOperand(OpNo);
1226   if (MO.isReg()) {
1227     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1228                                        *MF->getSubtarget().getRegisterInfo());
1229     return false;
1230   } else if (MO.isImm()) {
1231     int64_t Val = MO.getImm();
1232     if (AMDGPU::isInlinableIntLiteral(Val)) {
1233       O << Val;
1234     } else if (isUInt<16>(Val)) {
1235       O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1236     } else if (isUInt<32>(Val)) {
1237       O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1238     } else {
1239       O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1240     }
1241     return false;
1242   }
1243   return true;
1244 }
1245 
/// Declare the analyses this pass depends on: AMDGPUResourceUsageAnalysis is
/// registered as both required and preserved, after which the base
/// AsmPrinter adds its own requirements to \p AU.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AsmPrinter::getAnalysisUsage(AU);
}
1251 
1252 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1253     const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1254     bool isModuleEntryFunction, bool hasMAIInsts) {
1255   if (!ORE)
1256     return;
1257 
1258   const char *Name = "kernel-resource-usage";
1259   const char *Indent = "    ";
1260 
1261   // If the remark is not specifically enabled, do not output to yaml
1262   LLVMContext &Ctx = MF.getFunction().getContext();
1263   if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1264     return;
1265 
1266   auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1267                                      StringRef RemarkLabel, auto Argument) {
1268     // Add an indent for every line besides the line with the kernel name. This
1269     // makes it easier to tell which resource usage go with which kernel since
1270     // the kernel name will always be displayed first.
1271     std::string LabelStr = RemarkLabel.str() + ": ";
1272     if (!RemarkName.equals("FunctionName"))
1273       LabelStr = Indent + LabelStr;
1274 
1275     ORE->emit([&]() {
1276       return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1277                                                MF.getFunction().getSubprogram(),
1278                                                &MF.front())
1279              << LabelStr << ore::NV(RemarkName, Argument);
1280     });
1281   };
1282 
1283   // FIXME: Formatting here is pretty nasty because clang does not accept
1284   // newlines from diagnostics. This forces us to emit multiple diagnostic
1285   // remarks to simulate newlines. If and when clang does accept newlines, this
1286   // formatting should be aggregated into one remark with newlines to avoid
1287   // printing multiple diagnostic location and diag opts.
1288   EmitResourceUsageRemark("FunctionName", "Function Name",
1289                           MF.getFunction().getName());
1290   EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
1291   EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1292   if (hasMAIInsts)
1293     EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1294   EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1295                           CurrentProgramInfo.ScratchSize);
1296   EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1297                           CurrentProgramInfo.Occupancy);
1298   EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1299                           CurrentProgramInfo.SGPRSpill);
1300   EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1301                           CurrentProgramInfo.VGPRSpill);
1302   if (isModuleEntryFunction)
1303     EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1304                             CurrentProgramInfo.LDSSize);
1305 }
1306