xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
13 /// an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17 
18 #include "AMDGPUAsmPrinter.h"
19 #include "AMDGPU.h"
20 #include "AMDGPUHSAMetadataStreamer.h"
21 #include "AMDGPUResourceUsageAnalysis.h"
22 #include "AMDKernelCodeT.h"
23 #include "GCNSubtarget.h"
24 #include "MCTargetDesc/AMDGPUInstPrinter.h"
25 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
26 #include "R600AsmPrinter.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "TargetInfo/AMDGPUTargetInfo.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
31 #include "llvm/BinaryFormat/ELF.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/MC/MCAssembler.h"
36 #include "llvm/MC/MCContext.h"
37 #include "llvm/MC/MCSectionELF.h"
38 #include "llvm/MC/MCStreamer.h"
39 #include "llvm/MC/TargetRegistry.h"
40 #include "llvm/Support/AMDHSAKernelDescriptor.h"
41 #include "llvm/Target/TargetLoweringObjectFile.h"
42 #include "llvm/Target/TargetMachine.h"
43 #include "llvm/TargetParser/TargetParser.h"
44 
45 using namespace llvm;
46 using namespace llvm::AMDGPU;
47 
48 // This should get the default rounding mode from the kernel. We just set the
49 // default here, but this could change if the OpenCL rounding mode pragmas are
50 // used.
51 //
52 // The denormal mode here should match what is reported by the OpenCL runtime
53 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55 //
56 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57 // precision, and leaves single precision to flush all and does not report
58 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59 // CL_FP_DENORM for both.
60 //
61 // FIXME: It seems some instructions do not support single precision denormals
62 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63 // and sin_f32, cos_f32 on most parts).
64 
65 // We want to use these instructions, and using fp32 denormals also causes
66 // instructions to run at the double precision rate for the device so it's
67 // probably best to just report no single precision denormals.
68 static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
69   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
70          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
71          FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72          FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73 }
74 
75 static AsmPrinter *
76 createAMDGPUAsmPrinterPass(TargetMachine &tm,
77                            std::unique_ptr<MCStreamer> &&Streamer) {
78   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79 }
80 
81 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
82   TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
83                                      llvm::createR600AsmPrinterPass);
84   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
85                                      createAMDGPUAsmPrinterPass);
86 }
87 
88 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
89                                    std::unique_ptr<MCStreamer> Streamer)
90     : AsmPrinter(TM, std::move(Streamer)) {
91   assert(OutStreamer && "AsmPrinter constructed without streamer");
92 }
93 
94 StringRef AMDGPUAsmPrinter::getPassName() const {
95   return "AMDGPU Assembly Printer";
96 }
97 
98 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
99   return TM.getMCSubtargetInfo();
100 }
101 
102 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
103   if (!OutStreamer)
104     return nullptr;
105   return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106 }
107 
108 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
109   IsTargetStreamerInitialized = false;
110 }
111 
112 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
113   IsTargetStreamerInitialized = true;
114 
115   // TODO: Which one is called first, emitStartOfAsmFile or
116   // emitFunctionBodyStart?
117   if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118     initializeTargetID(M);
119 
120   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
121       TM.getTargetTriple().getOS() != Triple::AMDPAL)
122     return;
123 
124   getTargetStreamer()->EmitDirectiveAMDGCNTarget();
125 
126   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
127     getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
128         CodeObjectVersion);
129     HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
130   }
131 
132   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
133     getTargetStreamer()->getPALMetadata()->readFromIR(M);
134 }
135 
136 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
137   // Init target streamer if it has not yet happened
138   if (!IsTargetStreamerInitialized)
139     initTargetStreamer(M);
140 
141   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
142     getTargetStreamer()->EmitISAVersion();
143 
144   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
145   // Emit HSA Metadata (NT_AMD_HSA_METADATA).
146   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
147     HSAMetadataStream->end();
148     bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
149     (void)Success;
150     assert(Success && "Malformed HSA Metadata");
151   }
152 }
153 
154 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
155   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
156   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
157   const Function &F = MF->getFunction();
158 
159   // TODO: Which one is called first, emitStartOfAsmFile or
160   // emitFunctionBodyStart?
161   if (!getTargetStreamer()->getTargetID())
162     initializeTargetID(*F.getParent());
163 
164   const auto &FunctionTargetID = STM.getTargetID();
165   // Make sure function's xnack settings are compatible with module's
166   // xnack settings.
167   if (FunctionTargetID.isXnackSupported() &&
168       FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
169       FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
170     OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
171                            "' function does not match module xnack setting");
172     return;
173   }
174   // Make sure function's sramecc settings are compatible with module's
175   // sramecc settings.
176   if (FunctionTargetID.isSramEccSupported() &&
177       FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
178       FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
179     OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
180                            "' function does not match module sramecc setting");
181     return;
182   }
183 
184   if (!MFI.isEntryFunction())
185     return;
186 
187   if (STM.isMesaKernel(F) &&
188       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
189        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
190     amd_kernel_code_t KernelCode;
191     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
192     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
193   }
194 
195   if (STM.isAmdHsaOS())
196     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
197 
198   if (MFI.getNumKernargPreloadedSGPRs() > 0) {
199     assert(AMDGPU::hasKernargPreload(STM));
200     getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
201   }
202 }
203 
204 void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
205   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
206   if (!MFI.isEntryFunction())
207     return;
208 
209   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
210     return;
211 
212   auto &Streamer = getTargetStreamer()->getStreamer();
213   auto &Context = Streamer.getContext();
214   auto &ObjectFileInfo = *Context.getObjectFileInfo();
215   auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
216 
217   Streamer.pushSection();
218   Streamer.switchSection(&ReadOnlySection);
219 
220   // CP microcode requires the kernel descriptor to be allocated on 64 byte
221   // alignment.
222   Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
223   ReadOnlySection.ensureMinAlignment(Align(64));
224 
225   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
226 
227   SmallString<128> KernelName;
228   getNameWithPrefix(KernelName, &MF->getFunction());
229   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
230       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
231       CurrentProgramInfo.NumVGPRsForWavesPerEU,
232       CurrentProgramInfo.NumSGPRsForWavesPerEU -
233           IsaInfo::getNumExtraSGPRs(
234               &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
235               getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
236       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
237 
238   Streamer.popSection();
239 }
240 
241 void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
242   Register RegNo = MI->getOperand(0).getReg();
243 
244   SmallString<128> Str;
245   raw_svector_ostream OS(Str);
246   OS << "implicit-def: "
247      << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
248 
249   if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
250     OS << " : SGPR spill to VGPR lane";
251 
252   OutStreamer->AddComment(OS.str());
253   OutStreamer->addBlankLine();
254 }
255 
256 void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
257   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
258     AsmPrinter::emitFunctionEntryLabel();
259     return;
260   }
261 
262   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
263   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
264   if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
265     SmallString<128> SymbolName;
266     getNameWithPrefix(SymbolName, &MF->getFunction()),
267     getTargetStreamer()->EmitAMDGPUSymbolType(
268         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
269   }
270   if (DumpCodeInstEmitter) {
271     // Disassemble function name label to text.
272     DisasmLines.push_back(MF->getName().str() + ":");
273     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
274     HexLines.push_back("");
275   }
276 
277   AsmPrinter::emitFunctionEntryLabel();
278 }
279 
280 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
281   if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
282     // Write a line for the basic block label if it is not only fallthrough.
283     DisasmLines.push_back(
284         (Twine("BB") + Twine(getFunctionNumber())
285          + "_" + Twine(MBB.getNumber()) + ":").str());
286     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
287     HexLines.push_back("");
288   }
289   AsmPrinter::emitBasicBlockStart(MBB);
290 }
291 
292 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
293   if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
294     if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
295       OutContext.reportError({},
296                              Twine(GV->getName()) +
297                                  ": unsupported initializer for address space");
298       return;
299     }
300 
301     // LDS variables aren't emitted in HSA or PAL yet.
302     const Triple::OSType OS = TM.getTargetTriple().getOS();
303     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
304       return;
305 
306     MCSymbol *GVSym = getSymbol(GV);
307 
308     GVSym->redefineIfPossible();
309     if (GVSym->isDefined() || GVSym->isVariable())
310       report_fatal_error("symbol '" + Twine(GVSym->getName()) +
311                          "' is already defined");
312 
313     const DataLayout &DL = GV->getParent()->getDataLayout();
314     uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
315     Align Alignment = GV->getAlign().value_or(Align(4));
316 
317     emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
318     emitLinkage(GV, GVSym);
319     auto TS = getTargetStreamer();
320     TS->emitAMDGPULDS(GVSym, Size, Alignment);
321     return;
322   }
323 
324   AsmPrinter::emitGlobalVariable(GV);
325 }
326 
327 bool AMDGPUAsmPrinter::doInitialization(Module &M) {
328   CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
329 
330   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
331     switch (CodeObjectVersion) {
332     case AMDGPU::AMDHSA_COV4:
333       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
334       break;
335     case AMDGPU::AMDHSA_COV5:
336       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
337       break;
338     default:
339       report_fatal_error("Unexpected code object version");
340     }
341   }
342   return AsmPrinter::doInitialization(M);
343 }
344 
345 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
346   // Pad with s_code_end to help tools and guard against instruction prefetch
347   // causing stale data in caches. Arguably this should be done by the linker,
348   // which is why this isn't done for Mesa.
349   const MCSubtargetInfo &STI = *getGlobalSTI();
350   if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
351       (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
352        STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
353     OutStreamer->switchSection(getObjFileLowering().getTextSection());
354     getTargetStreamer()->EmitCodeEnd(STI);
355   }
356 
357   return AsmPrinter::doFinalization(M);
358 }
359 
360 // Print comments that apply to both callable functions and entry points.
361 void AMDGPUAsmPrinter::emitCommonFunctionComments(
362     uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
363     uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
364     const AMDGPUMachineFunction *MFI) {
365   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
366   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
367   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
368   if (NumAGPR) {
369     OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
370     OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
371                                 false);
372   }
373   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
374   OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
375                               false);
376 }
377 
378 uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
379     const MachineFunction &MF) const {
380   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
381   uint16_t KernelCodeProperties = 0;
382   const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
383 
384   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
385     KernelCodeProperties |=
386         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
387   }
388   if (UserSGPRInfo.hasDispatchPtr()) {
389     KernelCodeProperties |=
390         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
391   }
392   if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
393     KernelCodeProperties |=
394         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
395   }
396   if (UserSGPRInfo.hasKernargSegmentPtr()) {
397     KernelCodeProperties |=
398         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
399   }
400   if (UserSGPRInfo.hasDispatchID()) {
401     KernelCodeProperties |=
402         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
403   }
404   if (UserSGPRInfo.hasFlatScratchInit()) {
405     KernelCodeProperties |=
406         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
407   }
408   if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
409     KernelCodeProperties |=
410         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
411   }
412 
413   if (CurrentProgramInfo.DynamicCallStack &&
414       CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
415     KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
416 
417   return KernelCodeProperties;
418 }
419 
420 amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
421     const MachineFunction &MF,
422     const SIProgramInfo &PI) const {
423   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
424   const Function &F = MF.getFunction();
425   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
426 
427   amdhsa::kernel_descriptor_t KernelDescriptor;
428   memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
429 
430   assert(isUInt<32>(PI.ScratchSize));
431   assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
432   assert(isUInt<32>(PI.getComputePGMRSrc2()));
433 
434   KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
435   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
436 
437   Align MaxKernArgAlign;
438   KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
439 
440   KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM);
441   KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
442   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
443 
444   assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
445   if (STM.hasGFX90AInsts())
446     KernelDescriptor.compute_pgm_rsrc3 =
447       CurrentProgramInfo.ComputePGMRSrc3GFX90A;
448 
449   if (AMDGPU::hasKernargPreload(STM))
450     KernelDescriptor.kernarg_preload =
451         static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
452 
453   return KernelDescriptor;
454 }
455 
456 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
457   // Init target streamer lazily on the first function so that previous passes
458   // can set metadata.
459   if (!IsTargetStreamerInitialized)
460     initTargetStreamer(*MF.getFunction().getParent());
461 
462   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
463   CurrentProgramInfo = SIProgramInfo();
464 
465   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
466 
467   // The starting address of all shader programs must be 256 bytes aligned.
468   // Regular functions just need the basic required instruction alignment.
469   MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
470 
471   SetupMachineFunction(MF);
472 
473   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
474   MCContext &Context = getObjFileLowering().getContext();
475   // FIXME: This should be an explicit check for Mesa.
476   if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
477     MCSectionELF *ConfigSection =
478         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
479     OutStreamer->switchSection(ConfigSection);
480   }
481 
482   if (MFI->isModuleEntryFunction()) {
483     getSIProgramInfo(CurrentProgramInfo, MF);
484   }
485 
486   if (STM.isAmdPalOS()) {
487     if (MFI->isEntryFunction())
488       EmitPALMetadata(MF, CurrentProgramInfo);
489     else if (MFI->isModuleEntryFunction())
490       emitPALFunctionMetadata(MF);
491   } else if (!STM.isAmdHsaOS()) {
492     EmitProgramInfoSI(MF, CurrentProgramInfo);
493   }
494 
495   DumpCodeInstEmitter = nullptr;
496   if (STM.dumpCode()) {
497     // For -dumpcode, get the assembler out of the streamer, even if it does
498     // not really want to let us have it. This only works with -filetype=obj.
499     bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
500     OutStreamer->setUseAssemblerInfoForParsing(true);
501     MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
502     OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
503     if (Assembler)
504       DumpCodeInstEmitter = Assembler->getEmitterPtr();
505   }
506 
507   DisasmLines.clear();
508   HexLines.clear();
509   DisasmLineMaxLen = 0;
510 
511   emitFunctionBody();
512 
513   emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
514                            STM.hasMAIInsts());
515 
516   if (isVerbose()) {
517     MCSectionELF *CommentSection =
518         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
519     OutStreamer->switchSection(CommentSection);
520 
521     if (!MFI->isEntryFunction()) {
522       OutStreamer->emitRawComment(" Function info:", false);
523       const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
524           ResourceUsage->getResourceInfo(&MF.getFunction());
525       emitCommonFunctionComments(
526           Info.NumVGPR,
527           STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
528           Info.getTotalNumVGPRs(STM),
529           Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
530           Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
531       return false;
532     }
533 
534     OutStreamer->emitRawComment(" Kernel info:", false);
535     emitCommonFunctionComments(
536         CurrentProgramInfo.NumArchVGPR,
537         STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
538                           : std::optional<uint32_t>(),
539         CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
540         CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
541 
542     OutStreamer->emitRawComment(
543       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
544     OutStreamer->emitRawComment(
545       " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
546     OutStreamer->emitRawComment(
547       " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
548       " bytes/workgroup (compile time only)", false);
549 
550     OutStreamer->emitRawComment(
551       " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
552     OutStreamer->emitRawComment(
553       " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
554 
555     OutStreamer->emitRawComment(
556       " NumSGPRsForWavesPerEU: " +
557       Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
558     OutStreamer->emitRawComment(
559       " NumVGPRsForWavesPerEU: " +
560       Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
561 
562     if (STM.hasGFX90AInsts())
563       OutStreamer->emitRawComment(
564         " AccumOffset: " +
565         Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
566 
567     OutStreamer->emitRawComment(
568       " Occupancy: " +
569       Twine(CurrentProgramInfo.Occupancy), false);
570 
571     OutStreamer->emitRawComment(
572       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
573 
574     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
575                                     Twine(CurrentProgramInfo.ScratchEnable),
576                                 false);
577     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
578                                     Twine(CurrentProgramInfo.UserSGPR),
579                                 false);
580     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
581                                     Twine(CurrentProgramInfo.TrapHandlerEnable),
582                                 false);
583     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
584                                     Twine(CurrentProgramInfo.TGIdXEnable),
585                                 false);
586     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
587                                     Twine(CurrentProgramInfo.TGIdYEnable),
588                                 false);
589     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
590                                     Twine(CurrentProgramInfo.TGIdZEnable),
591                                 false);
592     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
593                                     Twine(CurrentProgramInfo.TIdIGCompCount),
594                                 false);
595 
596     assert(STM.hasGFX90AInsts() ||
597            CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
598     if (STM.hasGFX90AInsts()) {
599       OutStreamer->emitRawComment(
600         " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
601         Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
602                                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
603                                false);
604       OutStreamer->emitRawComment(
605         " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
606         Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
607                                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
608                                false);
609     }
610   }
611 
612   if (DumpCodeInstEmitter) {
613 
614     OutStreamer->switchSection(
615         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
616 
617     for (size_t i = 0; i < DisasmLines.size(); ++i) {
618       std::string Comment = "\n";
619       if (!HexLines[i].empty()) {
620         Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
621         Comment += " ; " + HexLines[i] + "\n";
622       }
623 
624       OutStreamer->emitBytes(StringRef(DisasmLines[i]));
625       OutStreamer->emitBytes(StringRef(Comment));
626     }
627   }
628 
629   return false;
630 }
631 
632 // TODO: Fold this into emitFunctionBodyStart.
633 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
634   // In the beginning all features are either 'Any' or 'NotSupported',
635   // depending on global target features. This will cover empty modules.
636   getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
637                                           getGlobalSTI()->getFeatureString());
638 
639   // If module is empty, we are done.
640   if (M.empty())
641     return;
642 
643   // If module is not empty, need to find first 'Off' or 'On' feature
644   // setting per feature from functions in module.
645   for (auto &F : M) {
646     auto &TSTargetID = getTargetStreamer()->getTargetID();
647     if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
648         (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
649       break;
650 
651     const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
652     const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
653     if (TSTargetID->isXnackSupported())
654       if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
655         TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
656     if (TSTargetID->isSramEccSupported())
657       if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
658         TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
659   }
660 }
661 
662 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
663   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
664   const SIInstrInfo *TII = STM.getInstrInfo();
665 
666   uint64_t CodeSize = 0;
667 
668   for (const MachineBasicBlock &MBB : MF) {
669     for (const MachineInstr &MI : MBB) {
670       // TODO: CodeSize should account for multiple functions.
671 
672       // TODO: Should we count size of debug info?
673       if (MI.isDebugInstr())
674         continue;
675 
676       CodeSize += TII->getInstSizeInBytes(MI);
677     }
678   }
679 
680   return CodeSize;
681 }
682 
683 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
684                                         const MachineFunction &MF) {
685   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
686       ResourceUsage->getResourceInfo(&MF.getFunction());
687   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
688 
689   ProgInfo.NumArchVGPR = Info.NumVGPR;
690   ProgInfo.NumAccVGPR = Info.NumAGPR;
691   ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
692   ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
693   ProgInfo.TgSplit = STM.isTgSplitEnabled();
694   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
695   ProgInfo.ScratchSize = Info.PrivateSegmentSize;
696   ProgInfo.VCCUsed = Info.UsesVCC;
697   ProgInfo.FlatUsed = Info.UsesFlatScratch;
698   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
699 
700   const uint64_t MaxScratchPerWorkitem =
701       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
702   if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
703     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
704                                           ProgInfo.ScratchSize,
705                                           MaxScratchPerWorkitem, DS_Error);
706     MF.getFunction().getContext().diagnose(DiagStackSize);
707   }
708 
709   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
710 
711   // The calculations related to SGPR/VGPR blocks are
712   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
713   // unified.
714   unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
715       &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
716       getTargetStreamer()->getTargetID()->isXnackOnOrAny());
717 
718   // Check the addressable register limit before we add ExtraSGPRs.
719   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
720       !STM.hasSGPRInitBug()) {
721     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
722     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
723       // This can happen due to a compiler bug or when using inline asm.
724       LLVMContext &Ctx = MF.getFunction().getContext();
725       DiagnosticInfoResourceLimit Diag(
726           MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
727           MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
728       Ctx.diagnose(Diag);
729       ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
730     }
731   }
732 
733   // Account for extra SGPRs and VGPRs reserved for debugger use.
734   ProgInfo.NumSGPR += ExtraSGPRs;
735 
736   const Function &F = MF.getFunction();
737 
738   // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
739   // dispatch registers are function args.
740   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
741 
742   if (isShader(F.getCallingConv())) {
743     bool IsPixelShader =
744         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
745 
746     // Calculate the number of VGPR registers based on the SPI input registers
747     uint32_t InputEna = 0;
748     uint32_t InputAddr = 0;
749     unsigned LastEna = 0;
750 
751     if (IsPixelShader) {
752       // Note for IsPixelShader:
753       // By this stage, all enabled inputs are tagged in InputAddr as well.
754       // We will use InputAddr to determine whether the input counts against the
755       // vgpr total and only use the InputEnable to determine the last input
756       // that is relevant - if extra arguments are used, then we have to honour
757       // the InputAddr for any intermediate non-enabled inputs.
758       InputEna = MFI->getPSInputEnable();
759       InputAddr = MFI->getPSInputAddr();
760 
761       // We only need to consider input args up to the last used arg.
762       assert((InputEna || InputAddr) &&
763              "PSInputAddr and PSInputEnable should "
764              "never both be 0 for AMDGPU_PS shaders");
765       // There are some rare circumstances where InputAddr is non-zero and
766       // InputEna can be set to 0. In this case we default to setting LastEna
767       // to 1.
768       LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
769     }
770 
771     // FIXME: We should be using the number of registers determined during
772     // calling convention lowering to legalize the types.
773     const DataLayout &DL = F.getParent()->getDataLayout();
774     unsigned PSArgCount = 0;
775     unsigned IntermediateVGPR = 0;
776     for (auto &Arg : F.args()) {
777       unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
778       if (Arg.hasAttribute(Attribute::InReg)) {
779         WaveDispatchNumSGPR += NumRegs;
780       } else {
781         // If this is a PS shader and we're processing the PS Input args (first
782         // 16 VGPR), use the InputEna and InputAddr bits to define how many
783         // VGPRs are actually used.
784         // Any extra VGPR arguments are handled as normal arguments (and
785         // contribute to the VGPR count whether they're used or not).
786         if (IsPixelShader && PSArgCount < 16) {
787           if ((1 << PSArgCount) & InputAddr) {
788             if (PSArgCount < LastEna)
789               WaveDispatchNumVGPR += NumRegs;
790             else
791               IntermediateVGPR += NumRegs;
792           }
793           PSArgCount++;
794         } else {
795           // If there are extra arguments we have to include the allocation for
796           // the non-used (but enabled with InputAddr) input arguments
797           if (IntermediateVGPR) {
798             WaveDispatchNumVGPR += IntermediateVGPR;
799             IntermediateVGPR = 0;
800           }
801           WaveDispatchNumVGPR += NumRegs;
802         }
803       }
804     }
805     ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
806     ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
807     ProgInfo.NumVGPR =
808         Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
809   }
810 
811   // Adjust number of registers used to meet default/requested minimum/maximum
812   // number of waves per execution unit request.
813   ProgInfo.NumSGPRsForWavesPerEU = std::max(
814     std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
815   ProgInfo.NumVGPRsForWavesPerEU = std::max(
816     std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
817 
818   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
819       STM.hasSGPRInitBug()) {
820     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
821     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
822       // This can happen due to a compiler bug or when using inline asm to use
823       // the registers which are usually reserved for vcc etc.
824       LLVMContext &Ctx = MF.getFunction().getContext();
825       DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
826                                        ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
827                                        DS_Error, DK_ResourceLimit);
828       Ctx.diagnose(Diag);
829       ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
830       ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
831     }
832   }
833 
834   if (STM.hasSGPRInitBug()) {
835     ProgInfo.NumSGPR =
836         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
837     ProgInfo.NumSGPRsForWavesPerEU =
838         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
839   }
840 
841   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
842     LLVMContext &Ctx = MF.getFunction().getContext();
843     DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
844                                      MFI->getNumUserSGPRs(),
845                                      STM.getMaxNumUserSGPRs(), DS_Error);
846     Ctx.diagnose(Diag);
847   }
848 
849   if (MFI->getLDSSize() >
850       static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
851     LLVMContext &Ctx = MF.getFunction().getContext();
852     DiagnosticInfoResourceLimit Diag(
853         MF.getFunction(), "local memory", MFI->getLDSSize(),
854         STM.getAddressableLocalMemorySize(), DS_Error);
855     Ctx.diagnose(Diag);
856   }
857 
858   ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
859       &STM, ProgInfo.NumSGPRsForWavesPerEU);
860   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
861       &STM, ProgInfo.NumVGPRsForWavesPerEU);
862 
863   const SIModeRegisterDefaults Mode = MFI->getMode();
864 
865   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
866   // register.
867   ProgInfo.FloatMode = getFPMode(Mode);
868 
869   ProgInfo.IEEEMode = Mode.IEEE;
870 
871   // Make clamp modifier on NaN input returns 0.
872   ProgInfo.DX10Clamp = Mode.DX10Clamp;
873 
874   unsigned LDSAlignShift;
875   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
876     // LDS is allocated in 64 dword blocks.
877     LDSAlignShift = 8;
878   } else {
879     // LDS is allocated in 128 dword blocks.
880     LDSAlignShift = 9;
881   }
882 
883   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
884   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
885 
886   ProgInfo.LDSSize = MFI->getLDSSize();
887   ProgInfo.LDSBlocks =
888       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
889 
890   // Scratch is allocated in 64-dword or 256-dword blocks.
891   unsigned ScratchAlignShift =
892       STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
893   // We need to program the hardware with the amount of scratch memory that
894   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
895   // scratch memory used per thread.
896   ProgInfo.ScratchBlocks = divideCeil(
897       ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
898 
899   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
900     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
901     ProgInfo.MemOrdered = 1;
902   }
903 
904   // 0 = X, 1 = XY, 2 = XYZ
905   unsigned TIDIGCompCnt = 0;
906   if (MFI->hasWorkItemIDZ())
907     TIDIGCompCnt = 2;
908   else if (MFI->hasWorkItemIDY())
909     TIDIGCompCnt = 1;
910 
911   // The private segment wave byte offset is the last of the system SGPRs. We
912   // initially assumed it was allocated, and may have used it. It shouldn't harm
913   // anything to disable it if we know the stack isn't used here. We may still
914   // have emitted code reading it to initialize scratch, but if that's unused
915   // reading garbage should be OK.
916   ProgInfo.ScratchEnable =
917       ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
918   ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
919   // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
920   ProgInfo.TrapHandlerEnable =
921       STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
922   ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
923   ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
924   ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
925   ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
926   ProgInfo.TIdIGCompCount = TIDIGCompCnt;
927   ProgInfo.EXCPEnMSB = 0;
928   // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
929   ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
930   ProgInfo.EXCPEnable = 0;
931 
932   if (STM.hasGFX90AInsts()) {
933     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
934                     amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
935                     ProgInfo.AccumOffset);
936     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
937                     amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
938                     ProgInfo.TgSplit);
939   }
940 
941   ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
942                                             ProgInfo.NumSGPRsForWavesPerEU,
943                                             ProgInfo.NumVGPRsForWavesPerEU);
944   const auto [MinWEU, MaxWEU] =
945       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
946   if (ProgInfo.Occupancy < MinWEU) {
947     DiagnosticInfoOptimizationFailure Diag(
948         F, F.getSubprogram(),
949         "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
950         "'" +
951             F.getName() + "': desired occupancy was " + Twine(MinWEU) +
952             ", final occupancy is " + Twine(ProgInfo.Occupancy));
953     F.getContext().diagnose(Diag);
954   }
955 }
956 
957 static unsigned getRsrcReg(CallingConv::ID CallConv) {
958   switch (CallConv) {
959   default: [[fallthrough]];
960   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
961   case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
962   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
963   case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
964   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
965   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
966   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
967   }
968 }
969 
970 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
971                                          const SIProgramInfo &CurrentProgramInfo) {
972   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
973   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
974   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
975 
976   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
977     OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
978 
979     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
980 
981     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
982     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
983 
984     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
985     OutStreamer->emitInt32(
986         STM.getGeneration() >= AMDGPUSubtarget::GFX12
987             ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
988         : STM.getGeneration() == AMDGPUSubtarget::GFX11
989             ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
990             : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
991 
992     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
993     // 0" comment but I don't see a corresponding field in the register spec.
994   } else {
995     OutStreamer->emitInt32(RsrcReg);
996     OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
997                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
998     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
999     OutStreamer->emitInt32(
1000         STM.getGeneration() >= AMDGPUSubtarget::GFX12
1001             ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
1002         : STM.getGeneration() == AMDGPUSubtarget::GFX11
1003             ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
1004             : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1005   }
1006 
1007   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1008     OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1009     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1010                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1011                                 : CurrentProgramInfo.LDSBlocks;
1012     OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1013     OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1014     OutStreamer->emitInt32(MFI->getPSInputEnable());
1015     OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1016     OutStreamer->emitInt32(MFI->getPSInputAddr());
1017   }
1018 
1019   OutStreamer->emitInt32(R_SPILLED_SGPRS);
1020   OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1021   OutStreamer->emitInt32(R_SPILLED_VGPRS);
1022   OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1023 }
1024 
1025 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1026 // is AMDPAL.  It stores each compute/SPI register setting and other PAL
1027 // metadata items into the PALMD::Metadata, combining with any provided by the
1028 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1029 // is then written as a single block in the .note section.
1030 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1031        const SIProgramInfo &CurrentProgramInfo) {
1032   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1033   auto CC = MF.getFunction().getCallingConv();
1034   auto MD = getTargetStreamer()->getPALMetadata();
1035 
1036   MD->setEntryPoint(CC, MF.getFunction().getName());
1037   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1038 
1039   // Only set AGPRs for supported devices
1040   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1041   if (STM.hasMAIInsts()) {
1042     MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1043   }
1044 
1045   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1046   if (MD->getPALMajorVersion() < 3) {
1047     MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
1048     if (AMDGPU::isCompute(CC)) {
1049       MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1050     } else {
1051       if (CurrentProgramInfo.ScratchBlocks > 0)
1052         MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1053     }
1054   } else {
1055     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1056     MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1057     MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1058     MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1059 
1060     if (AMDGPU::isCompute(CC)) {
1061       MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1062       MD->setHwStage(CC, ".trap_present",
1063                      (bool)CurrentProgramInfo.TrapHandlerEnable);
1064 
1065       // EXCPEnMSB?
1066       const unsigned LdsDwGranularity = 128;
1067       MD->setHwStage(CC, ".lds_size",
1068                      (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1069                                 sizeof(uint32_t)));
1070       MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1071     } else {
1072       MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1073     }
1074   }
1075 
1076   // ScratchSize is in bytes, 16 aligned.
1077   MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
1078   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1079     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1080                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1081                                 : CurrentProgramInfo.LDSBlocks;
1082     if (MD->getPALMajorVersion() < 3) {
1083       MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1084       MD->setSpiPsInputEna(MFI->getPSInputEnable());
1085       MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1086     } else {
1087       // Graphics registers
1088       const unsigned ExtraLdsDwGranularity =
1089           STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1090       MD->setGraphicsRegisters(
1091           ".ps_extra_lds_size",
1092           (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1093 
1094       // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1095       static StringLiteral const PsInputFields[] = {
1096           ".persp_sample_ena",    ".persp_center_ena",
1097           ".persp_centroid_ena",  ".persp_pull_model_ena",
1098           ".linear_sample_ena",   ".linear_center_ena",
1099           ".linear_centroid_ena", ".line_stipple_tex_ena",
1100           ".pos_x_float_ena",     ".pos_y_float_ena",
1101           ".pos_z_float_ena",     ".pos_w_float_ena",
1102           ".front_face_ena",      ".ancillary_ena",
1103           ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1104       unsigned PSInputEna = MFI->getPSInputEnable();
1105       unsigned PSInputAddr = MFI->getPSInputAddr();
1106       for (auto [Idx, Field] : enumerate(PsInputFields)) {
1107         MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1108                                  (bool)((PSInputEna >> Idx) & 1));
1109         MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1110                                  (bool)((PSInputAddr >> Idx) & 1));
1111       }
1112     }
1113   }
1114 
1115   // For version 3 and above the wave front size is already set in the metadata
1116   if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1117     MD->setWave32(MF.getFunction().getCallingConv());
1118 }
1119 
1120 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1121   auto *MD = getTargetStreamer()->getPALMetadata();
1122   const MachineFrameInfo &MFI = MF.getFrameInfo();
1123   StringRef FnName = MF.getFunction().getName();
1124   MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1125   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1126 
1127   // Set compute registers
1128   MD->setRsrc1(CallingConv::AMDGPU_CS,
1129                CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
1130   MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
1131 
1132   // Set optional info
1133   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1134   MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1135   MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1136 }
1137 
1138 // This is supposed to be log2(Size)
1139 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1140   switch (Size) {
1141   case 4:
1142     return AMD_ELEMENT_4_BYTES;
1143   case 8:
1144     return AMD_ELEMENT_8_BYTES;
1145   case 16:
1146     return AMD_ELEMENT_16_BYTES;
1147   default:
1148     llvm_unreachable("invalid private_element_size");
1149   }
1150 }
1151 
1152 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1153                                         const SIProgramInfo &CurrentProgramInfo,
1154                                         const MachineFunction &MF) const {
1155   const Function &F = MF.getFunction();
1156   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1157          F.getCallingConv() == CallingConv::SPIR_KERNEL);
1158 
1159   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1160   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1161 
1162   AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1163 
1164   Out.compute_pgm_resource_registers =
1165       CurrentProgramInfo.getComputePGMRSrc1(STM) |
1166       (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1167   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1168 
1169   if (CurrentProgramInfo.DynamicCallStack)
1170     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1171 
1172   AMD_HSA_BITS_SET(Out.code_properties,
1173                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1174                    getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1175 
1176   const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1177   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1178     Out.code_properties |=
1179       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1180   }
1181 
1182   if (UserSGPRInfo.hasDispatchPtr())
1183     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1184 
1185   if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1186     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1187 
1188   if (UserSGPRInfo.hasKernargSegmentPtr())
1189     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1190 
1191   if (UserSGPRInfo.hasDispatchID())
1192     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1193 
1194   if (UserSGPRInfo.hasFlatScratchInit())
1195     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1196 
1197   if (UserSGPRInfo.hasDispatchPtr())
1198     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1199 
1200   if (STM.isXNACKEnabled())
1201     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1202 
1203   Align MaxKernArgAlign;
1204   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1205   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1206   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1207   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1208   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1209 
1210   // kernarg_segment_alignment is specified as log of the alignment.
1211   // The minimum alignment is 16.
1212   // FIXME: The metadata treats the minimum as 4?
1213   Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1214 }
1215 
1216 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1217                                        const char *ExtraCode, raw_ostream &O) {
1218   // First try the generic code, which knows about modifiers like 'c' and 'n'.
1219   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1220     return false;
1221 
1222   if (ExtraCode && ExtraCode[0]) {
1223     if (ExtraCode[1] != 0)
1224       return true; // Unknown modifier.
1225 
1226     switch (ExtraCode[0]) {
1227     case 'r':
1228       break;
1229     default:
1230       return true;
1231     }
1232   }
1233 
1234   // TODO: Should be able to support other operand types like globals.
1235   const MachineOperand &MO = MI->getOperand(OpNo);
1236   if (MO.isReg()) {
1237     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1238                                        *MF->getSubtarget().getRegisterInfo());
1239     return false;
1240   } else if (MO.isImm()) {
1241     int64_t Val = MO.getImm();
1242     if (AMDGPU::isInlinableIntLiteral(Val)) {
1243       O << Val;
1244     } else if (isUInt<16>(Val)) {
1245       O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1246     } else if (isUInt<32>(Val)) {
1247       O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1248     } else {
1249       O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1250     }
1251     return false;
1252   }
1253   return true;
1254 }
1255 
1256 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
1257   AU.addRequired<AMDGPUResourceUsageAnalysis>();
1258   AU.addPreserved<AMDGPUResourceUsageAnalysis>();
1259   AsmPrinter::getAnalysisUsage(AU);
1260 }
1261 
1262 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1263     const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1264     bool isModuleEntryFunction, bool hasMAIInsts) {
1265   if (!ORE)
1266     return;
1267 
1268   const char *Name = "kernel-resource-usage";
1269   const char *Indent = "    ";
1270 
1271   // If the remark is not specifically enabled, do not output to yaml
1272   LLVMContext &Ctx = MF.getFunction().getContext();
1273   if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1274     return;
1275 
1276   auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1277                                      StringRef RemarkLabel, auto Argument) {
1278     // Add an indent for every line besides the line with the kernel name. This
1279     // makes it easier to tell which resource usage go with which kernel since
1280     // the kernel name will always be displayed first.
1281     std::string LabelStr = RemarkLabel.str() + ": ";
1282     if (!RemarkName.equals("FunctionName"))
1283       LabelStr = Indent + LabelStr;
1284 
1285     ORE->emit([&]() {
1286       return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1287                                                MF.getFunction().getSubprogram(),
1288                                                &MF.front())
1289              << LabelStr << ore::NV(RemarkName, Argument);
1290     });
1291   };
1292 
1293   // FIXME: Formatting here is pretty nasty because clang does not accept
1294   // newlines from diagnostics. This forces us to emit multiple diagnostic
1295   // remarks to simulate newlines. If and when clang does accept newlines, this
1296   // formatting should be aggregated into one remark with newlines to avoid
1297   // printing multiple diagnostic location and diag opts.
1298   EmitResourceUsageRemark("FunctionName", "Function Name",
1299                           MF.getFunction().getName());
1300   EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
1301   EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1302   if (hasMAIInsts)
1303     EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1304   EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1305                           CurrentProgramInfo.ScratchSize);
1306   StringRef DynamicStackStr =
1307       CurrentProgramInfo.DynamicCallStack ? "True" : "False";
1308   EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1309   EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1310                           CurrentProgramInfo.Occupancy);
1311   EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1312                           CurrentProgramInfo.SGPRSpill);
1313   EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1314                           CurrentProgramInfo.VGPRSpill);
1315   if (isModuleEntryFunction)
1316     EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1317                             CurrentProgramInfo.LDSSize);
1318 }
1319