//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);
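
// Note: like any LLVM cl::opt, the flags above are internal backend options.
// They can typically be set directly on the llc command line (for example,
// "llc -amdgpu-nsa-threshold=4 ...") or forwarded from the clang driver via
// -mllvm.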

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;
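  // FS is appended last, so features the user specifies explicitly take
  // precedence over the defaults assembled above. For an amdhsa target the
  // string handed to ParseSubtargetFeatures thus looks roughly like:
  //   "+promote-alloca,+load-store-opt,+enable-ds128,+flat-for-global,
  //    +unaligned-access-mode,+trap-handler,+enable-prt-strict-null,<FS>"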

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // and these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+, set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +/-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
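  // On gfx10+ with CU mode off (WGP mode), the two CUs of a workgroup
  // processor share one LDS block, so twice the per-workgroup addressable
  // amount is physically present.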
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

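  // Construct the SelectionDAG lowering info and the GlobalISel pipeline
  // objects (call lowering, inline-asm lowering, legalizer, register bank
  // info, and instruction selector) for this subtarget.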
  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

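// Pre-GFX10 VALU instructions may read at most one SGPR or literal via the
// constant bus; GFX10+ generally allows two, except for the 64-bit shift
// instructions listed below, which remain limited to one.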
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10 and later, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing, but some changed
    // to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

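// True if this subtarget has a real encoding for V_MAD_F16, i.e. the pseudo
// maps to a valid MC opcode.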
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2;   // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it is needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

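// Returns {minimum, maximum} achievable occupancy (waves per EU). The
// workgroup-size/LDS bounds are computed first; the maximum is then further
// clamped by the occupancy the given SGPR and VGPR counts allow.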
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs.
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs.
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs.
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
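  // Note: on subtargets with gfx90a instructions the VGPR budget covers
  // ArchVGPRs and AGPRs together, which presumably is why an explicit request
  // is doubled here.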
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure the requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

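// Adjust the latency of data dependencies involving instruction bundles: for
// a def inside a bundle, latency is measured from the bundled instruction
// that actually writes the register; for a use inside a bundle, the def's
// latency is reduced by the position of the first bundled reader.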
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

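// Minimum number of address registers at which the MIMG NSA (non-sequential
// address) encoding is used. Returns 0 on GFX12+, which does not use the MIMG
// NSA encoding. Both the -amdgpu-nsa-threshold flag and the
// "amdgpu-nsa-threshold" function attribute override the default, clamped to
// a minimum of 2; the command-line flag takes precedence.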
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

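// Determine which user and system SGPR inputs this function requires, based
// on its calling convention, arguments, and "amdgpu-no-*" attributes, and
// tally the user SGPRs those inputs consume.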
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

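// Record NumSGPRs additional user SGPRs as consumed by kernarg preloading;
// asserts that the preload total stays within the subtarget's user SGPR
// budget.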
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}