xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1*700637cbSDimitry Andric //===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2*700637cbSDimitry Andric //
3*700637cbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*700637cbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*700637cbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*700637cbSDimitry Andric //
7*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
8*700637cbSDimitry Andric //
9*700637cbSDimitry Andric /// \file
10*700637cbSDimitry Andric /// Implements the GCN specific subclass of TargetSubtarget.
11*700637cbSDimitry Andric //
12*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
13*700637cbSDimitry Andric 
14*700637cbSDimitry Andric #include "GCNSubtarget.h"
15*700637cbSDimitry Andric #include "AMDGPUCallLowering.h"
16*700637cbSDimitry Andric #include "AMDGPUInstructionSelector.h"
17*700637cbSDimitry Andric #include "AMDGPULegalizerInfo.h"
18*700637cbSDimitry Andric #include "AMDGPURegisterBankInfo.h"
19*700637cbSDimitry Andric #include "AMDGPUSelectionDAGInfo.h"
20*700637cbSDimitry Andric #include "AMDGPUTargetMachine.h"
21*700637cbSDimitry Andric #include "SIMachineFunctionInfo.h"
22*700637cbSDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
23*700637cbSDimitry Andric #include "llvm/ADT/SmallString.h"
24*700637cbSDimitry Andric #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25*700637cbSDimitry Andric #include "llvm/CodeGen/MachineScheduler.h"
26*700637cbSDimitry Andric #include "llvm/CodeGen/TargetFrameLowering.h"
27*700637cbSDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28*700637cbSDimitry Andric #include "llvm/IR/MDBuilder.h"
29*700637cbSDimitry Andric #include <algorithm>
30*700637cbSDimitry Andric 
31*700637cbSDimitry Andric using namespace llvm;
32*700637cbSDimitry Andric 
33*700637cbSDimitry Andric #define DEBUG_TYPE "gcn-subtarget"
34*700637cbSDimitry Andric 
35*700637cbSDimitry Andric #define GET_SUBTARGETINFO_TARGET_DESC
36*700637cbSDimitry Andric #define GET_SUBTARGETINFO_CTOR
37*700637cbSDimitry Andric #define AMDGPUSubtarget GCNSubtarget
38*700637cbSDimitry Andric #include "AMDGPUGenSubtargetInfo.inc"
39*700637cbSDimitry Andric #undef AMDGPUSubtarget
40*700637cbSDimitry Andric 
// Prefer the GPR-indexing mode over V_MOVREL* instructions when dynamically
// indexing into vector registers. Off by default; only consulted when the
// subtarget supports both mechanisms (see useVGPRIndexMode()).
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

// Whether codegen may consult alias-analysis results (on by default; see
// GCNSubtarget::useAA()).
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Minimum number of image addresses at which the MIMG NSA (non-sequential
// address) encoding is used.
static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);
54*700637cbSDimitry Andric 
// Out-of-line so the unique_ptr members' deleters are instantiated here,
// where their pointees are complete types.
GCNSubtarget::~GCNSubtarget() = default;
56*700637cbSDimitry Andric 
/// Build the effective feature string for this subtarget, parse it, and fix up
/// interdependent/derived state (generation defaults, wavefront size,
/// flat-for-global, LDS sizing, target ID). Returns *this so it can be used
/// inside the constructor's member-initializer list.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Defaults for every GCN subtarget. The trailing comma lets later features
  // be appended directly.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User-supplied features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  // On gfx10+ without CU mode the physical local memory size is twice the
  // addressable size (WGP mode — presumably two CUs sharing LDS; confirm
  // against the ISA docs).
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Derive xnack/sramecc settings from the raw (user) feature string, not the
  // expanded FullFS.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
160*700637cbSDimitry Andric 
checkSubtargetFeatures(const Function & F) const161*700637cbSDimitry Andric void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
162*700637cbSDimitry Andric   LLVMContext &Ctx = F.getContext();
163*700637cbSDimitry Andric   if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
164*700637cbSDimitry Andric       hasFeature(AMDGPU::FeatureWavefrontSize64)) {
165*700637cbSDimitry Andric     Ctx.diagnose(DiagnosticInfoUnsupported(
166*700637cbSDimitry Andric         F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167*700637cbSDimitry Andric   }
168*700637cbSDimitry Andric }
169*700637cbSDimitry Andric 
/// Construct the GCN subtarget for processor \p GPU with feature string \p FS.
/// NOTE: the member-initializer order is load-bearing —
/// initializeSubtargetDependencies() (run from InstrInfo's initializer)
/// parses the feature string and must complete before TLInfo is constructed.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  // Cached IsaInfo-derived limits.
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  // SelectionDAG lowering info.
  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  // GlobalISel infrastructure.
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}
195*700637cbSDimitry Andric 
getSelectionDAGInfo() const196*700637cbSDimitry Andric const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
197*700637cbSDimitry Andric   return TSInfo.get();
198*700637cbSDimitry Andric }
199*700637cbSDimitry Andric 
getConstantBusLimit(unsigned Opcode) const200*700637cbSDimitry Andric unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
201*700637cbSDimitry Andric   if (getGeneration() < GFX10)
202*700637cbSDimitry Andric     return 1;
203*700637cbSDimitry Andric 
204*700637cbSDimitry Andric   switch (Opcode) {
205*700637cbSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64:
206*700637cbSDimitry Andric   case AMDGPU::V_LSHLREV_B64_gfx10:
207*700637cbSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64_gfx11:
208*700637cbSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e32_gfx12:
209*700637cbSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64_gfx12:
210*700637cbSDimitry Andric   case AMDGPU::V_LSHL_B64_e64:
211*700637cbSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64:
212*700637cbSDimitry Andric   case AMDGPU::V_LSHRREV_B64_gfx10:
213*700637cbSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64_gfx11:
214*700637cbSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64_gfx12:
215*700637cbSDimitry Andric   case AMDGPU::V_LSHR_B64_e64:
216*700637cbSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64:
217*700637cbSDimitry Andric   case AMDGPU::V_ASHRREV_I64_gfx10:
218*700637cbSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64_gfx11:
219*700637cbSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64_gfx12:
220*700637cbSDimitry Andric   case AMDGPU::V_ASHR_I64_e64:
221*700637cbSDimitry Andric     return 1;
222*700637cbSDimitry Andric   }
223*700637cbSDimitry Andric 
224*700637cbSDimitry Andric   return 2;
225*700637cbSDimitry Andric }
226*700637cbSDimitry Andric 
/// This list was mostly derived from experimentation.
///
/// \returns true if \p Opcode writes zeros to the high 16 bits of its 32-bit
/// destination on this subtarget (as opposed to preserving them). The answer
/// depends on both the opcode and the hardware generation; see the per-group
/// comments below.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
    // Explicitly listed: these write only one half of the destination, so
    // they never zero the other 16 bits (falls through to the default).
  default:
    return false;
  }
}
325*700637cbSDimitry Andric 
overrideSchedPolicy(MachineSchedPolicy & Policy,unsigned NumRegionInstrs) const326*700637cbSDimitry Andric void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
327*700637cbSDimitry Andric                                        unsigned NumRegionInstrs) const {
328*700637cbSDimitry Andric   // Track register pressure so the scheduler can try to decrease
329*700637cbSDimitry Andric   // pressure once register usage is above the threshold defined by
330*700637cbSDimitry Andric   // SIRegisterInfo::getRegPressureSetLimit()
331*700637cbSDimitry Andric   Policy.ShouldTrackPressure = true;
332*700637cbSDimitry Andric 
333*700637cbSDimitry Andric   // Enabling both top down and bottom up scheduling seems to give us less
334*700637cbSDimitry Andric   // register spills than just using one of these approaches on its own.
335*700637cbSDimitry Andric   Policy.OnlyTopDown = false;
336*700637cbSDimitry Andric   Policy.OnlyBottomUp = false;
337*700637cbSDimitry Andric 
338*700637cbSDimitry Andric   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
339*700637cbSDimitry Andric   if (!enableSIScheduler())
340*700637cbSDimitry Andric     Policy.ShouldTrackLaneMasks = true;
341*700637cbSDimitry Andric }
342*700637cbSDimitry Andric 
mirFileLoaded(MachineFunction & MF) const343*700637cbSDimitry Andric void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
344*700637cbSDimitry Andric   if (isWave32()) {
345*700637cbSDimitry Andric     // Fix implicit $vcc operands after MIParser has verified that they match
346*700637cbSDimitry Andric     // the instruction definitions.
347*700637cbSDimitry Andric     for (auto &MBB : MF) {
348*700637cbSDimitry Andric       for (auto &MI : MBB)
349*700637cbSDimitry Andric         InstrInfo.fixImplicitOperands(MI);
350*700637cbSDimitry Andric     }
351*700637cbSDimitry Andric   }
352*700637cbSDimitry Andric }
353*700637cbSDimitry Andric 
hasMadF16() const354*700637cbSDimitry Andric bool GCNSubtarget::hasMadF16() const {
355*700637cbSDimitry Andric   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
356*700637cbSDimitry Andric }
357*700637cbSDimitry Andric 
useVGPRIndexMode() const358*700637cbSDimitry Andric bool GCNSubtarget::useVGPRIndexMode() const {
359*700637cbSDimitry Andric   return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
360*700637cbSDimitry Andric }
361*700637cbSDimitry Andric 
useAA() const362*700637cbSDimitry Andric bool GCNSubtarget::useAA() const { return UseAA; }
363*700637cbSDimitry Andric 
getOccupancyWithNumSGPRs(unsigned SGPRs) const364*700637cbSDimitry Andric unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
365*700637cbSDimitry Andric   return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
366*700637cbSDimitry Andric                                                    getGeneration());
367*700637cbSDimitry Andric }
368*700637cbSDimitry Andric 
369*700637cbSDimitry Andric unsigned
getOccupancyWithNumVGPRs(unsigned NumVGPRs,unsigned DynamicVGPRBlockSize) const370*700637cbSDimitry Andric GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
371*700637cbSDimitry Andric                                        unsigned DynamicVGPRBlockSize) const {
372*700637cbSDimitry Andric   return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
373*700637cbSDimitry Andric                                                        DynamicVGPRBlockSize);
374*700637cbSDimitry Andric }
375*700637cbSDimitry Andric 
376*700637cbSDimitry Andric unsigned
getBaseReservedNumSGPRs(const bool HasFlatScratch) const377*700637cbSDimitry Andric GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
378*700637cbSDimitry Andric   if (getGeneration() >= AMDGPUSubtarget::GFX10)
379*700637cbSDimitry Andric     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
380*700637cbSDimitry Andric 
381*700637cbSDimitry Andric   if (HasFlatScratch || HasArchitectedFlatScratch) {
382*700637cbSDimitry Andric     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
383*700637cbSDimitry Andric       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
384*700637cbSDimitry Andric     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
385*700637cbSDimitry Andric       return 4; // FLAT_SCRATCH, VCC (in that order).
386*700637cbSDimitry Andric   }
387*700637cbSDimitry Andric 
388*700637cbSDimitry Andric   if (isXNACKEnabled())
389*700637cbSDimitry Andric     return 4; // XNACK, VCC (in that order).
390*700637cbSDimitry Andric   return 2;   // VCC.
391*700637cbSDimitry Andric }
392*700637cbSDimitry Andric 
getReservedNumSGPRs(const MachineFunction & MF) const393*700637cbSDimitry Andric unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
394*700637cbSDimitry Andric   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
395*700637cbSDimitry Andric   return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
396*700637cbSDimitry Andric }
397*700637cbSDimitry Andric 
getReservedNumSGPRs(const Function & F) const398*700637cbSDimitry Andric unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
399*700637cbSDimitry Andric   // In principle we do not need to reserve SGPR pair used for flat_scratch if
400*700637cbSDimitry Andric   // we know flat instructions do not access the stack anywhere in the
401*700637cbSDimitry Andric   // program. For now assume it's needed if we have flat instructions.
402*700637cbSDimitry Andric   const bool KernelUsesFlatScratch = hasFlatAddressSpace();
403*700637cbSDimitry Andric   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
404*700637cbSDimitry Andric }
405*700637cbSDimitry Andric 
406*700637cbSDimitry Andric std::pair<unsigned, unsigned>
computeOccupancy(const Function & F,unsigned LDSSize,unsigned NumSGPRs,unsigned NumVGPRs) const407*700637cbSDimitry Andric GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
408*700637cbSDimitry Andric                                unsigned NumSGPRs, unsigned NumVGPRs) const {
409*700637cbSDimitry Andric   unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
410*700637cbSDimitry Andric   // Temporarily check both the attribute and the subtarget feature until the
411*700637cbSDimitry Andric   // latter is removed.
412*700637cbSDimitry Andric   if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
413*700637cbSDimitry Andric     DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
414*700637cbSDimitry Andric 
415*700637cbSDimitry Andric   auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
416*700637cbSDimitry Andric   unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
417*700637cbSDimitry Andric   unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
418*700637cbSDimitry Andric 
419*700637cbSDimitry Andric   // Maximum occupancy may be further limited by high SGPR/VGPR usage.
420*700637cbSDimitry Andric   MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
421*700637cbSDimitry Andric   return {std::min(MinOcc, MaxOcc), MaxOcc};
422*700637cbSDimitry Andric }
423*700637cbSDimitry Andric 
/// \returns the maximum number of SGPRs \p F may allocate, after honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, input SGPR
/// requirements, reserved special registers, and hardware bugs.
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "invalid; fall back to MaxNumSGPRs".)
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // SGPR-init hardware bug: clamp to the fixed workaround count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Reserved registers come out of the budget, and the result can never
  // exceed the addressable SGPR count.
  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
470*700637cbSDimitry Andric 
getMaxNumSGPRs(const MachineFunction & MF) const471*700637cbSDimitry Andric unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
472*700637cbSDimitry Andric   const Function &F = MF.getFunction();
473*700637cbSDimitry Andric   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
474*700637cbSDimitry Andric   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
475*700637cbSDimitry Andric                             getReservedNumSGPRs(MF));
476*700637cbSDimitry Andric }
477*700637cbSDimitry Andric 
getMaxNumPreloadedSGPRs() const478*700637cbSDimitry Andric unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
479*700637cbSDimitry Andric   using USI = GCNUserSGPRUsageInfo;
480*700637cbSDimitry Andric   // Max number of user SGPRs
481*700637cbSDimitry Andric   const unsigned MaxUserSGPRs =
482*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
483*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::DispatchPtrID) +
484*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::QueuePtrID) +
485*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
486*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::DispatchIdID) +
487*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
488*700637cbSDimitry Andric       USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
489*700637cbSDimitry Andric 
490*700637cbSDimitry Andric   // Max number of system SGPRs
491*700637cbSDimitry Andric   const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
492*700637cbSDimitry Andric                                   1 + // WorkGroupIDY
493*700637cbSDimitry Andric                                   1 + // WorkGroupIDZ
494*700637cbSDimitry Andric                                   1 + // WorkGroupInfo
495*700637cbSDimitry Andric                                   1;  // private segment wave byte offset
496*700637cbSDimitry Andric 
497*700637cbSDimitry Andric   // Max number of synthetic SGPRs
498*700637cbSDimitry Andric   const unsigned SyntheticSGPRs = 1; // LDSKernelId
499*700637cbSDimitry Andric 
500*700637cbSDimitry Andric   return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
501*700637cbSDimitry Andric }
502*700637cbSDimitry Andric 
getMaxNumSGPRs(const Function & F) const503*700637cbSDimitry Andric unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
504*700637cbSDimitry Andric   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
505*700637cbSDimitry Andric                             getReservedNumSGPRs(F));
506*700637cbSDimitry Andric }
507*700637cbSDimitry Andric 
getBaseMaxNumVGPRs(const Function & F,std::pair<unsigned,unsigned> NumVGPRBounds) const508*700637cbSDimitry Andric unsigned GCNSubtarget::getBaseMaxNumVGPRs(
509*700637cbSDimitry Andric     const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
510*700637cbSDimitry Andric   const auto &[Min, Max] = NumVGPRBounds;
511*700637cbSDimitry Andric 
512*700637cbSDimitry Andric   // Check if maximum number of VGPRs was explicitly requested using
513*700637cbSDimitry Andric   // "amdgpu-num-vgpr" attribute.
514*700637cbSDimitry Andric 
515*700637cbSDimitry Andric   unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
516*700637cbSDimitry Andric   if (Requested != Max && hasGFX90AInsts())
517*700637cbSDimitry Andric     Requested *= 2;
518*700637cbSDimitry Andric 
519*700637cbSDimitry Andric   // Make sure requested value is inside the range of possible VGPR usage.
520*700637cbSDimitry Andric   return std::clamp(Requested, Min, Max);
521*700637cbSDimitry Andric }
522*700637cbSDimitry Andric 
getMaxNumVGPRs(const Function & F) const523*700637cbSDimitry Andric unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
524*700637cbSDimitry Andric   // Temporarily check both the attribute and the subtarget feature, until the
525*700637cbSDimitry Andric   // latter is removed.
526*700637cbSDimitry Andric   unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
527*700637cbSDimitry Andric   if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
528*700637cbSDimitry Andric     DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
529*700637cbSDimitry Andric 
530*700637cbSDimitry Andric   std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
531*700637cbSDimitry Andric   return getBaseMaxNumVGPRs(
532*700637cbSDimitry Andric       F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
533*700637cbSDimitry Andric           getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
534*700637cbSDimitry Andric }
535*700637cbSDimitry Andric 
getMaxNumVGPRs(const MachineFunction & MF) const536*700637cbSDimitry Andric unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
537*700637cbSDimitry Andric   return getMaxNumVGPRs(MF.getFunction());
538*700637cbSDimitry Andric }
539*700637cbSDimitry Andric 
// Fix up the latency of a data dependence when either endpoint is an
// instruction bundle: the scheduler models a bundle as a single node, so
// the latency must be computed from the bundled instruction that actually
// defines or first reads the register.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // Def is a bundle header. Walk the bundled instructions: take the
    // latency of the (last) instruction that writes the register, then
    // decrement once for each subsequent bundled instruction, since those
    // issue slots already hide part of the latency.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // Use is a bundle header. Start from the def's full latency and
    // decrement once per bundled instruction issued before the first one
    // that reads the register; stop early if the latency is fully hidden.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}
584*700637cbSDimitry Andric 
getNSAThreshold(const MachineFunction & MF) const585*700637cbSDimitry Andric unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
586*700637cbSDimitry Andric   if (getGeneration() >= AMDGPUSubtarget::GFX12)
587*700637cbSDimitry Andric     return 0; // Not MIMG encoding.
588*700637cbSDimitry Andric 
589*700637cbSDimitry Andric   if (NSAThreshold.getNumOccurrences() > 0)
590*700637cbSDimitry Andric     return std::max(NSAThreshold.getValue(), 2u);
591*700637cbSDimitry Andric 
592*700637cbSDimitry Andric   int Value = MF.getFunction().getFnAttributeAsParsedInteger(
593*700637cbSDimitry Andric       "amdgpu-nsa-threshold", -1);
594*700637cbSDimitry Andric   if (Value > 0)
595*700637cbSDimitry Andric     return std::max(Value, 2);
596*700637cbSDimitry Andric 
597*700637cbSDimitry Andric   return NSAThreshold;
598*700637cbSDimitry Andric }
599*700637cbSDimitry Andric 
// Determine which user/system SGPR inputs function \p F requires on
// subtarget \p ST, then tally the total number of user SGPRs consumed.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Kernels need the kernarg segment pointer if they have any explicit
  // arguments or any implicit argument bytes.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  // HSA/Mesa without flat scratch uses the private segment buffer; Mesa
  // graphics shaders instead get the implicit buffer pointer.
  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute-only inputs, each suppressible via its "amdgpu-no-*" attribute.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // Flat scratch initialization is needed only for entry functions on
  // targets with a flat address space, in an HSA/Mesa (or flat-scratch)
  // environment, and not when flat scratch is architected (set up by
  // hardware/firmware rather than code).
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Tally the user SGPRs consumed by every enabled field.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}
663*700637cbSDimitry Andric 
allocKernargPreloadSGPRs(unsigned NumSGPRs)664*700637cbSDimitry Andric void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
665*700637cbSDimitry Andric   assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
666*700637cbSDimitry Andric   NumKernargPreloadSGPRs += NumSGPRs;
667*700637cbSDimitry Andric   NumUsedUserSGPRs += NumSGPRs;
668*700637cbSDimitry Andric }
669*700637cbSDimitry Andric 
getNumFreeUserSGPRs()670*700637cbSDimitry Andric unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
671*700637cbSDimitry Andric   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
672*700637cbSDimitry Andric }
673