1 //===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "GCNSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSelectionDAGInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/MDBuilder.h"
29 #include <algorithm>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "gcn-subtarget"
34
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #define AMDGPUSubtarget GCNSubtarget
38 #include "AMDGPUGenSubtargetInfo.inc"
39 #undef AMDGPUSubtarget
40
// Command-line override: prefer VGPR indexing mode over movrel when both are
// available (consumed by GCNSubtarget::useVGPRIndexMode()).
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

// Whether codegen may use alias analysis; on by default (consumed by
// GCNSubtarget::useAA()).
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Minimum number of image addresses at which the MIMG NSA encoding is used.
// A per-function "amdgpu-nsa-threshold" attribute can also set this; the
// command-line flag wins when given (see GCNSubtarget::getNSAThreshold()).
static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);
54
55 GCNSubtarget::~GCNSubtarget() = default;
56
/// Parse the CPU/feature strings and resolve the interdependent subtarget
/// settings (wavefront size, flat-for-global, LDS sizes, ...). Called from the
/// constructor's member-init list before InstrInfo/TLInfo are built; returns
/// *this so it can seed those members.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // If the user's FS enables any wavefront size explicitly, turn the other
  // sizes off so exactly one remains set after parsing.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User features are appended last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  // gfx10+ without CU mode doubles the physically available local memory
  // relative to the addressable size.
  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Derive xnack/sramecc settings from the raw (user-supplied) feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
160
checkSubtargetFeatures(const Function & F) const161 void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
162 LLVMContext &Ctx = F.getContext();
163 if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
164 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
165 Ctx.diagnose(DiagnosticInfoUnsupported(
166 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167 }
168 }
169
/// Construct the subtarget for the given triple/CPU/feature string.
/// Note the member-init ordering: InstrInfo is initialized from the result of
/// initializeSubtargetDependencies(), so the feature bits are fully resolved
/// before TLInfo and FrameLowering are constructed (members initialize in
/// declaration order).
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  // SelectionDAG and GlobalISel helper objects owned by this subtarget.
  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}
195
// Returns the SelectionDAG lowering info object created in the constructor.
const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}
199
/// \returns the number of operands of \p Opcode that may be read from the
/// constant bus. Pre-GFX10 subtargets allow only one; GFX10+ allows two,
/// except for the 64-bit shift instructions listed below, which keep the
/// single-use limit.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  // 64-bit shifts: still limited to one constant bus use on GFX10+.
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}
226
/// \returns true if \p Opcode, when writing a 16-bit result, also zeroes the
/// high 16 bits of its 32-bit destination register on this subtarget.
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
    // Listed explicitly for documentation; like the default case, these are
    // never considered to zero the high bits.
  default:
    return false;
  }
}
325
overrideSchedPolicy(MachineSchedPolicy & Policy,unsigned NumRegionInstrs) const326 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
327 unsigned NumRegionInstrs) const {
328 // Track register pressure so the scheduler can try to decrease
329 // pressure once register usage is above the threshold defined by
330 // SIRegisterInfo::getRegPressureSetLimit()
331 Policy.ShouldTrackPressure = true;
332
333 // Enabling both top down and bottom up scheduling seems to give us less
334 // register spills than just using one of these approaches on its own.
335 Policy.OnlyTopDown = false;
336 Policy.OnlyBottomUp = false;
337
338 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
339 if (!enableSIScheduler())
340 Policy.ShouldTrackLaneMasks = true;
341 }
342
mirFileLoaded(MachineFunction & MF) const343 void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
344 if (isWave32()) {
345 // Fix implicit $vcc operands after MIParser has verified that they match
346 // the instruction definitions.
347 for (auto &MBB : MF) {
348 for (auto &MI : MBB)
349 InstrInfo.fixImplicitOperands(MI);
350 }
351 }
352 }
353
// True if V_MAD_F16 is available, i.e. the pseudo maps to a real MC opcode on
// this subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
357
useVGPRIndexMode() const358 bool GCNSubtarget::useVGPRIndexMode() const {
359 return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
360 }
361
useAA() const362 bool GCNSubtarget::useAA() const { return UseAA; }
363
// Occupancy (waves per EU) achievable when a wave uses \p SGPRs scalar
// registers, given this subtarget's wave limit and generation.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}
368
// Occupancy (waves per EU) achievable when a wave uses \p NumVGPRs vector
// registers. \p DynamicVGPRBlockSize is the dynamic-VGPR allocation block
// size (0 when dynamic VGPRs are not in use — see computeOccupancy()).
unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}
375
376 unsigned
getBaseReservedNumSGPRs(const bool HasFlatScratch) const377 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
378 if (getGeneration() >= AMDGPUSubtarget::GFX10)
379 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
380
381 if (HasFlatScratch || HasArchitectedFlatScratch) {
382 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
383 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
384 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
385 return 4; // FLAT_SCRATCH, VCC (in that order).
386 }
387
388 if (isXNACKEnabled())
389 return 4; // XNACK, VCC (in that order).
390 return 2; // VCC.
391 }
392
// Reserved SGPR count for \p MF; whether flat scratch is initialized comes
// from the per-function machine info.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}
397
// Reserved SGPR count given only the IR function (no machine info yet).
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}
405
406 std::pair<unsigned, unsigned>
computeOccupancy(const Function & F,unsigned LDSSize,unsigned NumSGPRs,unsigned NumVGPRs) const407 GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
408 unsigned NumSGPRs, unsigned NumVGPRs) const {
409 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
410 // Temporarily check both the attribute and the subtarget feature until the
411 // latter is removed.
412 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
413 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
414
415 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
416 unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
417 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
418
419 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
420 MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
421 return {std::min(MinOcc, MaxOcc), MaxOcc};
422 }
423
/// Compute the SGPR budget for \p F given the waves-per-EU range, the number
/// of input (preloaded) SGPRs, and the number of reserved SGPRs. Honors an
/// "amdgpu-num-sgpr" attribute when it is compatible with the constraints;
/// an incompatible request is ignored (Requested is reset to 0).
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-allocation init bug must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
470
// SGPR budget for \p MF, using the machine info's waves-per-EU range and
// actual preloaded SGPR count.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}
477
// Upper bound on the number of SGPRs that can be preloaded, assuming every
// user, system, and synthetic SGPR field is present.
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
502
// SGPR budget given only the IR function: assumes the worst-case (maximum)
// number of preloaded SGPRs since no machine info is available yet.
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}
507
getBaseMaxNumVGPRs(const Function & F,std::pair<unsigned,unsigned> NumVGPRBounds) const508 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
509 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
510 const auto &[Min, Max] = NumVGPRBounds;
511
512 // Check if maximum number of VGPRs was explicitly requested using
513 // "amdgpu-num-vgpr" attribute.
514
515 unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
516 if (Requested != Max && hasGFX90AInsts())
517 Requested *= 2;
518
519 // Make sure requested value is inside the range of possible VGPR usage.
520 return std::clamp(Requested, Min, Max);
521 }
522
getMaxNumVGPRs(const Function & F) const523 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
524 // Temporarily check both the attribute and the subtarget feature, until the
525 // latter is removed.
526 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
527 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
528 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
529
530 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
531 return getBaseMaxNumVGPRs(
532 F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
533 getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
534 }
535
// Convenience overload: the VGPR budget depends only on the IR function.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}
539
/// Adjust the latency of the data dependence \p Dep between \p Def and \p Use.
/// Handles edges where either endpoint is a bundle header (the scheduler only
/// sees the bundle, not the instruction inside it that defines/reads the
/// register), plus a VCC_LO implicit-operand quirk.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only physical-register data dependencies between real instructions are
  // adjusted here.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    // Scan the bundle: take the latency of the last instruction that writes
    // Reg, discounted by one for each bundled instruction issued after it.
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    // Start from the def's latency and credit one cycle for each bundled
    // instruction issued before the first reader of Reg (stopping at zero).
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}
584
/// Minimum number of image addresses at which the MIMG NSA encoding is used
/// for \p MF. Precedence: command-line flag, then the function attribute,
/// then the flag's default; values are clamped to at least 2.
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  // GFX12+ does not use the MIMG encoding, so NSA does not apply.
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  // An explicit command-line value overrides the function attribute.
  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  // Fall back to the flag's default value.
  return NSAThreshold;
}
599
/// Determine which user SGPR inputs \p F requires (based on its calling
/// convention, attributes, and the subtarget configuration) and total up the
/// SGPRs they occupy.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Kernels need the kernarg segment pointer if they have any explicit or
  // implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Non-graphics functions get the dispatch/queue pointers and dispatch id
  // unless the corresponding "amdgpu-no-*" attribute opts out.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Accumulate the SGPR count for every input enabled above.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}
663
allocKernargPreloadSGPRs(unsigned NumSGPRs)664 void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
665 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
666 NumKernargPreloadSGPRs += NumSGPRs;
667 NumUsedUserSGPRs += NumSGPRs;
668 }
669
// Number of user SGPRs still available for allocation on this subtarget.
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}
673