1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "GCNSubtarget.h"
21 #include "R600Subtarget.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
26 #include "llvm/CodeGen/MachineScheduler.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsR600.h"
31 #include "llvm/IR/MDBuilder.h"
32 #include "llvm/MC/MCSubtargetInfo.h"
33 #include <algorithm>
34
35 using namespace llvm;
36
37 #define DEBUG_TYPE "amdgpu-subtarget"
38
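// The TableGen-generated target description and subtarget constructor are
// emitted for "AMDGPUSubtarget"; temporarily rename it so the generated code
// is placed into GCNSubtarget instead.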
39 #define GET_SUBTARGETINFO_TARGET_DESC
40 #define GET_SUBTARGETINFO_CTOR
41 #define AMDGPUSubtarget GCNSubtarget
42 #include "AMDGPUGenSubtargetInfo.inc"
43 #undef AMDGPUSubtarget
44
45 static cl::opt<bool> EnablePowerSched(
46 "amdgpu-enable-power-sched",
47 cl::desc("Enable scheduling to minimize mAI power bursts"),
48 cl::init(false));
49
50 static cl::opt<bool> EnableVGPRIndexMode(
51 "amdgpu-vgpr-index-mode",
52 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
53 cl::init(false));
54
55 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
56 cl::desc("Enable the use of AA during codegen."),
57 cl::init(true));
58
59 static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
60 cl::desc("Number of addresses from which to enable MIMG NSA."),
61 cl::init(3), cl::Hidden);
62
63 GCNSubtarget::~GCNSubtarget() = default;
64
65 GCNSubtarget &
66 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
67 StringRef GPU, StringRef FS) {
68 // Determine default and user-specified characteristics
69 //
70 // We want to be able to turn these off, but making this a subtarget feature
71 // for SI has the unhelpful behavior that it unsets everything else if you
72 // disable it.
73 //
74 // Similarly, we want enable-prt-strict-null to be on by default and not to
75 // unset everything else if it is disabled.
76
77 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
78
79 // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by default.
80 if (isAmdHsaOS())
81 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
82
83 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
84
85 // Disable mutually exclusive bits.
86 if (FS.contains_insensitive("+wavefrontsize")) {
87 if (!FS.contains_insensitive("wavefrontsize16"))
88 FullFS += "-wavefrontsize16,";
89 if (!FS.contains_insensitive("wavefrontsize32"))
90 FullFS += "-wavefrontsize32,";
91 if (!FS.contains_insensitive("wavefrontsize64"))
92 FullFS += "-wavefrontsize64,";
93 }
94
95 FullFS += FS;
96
97 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
98
99 // Implement the "generic" processors, which act as the default when no
100 // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
101 // the first amdgcn target that supports flat addressing. Other OSes default
102 // to the first amdgcn target.
103 if (Gen == AMDGPUSubtarget::INVALID) {
104 Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
105 : AMDGPUSubtarget::SOUTHERN_ISLANDS;
106 }
107
108 if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
109 !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
110 // If there is no default wave size, it must be a generation before gfx10;
111 // these have FeatureWavefrontSize64 in their definition already. For gfx10+,
112 // set wave32 as the default.
113 ToggleFeature(AMDGPU::FeatureWavefrontSize32);
114 }
115
116 // We don't support FP64 for EG/NI atm.
117 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
118
119 // Targets must support 64-bit offsets for MUBUF instructions and/or support
120 // flat operations; otherwise they cannot access a 64-bit global
121 // address space.
122 assert(hasAddr64() || hasFlat());
123 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
124 // that do not support ADDR64 variants of MUBUF instructions. Such targets
125 // cannot use a 64-bit offset with a MUBUF instruction to access the global
126 // address space.
127 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
128 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
129 FlatForGlobal = true;
130 }
131 // Unless +-flat-for-global is specified, use MUBUF instructions for global
132 // address space access if flat operations are not available.
133 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
134 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
135 FlatForGlobal = false;
136 }
137
138 // Set defaults if needed.
139 if (MaxPrivateElementSize == 0)
140 MaxPrivateElementSize = 4;
141
142 if (LDSBankCount == 0)
143 LDSBankCount = 32;
144
145 if (TT.getArch() == Triple::amdgcn) {
146 if (LocalMemorySize == 0)
147 LocalMemorySize = 32768;
148
149 // Do something sensible for unspecified target.
150 if (!HasMovrel && !HasVGPRIndexMode)
151 HasMovrel = true;
152 }
153
154 AddressableLocalMemorySize = LocalMemorySize;
155
156 if (AMDGPU::isGFX10Plus(*this) &&
157 !getFeatureBits().test(AMDGPU::FeatureCuMode))
158 LocalMemorySize *= 2;
159
160 // Don't crash on invalid devices.
161 if (WavefrontSizeLog2 == 0)
162 WavefrontSizeLog2 = 5;
163
164 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
165 HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
166
167 TargetID.setTargetIDFromFeaturesString(FS);
168
169 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170 << TargetID.getXnackSetting() << '\n');
171 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172 << TargetID.getSramEccSetting() << '\n');
173
174 return *this;
175 }
176
177 void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
178 LLVMContext &Ctx = F.getContext();
179 if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
180 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
181 Ctx.diagnose(DiagnosticInfoUnsupported(
182 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
183 }
184 }
185
186 AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
187
188 bool AMDGPUSubtarget::useRealTrue16Insts() const {
189 return hasTrue16BitInsts() && EnableRealTrue16Insts;
190 }
191
192 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
193 const GCNTargetMachine &TM)
194 : // clang-format off
195 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
196 AMDGPUSubtarget(TT),
197 TargetTriple(TT),
198 TargetID(*this),
199 InstrItins(getInstrItineraryForCPU(GPU)),
200 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
201 TLInfo(TM, *this),
202 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
203 // clang-format on
204 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
205 EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
206 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
207 InlineAsmLoweringInfo =
208 std::make_unique<InlineAsmLowering>(getTargetLowering());
209 Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
210 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
211 InstSelector =
212 std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
213 }
214
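// Number of SGPR or literal-constant operands a single VALU instruction may
// read via the constant bus. GFX10+ generally allows two, except for the
// 64-bit shift opcodes listed below, which are limited to one.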
215 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
216 if (getGeneration() < GFX10)
217 return 1;
218
219 switch (Opcode) {
220 case AMDGPU::V_LSHLREV_B64_e64:
221 case AMDGPU::V_LSHLREV_B64_gfx10:
222 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
223 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
224 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
225 case AMDGPU::V_LSHL_B64_e64:
226 case AMDGPU::V_LSHRREV_B64_e64:
227 case AMDGPU::V_LSHRREV_B64_gfx10:
228 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
229 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
230 case AMDGPU::V_LSHR_B64_e64:
231 case AMDGPU::V_ASHRREV_I64_e64:
232 case AMDGPU::V_ASHRREV_I64_gfx10:
233 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
234 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
235 case AMDGPU::V_ASHR_I64_e64:
236 return 1;
237 }
238
239 return 2;
240 }
241
242 /// This list was mostly derived from experimentation.
243 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
244 switch (Opcode) {
245 case AMDGPU::V_CVT_F16_F32_e32:
246 case AMDGPU::V_CVT_F16_F32_e64:
247 case AMDGPU::V_CVT_F16_U16_e32:
248 case AMDGPU::V_CVT_F16_U16_e64:
249 case AMDGPU::V_CVT_F16_I16_e32:
250 case AMDGPU::V_CVT_F16_I16_e64:
251 case AMDGPU::V_RCP_F16_e64:
252 case AMDGPU::V_RCP_F16_e32:
253 case AMDGPU::V_RSQ_F16_e64:
254 case AMDGPU::V_RSQ_F16_e32:
255 case AMDGPU::V_SQRT_F16_e64:
256 case AMDGPU::V_SQRT_F16_e32:
257 case AMDGPU::V_LOG_F16_e64:
258 case AMDGPU::V_LOG_F16_e32:
259 case AMDGPU::V_EXP_F16_e64:
260 case AMDGPU::V_EXP_F16_e32:
261 case AMDGPU::V_SIN_F16_e64:
262 case AMDGPU::V_SIN_F16_e32:
263 case AMDGPU::V_COS_F16_e64:
264 case AMDGPU::V_COS_F16_e32:
265 case AMDGPU::V_FLOOR_F16_e64:
266 case AMDGPU::V_FLOOR_F16_e32:
267 case AMDGPU::V_CEIL_F16_e64:
268 case AMDGPU::V_CEIL_F16_e32:
269 case AMDGPU::V_TRUNC_F16_e64:
270 case AMDGPU::V_TRUNC_F16_e32:
271 case AMDGPU::V_RNDNE_F16_e64:
272 case AMDGPU::V_RNDNE_F16_e32:
273 case AMDGPU::V_FRACT_F16_e64:
274 case AMDGPU::V_FRACT_F16_e32:
275 case AMDGPU::V_FREXP_MANT_F16_e64:
276 case AMDGPU::V_FREXP_MANT_F16_e32:
277 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
278 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
279 case AMDGPU::V_LDEXP_F16_e64:
280 case AMDGPU::V_LDEXP_F16_e32:
281 case AMDGPU::V_LSHLREV_B16_e64:
282 case AMDGPU::V_LSHLREV_B16_e32:
283 case AMDGPU::V_LSHRREV_B16_e64:
284 case AMDGPU::V_LSHRREV_B16_e32:
285 case AMDGPU::V_ASHRREV_I16_e64:
286 case AMDGPU::V_ASHRREV_I16_e32:
287 case AMDGPU::V_ADD_U16_e64:
288 case AMDGPU::V_ADD_U16_e32:
289 case AMDGPU::V_SUB_U16_e64:
290 case AMDGPU::V_SUB_U16_e32:
291 case AMDGPU::V_SUBREV_U16_e64:
292 case AMDGPU::V_SUBREV_U16_e32:
293 case AMDGPU::V_MUL_LO_U16_e64:
294 case AMDGPU::V_MUL_LO_U16_e32:
295 case AMDGPU::V_ADD_F16_e64:
296 case AMDGPU::V_ADD_F16_e32:
297 case AMDGPU::V_SUB_F16_e64:
298 case AMDGPU::V_SUB_F16_e32:
299 case AMDGPU::V_SUBREV_F16_e64:
300 case AMDGPU::V_SUBREV_F16_e32:
301 case AMDGPU::V_MUL_F16_e64:
302 case AMDGPU::V_MUL_F16_e32:
303 case AMDGPU::V_MAX_F16_e64:
304 case AMDGPU::V_MAX_F16_e32:
305 case AMDGPU::V_MIN_F16_e64:
306 case AMDGPU::V_MIN_F16_e32:
307 case AMDGPU::V_MAX_U16_e64:
308 case AMDGPU::V_MAX_U16_e32:
309 case AMDGPU::V_MIN_U16_e64:
310 case AMDGPU::V_MIN_U16_e32:
311 case AMDGPU::V_MAX_I16_e64:
312 case AMDGPU::V_MAX_I16_e32:
313 case AMDGPU::V_MIN_I16_e64:
314 case AMDGPU::V_MIN_I16_e32:
315 case AMDGPU::V_MAD_F16_e64:
316 case AMDGPU::V_MAD_U16_e64:
317 case AMDGPU::V_MAD_I16_e64:
318 case AMDGPU::V_FMA_F16_e64:
319 case AMDGPU::V_DIV_FIXUP_F16_e64:
320 // On gfx10, all 16-bit instructions preserve the high bits.
321 return getGeneration() <= AMDGPUSubtarget::GFX9;
322 case AMDGPU::V_MADAK_F16:
323 case AMDGPU::V_MADMK_F16:
324 case AMDGPU::V_MAC_F16_e64:
325 case AMDGPU::V_MAC_F16_e32:
326 case AMDGPU::V_FMAMK_F16:
327 case AMDGPU::V_FMAAK_F16:
328 case AMDGPU::V_FMAC_F16_e64:
329 case AMDGPU::V_FMAC_F16_e32:
330 // In gfx9, the preferred handling of the unused high 16 bits changed. Most
331 // instructions maintain the legacy behavior of zeroing. Some instructions
332 // changed to preserving the high bits.
333 return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
334 case AMDGPU::V_MAD_MIXLO_F16:
335 case AMDGPU::V_MAD_MIXHI_F16:
336 default:
337 return false;
338 }
339 }
340
341 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
342 // allows the given function to achieve an occupancy of NWaves waves per
343 // SIMD / EU, taking into account only the function's *maximum* workgroup size.
344 unsigned
345 AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
346 const Function &F) const {
347 const unsigned WaveSize = getWavefrontSize();
348 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
349 const unsigned WavesPerWorkgroup =
350 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
351
352 const unsigned WorkGroupsPerCU =
353 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
354
355 return getLocalMemorySize() / WorkGroupsPerCU;
356 }
357
358 // FIXME: Should return min,max range.
359 //
360 // Returns the maximum occupancy, in number of waves per SIMD / EU, that can
361 // be achieved when only the given function is running on the machine; and
362 // taking into account the overall number of wave slots, the (maximum) workgroup
363 // size, and the per-workgroup LDS allocation size.
364 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
365 const Function &F) const {
366 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
367 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
368 if (!MaxWorkGroupsPerCu)
369 return 0;
370
371 const unsigned WaveSize = getWavefrontSize();
372
373 // FIXME: Do we need to account for the alignment requirement of LDS by
374 // rounding the size up?
375 // Compute the restriction based on LDS usage.
376 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
377
378 // This can be queried with more LDS than is possible, so just assume the
379 // worst.
380 if (NumGroups == 0)
381 return 1;
382
383 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
384
385 // Round to the number of waves per CU.
386 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
387 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
388
389 // Number of waves per EU (SIMD).
390 MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
391
392 // Clamp to the maximum possible number of waves.
393 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
394
395 // FIXME: Needs to be a multiple of the group size?
396 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
397
398 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
399 "computed invalid occupancy");
400 return MaxWaves;
401 }
402
403 unsigned
404 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
405 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
406 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
407 }
408
409 std::pair<unsigned, unsigned>
410 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
411 switch (CC) {
412 case CallingConv::AMDGPU_VS:
413 case CallingConv::AMDGPU_LS:
414 case CallingConv::AMDGPU_HS:
415 case CallingConv::AMDGPU_ES:
416 case CallingConv::AMDGPU_GS:
417 case CallingConv::AMDGPU_PS:
418 return std::pair(1, getWavefrontSize());
419 default:
420 return std::pair(1u, getMaxFlatWorkGroupSize());
421 }
422 }
423
424 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
425 const Function &F) const {
426 // Default minimum/maximum flat work group sizes.
427 std::pair<unsigned, unsigned> Default =
428 getDefaultFlatWorkGroupSize(F.getCallingConv());
429
430 // Requested minimum/maximum flat work group sizes.
431 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
432 F, "amdgpu-flat-work-group-size", Default);
433
434 // Make sure requested minimum does not exceed requested maximum.
435 if (Requested.first > Requested.second)
436 return Default;
437
438 // Make sure requested values do not violate subtarget's specifications.
439 if (Requested.first < getMinFlatWorkGroupSize())
440 return Default;
441 if (Requested.second > getMaxFlatWorkGroupSize())
442 return Default;
443
444 return Requested;
445 }
446
447 std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
448 std::pair<unsigned, unsigned> Requested,
449 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
450 // Default minimum/maximum number of waves per execution unit.
451 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
452
453 // If minimum/maximum flat work group sizes were explicitly requested using
454 // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
455 // number of waves per execution unit to values implied by requested
456 // minimum/maximum flat work group sizes.
457 unsigned MinImpliedByFlatWorkGroupSize =
458 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
459 Default.first = MinImpliedByFlatWorkGroupSize;
460
461 // Make sure requested minimum does not exceed requested maximum.
462 if (Requested.second && Requested.first > Requested.second)
463 return Default;
464
465 // Make sure requested values do not violate subtarget's specifications.
466 if (Requested.first < getMinWavesPerEU() ||
467 Requested.second > getMaxWavesPerEU())
468 return Default;
469
470 // Make sure requested values are compatible with values implied by requested
471 // minimum/maximum flat work group sizes.
472 if (Requested.first < MinImpliedByFlatWorkGroupSize)
473 return Default;
474
475 return Requested;
476 }
477
478 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
479 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
480 // Default minimum/maximum number of waves per execution unit.
481 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
482
483 // Requested minimum/maximum number of waves per execution unit.
484 std::pair<unsigned, unsigned> Requested =
485 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
486 return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
487 }
488
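// Return the workgroup size required by the !reqd_work_group_size metadata in
// the given dimension, or UINT_MAX if the metadata is absent.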
489 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
490 auto Node = Kernel.getMetadata("reqd_work_group_size");
491 if (Node && Node->getNumOperands() == 3)
492 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
493 return std::numeric_limits<unsigned>::max();
494 }
495
496 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
497 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
498 }
499
500 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
501 unsigned Dimension) const {
502 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
503 if (ReqdSize != std::numeric_limits<unsigned>::max())
504 return ReqdSize - 1;
505 return getFlatWorkGroupSizes(Kernel).second - 1;
506 }
507
508 bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
509 for (int I = 0; I < 3; ++I) {
510 if (getMaxWorkitemID(Func, I) > 0)
511 return false;
512 }
513
514 return true;
515 }
516
517 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
518 Function *Kernel = I->getParent()->getParent();
519 unsigned MinSize = 0;
520 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
521 bool IdQuery = false;
522
523 // If reqd_work_group_size is present, it narrows the value down.
524 if (auto *CI = dyn_cast<CallInst>(I)) {
525 const Function *F = CI->getCalledFunction();
526 if (F) {
527 unsigned Dim = UINT_MAX;
528 switch (F->getIntrinsicID()) {
529 case Intrinsic::amdgcn_workitem_id_x:
530 case Intrinsic::r600_read_tidig_x:
531 IdQuery = true;
532 [[fallthrough]];
533 case Intrinsic::r600_read_local_size_x:
534 Dim = 0;
535 break;
536 case Intrinsic::amdgcn_workitem_id_y:
537 case Intrinsic::r600_read_tidig_y:
538 IdQuery = true;
539 [[fallthrough]];
540 case Intrinsic::r600_read_local_size_y:
541 Dim = 1;
542 break;
543 case Intrinsic::amdgcn_workitem_id_z:
544 case Intrinsic::r600_read_tidig_z:
545 IdQuery = true;
546 [[fallthrough]];
547 case Intrinsic::r600_read_local_size_z:
548 Dim = 2;
549 break;
550 default:
551 break;
552 }
553
554 if (Dim <= 3) {
555 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
556 if (ReqdSize != std::numeric_limits<unsigned>::max())
557 MinSize = MaxSize = ReqdSize;
558 }
559 }
560 }
561
562 if (!MaxSize)
563 return false;
564
565 // Range metadata is [Lo, Hi). For ID query we need to pass max size
566 // as Hi. For size query we need to pass Hi + 1.
567 if (IdQuery)
568 MinSize = 0;
569 else
570 ++MaxSize;
571
572 APInt Lower{32, MinSize};
573 APInt Upper{32, MaxSize};
574 if (auto *CI = dyn_cast<CallBase>(I)) {
575 ConstantRange Range(Lower, Upper);
576 CI->addRangeRetAttr(Range);
577 } else {
578 MDBuilder MDB(I->getContext());
579 MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
580 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
581 }
582 return true;
583 }
584
585 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
586 assert(AMDGPU::isKernel(F.getCallingConv()));
587
588 // We don't allocate the segment if we know the implicit arguments weren't
589 // used, even if the ABI implies we need them.
590 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
591 return 0;
592
593 if (isMesaKernel(F))
594 return 16;
595
596 // Assume all implicit inputs are used by default
597 const Module *M = F.getParent();
598 unsigned NBytes =
599 AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
600 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
601 NBytes);
602 }
603
604 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
605 Align &MaxAlign) const {
606 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
607 F.getCallingConv() == CallingConv::SPIR_KERNEL);
608
609 const DataLayout &DL = F.getDataLayout();
610 uint64_t ExplicitArgBytes = 0;
611 MaxAlign = Align(1);
612
613 for (const Argument &Arg : F.args()) {
614 const bool IsByRef = Arg.hasByRefAttr();
615 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
616 Align Alignment = DL.getValueOrABITypeAlignment(
617 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
618 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
619 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
620 MaxAlign = std::max(MaxAlign, Alignment);
621 }
622
623 return ExplicitArgBytes;
624 }
625
626 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
627 Align &MaxAlign) const {
628 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
629 F.getCallingConv() != CallingConv::SPIR_KERNEL)
630 return 0;
631
632 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
633
634 unsigned ExplicitOffset = getExplicitKernelArgOffset();
635
636 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
637 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
638 if (ImplicitBytes != 0) {
639 const Align Alignment = getAlignmentForImplicitArgPtr();
640 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
641 MaxAlign = std::max(MaxAlign, Alignment);
642 }
643
644 // Being able to dereference past the end is useful for emitting scalar loads.
645 return alignTo(TotalSize, 4);
646 }
647
648 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
649 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
650 : AMDGPUDwarfFlavour::Wave64;
651 }
652
653 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
654 unsigned NumRegionInstrs) const {
655 // Track register pressure so the scheduler can try to decrease
656 // pressure once register usage is above the threshold defined by
657 // SIRegisterInfo::getRegPressureSetLimit()
658 Policy.ShouldTrackPressure = true;
659
660 // Enabling both top down and bottom up scheduling seems to give us less
661 // register spills than just using one of these approaches on its own.
662 Policy.OnlyTopDown = false;
663 Policy.OnlyBottomUp = false;
664
665 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
666 if (!enableSIScheduler())
667 Policy.ShouldTrackLaneMasks = true;
668 }
669
670 void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
671 if (isWave32()) {
672 // Fix implicit $vcc operands after MIParser has verified that they match
673 // the instruction definitions.
674 for (auto &MBB : MF) {
675 for (auto &MI : MBB)
676 InstrInfo.fixImplicitOperands(MI);
677 }
678 }
679 }
680
681 bool GCNSubtarget::hasMadF16() const {
682 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
683 }
684
685 bool GCNSubtarget::useVGPRIndexMode() const {
686 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
687 }
688
689 bool GCNSubtarget::useAA() const { return UseAA; }
690
691 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
692 return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
693 getGeneration());
694 }
695
696 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
697 return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
698 }
699
700 unsigned
701 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
702 if (getGeneration() >= AMDGPUSubtarget::GFX10)
703 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
704
705 if (HasFlatScratch || HasArchitectedFlatScratch) {
706 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
707 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
708 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
709 return 4; // FLAT_SCRATCH, VCC (in that order).
710 }
711
712 if (isXNACKEnabled())
713 return 4; // XNACK, VCC (in that order).
714 return 2; // VCC.
715 }
716
717 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
718 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
719 return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
720 }
721
722 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
723 // In principle we do not need to reserve the SGPR pair used for flat_scratch
724 // if we know flat instructions do not access the stack anywhere in the
725 // program. For now assume it's needed if we have flat instructions.
726 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
727 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
728 }
729
730 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
731 unsigned NumSGPRs,
732 unsigned NumVGPRs) const {
733 unsigned Occupancy =
734 std::min(getMaxWavesPerEU(),
735 getOccupancyWithLocalMemSize(LDSSize, F));
736 if (NumSGPRs)
737 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
738 if (NumVGPRs)
739 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
740 return Occupancy;
741 }
742
743 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
744 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
745 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
746 // Compute the maximum number of SGPRs the function can use, given the
747 // default/requested minimum number of waves per execution unit.
748 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
749 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
750
751 // Check if maximum number of SGPRs was explicitly requested using
752 // "amdgpu-num-sgpr" attribute.
753 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
754 unsigned Requested =
755 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
756
757 // Make sure requested value does not violate subtarget's specifications.
758 if (Requested && (Requested <= ReservedNumSGPRs))
759 Requested = 0;
760
761 // If more SGPRs are required to support the input user/system SGPRs,
762 // increase to accommodate them.
763 //
764 // FIXME: This really ends up using the requested number of SGPRs + number
765 // of reserved special registers in total. Theoretically you could re-use
766 // the last input registers for these special registers, but this would
767 // require a lot of complexity to deal with the weird aliasing.
768 unsigned InputNumSGPRs = PreloadedSGPRs;
769 if (Requested && Requested < InputNumSGPRs)
770 Requested = InputNumSGPRs;
771
772 // Make sure requested value is compatible with values implied by
773 // default/requested minimum/maximum number of waves per execution unit.
774 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
775 Requested = 0;
776 if (WavesPerEU.second &&
777 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
778 Requested = 0;
779
780 if (Requested)
781 MaxNumSGPRs = Requested;
782 }
783
784 if (hasSGPRInitBug())
785 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
786
787 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
788 }
789
790 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
791 const Function &F = MF.getFunction();
792 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
793 return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
794 getReservedNumSGPRs(MF));
795 }
796
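// Conservative upper bound on the number of user, system, and synthetic SGPRs
// that could be preloaded for any kernel.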
797 static unsigned getMaxNumPreloadedSGPRs() {
798 using USI = GCNUserSGPRUsageInfo;
799 // Max number of user SGPRs
800 const unsigned MaxUserSGPRs =
801 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
802 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
803 USI::getNumUserSGPRForField(USI::QueuePtrID) +
804 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
805 USI::getNumUserSGPRForField(USI::DispatchIdID) +
806 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
807 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
808
809 // Max number of system SGPRs
810 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
811 1 + // WorkGroupIDY
812 1 + // WorkGroupIDZ
813 1 + // WorkGroupInfo
814 1; // private segment wave byte offset
815
816 // Max number of synthetic SGPRs
817 const unsigned SyntheticSGPRs = 1; // LDSKernelId
818
819 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
820 }
821
822 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
823 return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
824 getReservedNumSGPRs(F));
825 }
826
827 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
828 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
829 // Compute the maximum number of VGPRs the function can use, given the
830 // default/requested minimum number of waves per execution unit.
831 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
832
833 // Check if maximum number of VGPRs was explicitly requested using
834 // "amdgpu-num-vgpr" attribute.
835 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
836 unsigned Requested =
837 F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
838
839 if (hasGFX90AInsts())
840 Requested *= 2;
841
842 // Make sure requested value is compatible with values implied by
843 // default/requested minimum/maximum number of waves per execution unit.
844 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
845 Requested = 0;
846 if (WavesPerEU.second &&
847 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
848 Requested = 0;
849
850 if (Requested)
851 MaxNumVGPRs = Requested;
852 }
853
854 return MaxNumVGPRs;
855 }
856
857 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
858 return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
859 }
860
861 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
862 const Function &F = MF.getFunction();
863 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
864 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
865 }
866
867 void GCNSubtarget::adjustSchedDependency(
868 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
869 const TargetSchedModel *SchedModel) const {
870 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
871 !Def->isInstr() || !Use->isInstr())
872 return;
873
874 MachineInstr *DefI = Def->getInstr();
875 MachineInstr *UseI = Use->getInstr();
876
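// For bundled defs/uses, approximate the latency using the instruction inside
// the bundle that actually writes or reads the dependent register.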
877 if (DefI->isBundle()) {
878 const SIRegisterInfo *TRI = getRegisterInfo();
879 auto Reg = Dep.getReg();
880 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
881 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
882 unsigned Lat = 0;
883 for (++I; I != E && I->isBundledWithPred(); ++I) {
884 if (I->modifiesRegister(Reg, TRI))
885 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
886 else if (Lat)
887 --Lat;
888 }
889 Dep.setLatency(Lat);
890 } else if (UseI->isBundle()) {
891 const SIRegisterInfo *TRI = getRegisterInfo();
892 auto Reg = Dep.getReg();
893 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
894 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
895 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
896 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
897 if (I->readsRegister(Reg, TRI))
898 break;
899 --Lat;
900 }
901 Dep.setLatency(Lat);
902 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
903 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
904 // implicit operands which come from the MCInstrDesc, which can fool
905 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
906 // pseudo operands.
907 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
908 DefI, DefOpIdx, UseI, UseOpIdx));
909 }
910 }
911
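// DAG mutation that links independent SALU instructions behind long-latency
// MFMA instructions so their shadow is filled with scalar rather than vector
// work; see getPostRAMutations() and createFillMFMAShadowMutation() below.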
912 namespace {
913 struct FillMFMAShadowMutation : ScheduleDAGMutation {
914 const SIInstrInfo *TII;
915
916 ScheduleDAGMI *DAG;
917
918 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
919
920 bool isSALU(const SUnit *SU) const {
921 const MachineInstr *MI = SU->getInstr();
922 return MI && TII->isSALU(*MI) && !MI->isTerminator();
923 }
924
925 bool isVALU(const SUnit *SU) const {
926 const MachineInstr *MI = SU->getInstr();
927 return MI && TII->isVALU(*MI);
928 }
929
930 // Link as many SALU instructions in a chain as possible. Return the size
931 // of the chain. Links at most MaxChain instructions.
932 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
933 SmallPtrSetImpl<SUnit *> &Visited) const {
934 SmallVector<SUnit *, 8> Worklist({To});
935 unsigned Linked = 0;
936
937 while (!Worklist.empty() && MaxChain-- > 0) {
938 SUnit *SU = Worklist.pop_back_val();
939 if (!Visited.insert(SU).second)
940 continue;
941
942 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
943 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
944
945 if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
946 if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
947 ++Linked;
948
949 for (SDep &SI : From->Succs) {
950 SUnit *SUv = SI.getSUnit();
951 if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
952 DAG->canAddEdge(SUv, SU))
953 DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
954 }
955
956 for (SDep &SI : SU->Succs) {
957 SUnit *Succ = SI.getSUnit();
958 if (Succ != SU && isSALU(Succ))
959 Worklist.push_back(Succ);
960 }
961 }
962
963 return Linked;
964 }
965
966 void apply(ScheduleDAGInstrs *DAGInstrs) override {
967 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
968 if (!ST.hasMAIInsts())
969 return;
970 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
971 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
972 if (!TSchedModel || DAG->SUnits.empty())
973 return;
974
975 // Scan for long-latency MFMA instructions and try to add a dependency
976 // on available SALU instructions to give them a chance to fill the MFMA
977 // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
978 // is desirable to prevent power consumption bursts and throttling.
979 auto LastSALU = DAG->SUnits.begin();
980 auto E = DAG->SUnits.end();
981 SmallPtrSet<SUnit*, 32> Visited;
982 for (SUnit &SU : DAG->SUnits) {
983 MachineInstr &MAI = *SU.getInstr();
984 if (!TII->isMAI(MAI) ||
985 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
986 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
987 continue;
988
989 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
990
991 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
992 dbgs() << "Need " << Lat
993 << " instructions to cover latency.\n");
994
995 // Find up to Lat independent scalar instructions as early as
996 // possible such that they can be scheduled after this MFMA.
997 for ( ; Lat && LastSALU != E; ++LastSALU) {
998 if (Visited.count(&*LastSALU))
999 continue;
1000
1001 if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
1002 !DAG->canAddEdge(&*LastSALU, &SU))
1003 continue;
1004
1005 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1006 }
1007 }
1008 }
1009 };
1010 } // namespace
1011
1012 void GCNSubtarget::getPostRAMutations(
1013 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1014 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1015 }
1016
1017 std::unique_ptr<ScheduleDAGMutation>
1018 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1019 return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
1020 : nullptr;
1021 }
1022
1023 unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
1024 if (getGeneration() >= AMDGPUSubtarget::GFX12)
1025 return 0; // Not MIMG encoding.
1026
1027 if (NSAThreshold.getNumOccurrences() > 0)
1028 return std::max(NSAThreshold.getValue(), 2u);
1029
1030 int Value = MF.getFunction().getFnAttributeAsParsedInteger(
1031 "amdgpu-nsa-threshold", -1);
1032 if (Value > 0)
1033 return std::max(Value, 2);
1034
1035 return 3;
1036 }
1037
1038 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1039 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1040 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1041 return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
1042 }
1043
1044 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1045 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1046 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1047 return static_cast<const AMDGPUSubtarget &>(
1048 TM.getSubtarget<R600Subtarget>(F));
1049 }
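// Determine which user SGPR inputs the function requires based on its calling
// convention and amdgpu-no-* attributes, and count how many user SGPRs they
// occupy.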
1050
1051 GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
1052 const GCNSubtarget &ST)
1053 : ST(ST) {
1054 const CallingConv::ID CC = F.getCallingConv();
1055 const bool IsKernel =
1056 CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
1057 // FIXME: Should have analysis or something rather than attribute to detect
1058 // calls.
1059 const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
1060 // FIXME: This attribute is a hack, we just need an analysis on the function
1061 // to look for allocas.
1062 const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
1063
1064 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
1065 KernargSegmentPtr = true;
1066
1067 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
1068 if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
1069 PrivateSegmentBuffer = true;
1070 else if (ST.isMesaGfxShader(F))
1071 ImplicitBufferPtr = true;
1072
1073 if (!AMDGPU::isGraphics(CC)) {
1074 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
1075 DispatchPtr = true;
1076
1077 // FIXME: Can this always be disabled with < COv5?
1078 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
1079 QueuePtr = true;
1080
1081 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
1082 DispatchID = true;
1083 }
1084
1085 // TODO: This could be refined a lot. The attribute is a poor way of
1086 // detecting calls or stack objects that may require it before argument
1087 // lowering.
1088 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
1089 (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
1090 (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
1091 !ST.flatScratchIsArchitected()) {
1092 FlatScratchInit = true;
1093 }
1094
1095 if (hasImplicitBufferPtr())
1096 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
1097
1098 if (hasPrivateSegmentBuffer())
1099 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
1100
1101 if (hasDispatchPtr())
1102 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
1103
1104 if (hasQueuePtr())
1105 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
1106
1107 if (hasKernargSegmentPtr())
1108 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
1109
1110 if (hasDispatchID())
1111 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
1112
1113 if (hasFlatScratchInit())
1114 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1115
1116 if (hasPrivateSegmentSize())
1117 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
1118 }
1119
1120 void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
1121 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
1122 NumKernargPreloadSGPRs += NumSGPRs;
1123 NumUsedUserSGPRs += NumSGPRs;
1124 }
1125
1126 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
1127 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
1128 }
1129
1130 SmallVector<unsigned>
1131 AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
1132 return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
1133 }
1134