//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }
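  // For example (illustrative), a user feature string of "+wavefrontsize32"
  // leaves "-wavefrontsize16," and "-wavefrontsize64," in FullFS, so only the
  // explicitly requested wavefront size stays enabled.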

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
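  // Default to a wavefront size of 32 (2^5) if none was specified.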
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
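  // Before GFX10 a VALU instruction may read the constant bus (an SGPR or a
  // literal operand) at most once; GFX10 allows two reads, except for the
  // 64-bit shift opcodes below, which keep the single-use limit.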
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them; some changed
    // to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
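  // Illustrative only (assumed numbers): with 65536 bytes of LDS, MaxWaves = 8,
  // WorkGroupsPerCu = 4 and NWaves = 4, this allows 65536 * 8 / 4 / 4 = 32768
  // bytes of LDS per work-group.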
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
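  // Illustrative example (assumed numbers): 65536 bytes of LDS and Bytes =
  // 16384 give NumGroups = 4; with MaxWorkGroupSize = 256 and WaveSize = 64,
  // MaxGroupNumWaves = 4, so MaxWaves = 16 before the clamp below.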

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
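  // The attribute value is a "min,max" pair, e.g. (illustrative)
  // "amdgpu-flat-work-group-size"="128,256".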
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
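  // As above, the attribute holds a "min[,max]" pair, e.g. (illustrative)
  // "amdgpu-waves-per-eu"="2,4".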
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
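  // The node is expected to carry one i32 per dimension, e.g. (illustrative)
  // !{i32 64, i32 1, i32 1} for a kernel with a fixed 64x1x1 work-group size.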
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass MaxSize as Hi.
  // For a size query we need to pass MaxSize + 1 as Hi.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;
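  // E.g. (illustrative) with a required work-group size of 64 in the queried
  // dimension, an ID query gets the range [0, 64) while a size query gets
  // [64, 65).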

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }
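  // Illustrative example (assumed signature): for kernel arguments
  // (i32, <4 x float>) the i32 lands at offset 0 and the vector at offset 16,
  // giving ExplicitArgBytes = 32 and MaxAlign = 16.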

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
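  // Illustrative (assumed numbers): with a granule of 4 and 256 total VGPRs,
  // a request for 70 VGPRs rounds up to 72, giving
  // min(max(256 / 72, 1), MaxWaves) = min(3, MaxWaves).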
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs this function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
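  // 16 user SGPRs + 5 system SGPRs = 21 in total.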
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs this function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
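    // The scheduling DAG models a bundle as a single node; approximate the
    // latency as that of the bundled instruction which last defines Reg,
    // reduced by one for every bundled instruction issued after it.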
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
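    // Adding an artificial edge Pred -> Succ must not create a cycle, so check
    // that Succ cannot already reach Pred through existing successor edges.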
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Links up to
  // MaxChain instructions and returns the size of the chain.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
1125