xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (revision e2eeea75eb8b6dd50c1298067a0655880d186734)
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 GCNSubtarget::~GCNSubtarget() = default;
54 
55 R600Subtarget &
56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
57                                                StringRef GPU, StringRef FS) {
58   SmallString<256> FullFS("+promote-alloca,");
59   FullFS += FS;
60   ParseSubtargetFeatures(GPU, FullFS);
61 
62   HasMulU24 = getGeneration() >= EVERGREEN;
63   HasMulI24 = hasCaymanISA();
64 
65   return *this;
66 }
67 
68 GCNSubtarget &
69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
70                                               StringRef GPU, StringRef FS) {
71   // Determine default and user-specified characteristics
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
77   // Similarly we want enable-prt-strict-null to be on by default and not to
78   // unset everything else if it is disabled
79 
80   // Assuming ECC is enabled is the conservative default.
81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
82 
83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
87 
88   // Disable mutually exclusive bits.
89   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
90     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
91       FullFS += "-wavefrontsize16,";
92     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
93       FullFS += "-wavefrontsize32,";
94     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
95       FullFS += "-wavefrontsize64,";
96   }
97 
98   FullFS += FS;
99 
100   ParseSubtargetFeatures(GPU, FullFS);
101 
102   // We don't support FP64 for EG/NI atm.
103   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
104 
105   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
106   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
107   // variants of MUBUF instructions.
108   if (!hasAddr64() && !FS.contains("flat-for-global")) {
109     FlatForGlobal = true;
110   }
111 
112   // Set defaults if needed.
113   if (MaxPrivateElementSize == 0)
114     MaxPrivateElementSize = 4;
115 
116   if (LDSBankCount == 0)
117     LDSBankCount = 32;
118 
119   if (TT.getArch() == Triple::amdgcn) {
120     if (LocalMemorySize == 0)
121       LocalMemorySize = 32768;
122 
123     // Do something sensible for unspecified target.
124     if (!HasMovrel && !HasVGPRIndexMode)
125       HasMovrel = true;
126   }
127 
128   // Don't crash on invalid devices.
129   if (WavefrontSizeLog2 == 0)
130     WavefrontSizeLog2 = 5;
131 
132   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
133 
134   // Disable XNACK on targets where it is not enabled by default unless it is
135   // explicitly requested.
136   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
137     ToggleFeature(AMDGPU::FeatureXNACK);
138     EnableXNACK = false;
139   }
140 
141   // ECC is on by default, but turn it off if the hardware doesn't support it
142   // anyway. This matters for the gfx9 targets with d16 loads, but don't support
143   // ECC.
144   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
145     ToggleFeature(AMDGPU::FeatureSRAMECC);
146     EnableSRAMECC = false;
147   }
148 
149   return *this;
150 }
151 
152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
153   TargetTriple(TT),
154   Has16BitInsts(false),
155   HasMadMixInsts(false),
156   HasMadMacF32Insts(false),
157   HasDsSrc2Insts(false),
158   HasSDWA(false),
159   HasVOP3PInsts(false),
160   HasMulI24(true),
161   HasMulU24(true),
162   HasInv2PiInlineImm(false),
163   HasFminFmaxLegacy(true),
164   EnablePromoteAlloca(false),
165   HasTrigReducedRange(false),
166   MaxWavesPerEU(10),
167   LocalMemorySize(0),
168   WavefrontSizeLog2(0)
169   { }
170 
171 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
172                            const GCNTargetMachine &TM) :
173     AMDGPUGenSubtargetInfo(TT, GPU, FS),
174     AMDGPUSubtarget(TT),
175     TargetTriple(TT),
176     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
177     InstrItins(getInstrItineraryForCPU(GPU)),
178     LDSBankCount(0),
179     MaxPrivateElementSize(0),
180 
181     FastFMAF32(false),
182     FastDenormalF32(false),
183     HalfRate64Ops(false),
184 
185     FlatForGlobal(false),
186     AutoWaitcntBeforeBarrier(false),
187     CodeObjectV3(false),
188     UnalignedScratchAccess(false),
189     UnalignedBufferAccess(false),
190 
191     HasApertureRegs(false),
192     EnableXNACK(false),
193     DoesNotSupportXNACK(false),
194     EnableCuMode(false),
195     TrapHandler(false),
196 
197     EnableLoadStoreOpt(false),
198     EnableUnsafeDSOffsetFolding(false),
199     EnableSIScheduler(false),
200     EnableDS128(false),
201     EnablePRTStrictNull(false),
202     DumpCode(false),
203 
204     FP64(false),
205     GCN3Encoding(false),
206     CIInsts(false),
207     GFX8Insts(false),
208     GFX9Insts(false),
209     GFX10Insts(false),
210     GFX10_3Insts(false),
211     GFX7GFX8GFX9Insts(false),
212     SGPRInitBug(false),
213     HasSMemRealTime(false),
214     HasIntClamp(false),
215     HasFmaMixInsts(false),
216     HasMovrel(false),
217     HasVGPRIndexMode(false),
218     HasScalarStores(false),
219     HasScalarAtomics(false),
220     HasSDWAOmod(false),
221     HasSDWAScalar(false),
222     HasSDWASdst(false),
223     HasSDWAMac(false),
224     HasSDWAOutModsVOPC(false),
225     HasDPP(false),
226     HasDPP8(false),
227     HasR128A16(false),
228     HasGFX10A16(false),
229     HasG16(false),
230     HasNSAEncoding(false),
231     GFX10_BEncoding(false),
232     HasDLInsts(false),
233     HasDot1Insts(false),
234     HasDot2Insts(false),
235     HasDot3Insts(false),
236     HasDot4Insts(false),
237     HasDot5Insts(false),
238     HasDot6Insts(false),
239     HasMAIInsts(false),
240     HasPkFmacF16Inst(false),
241     HasAtomicFaddInsts(false),
242     EnableSRAMECC(false),
243     DoesNotSupportSRAMECC(false),
244     HasNoSdstCMPX(false),
245     HasVscnt(false),
246     HasGetWaveIdInst(false),
247     HasSMemTimeInst(false),
248     HasRegisterBanking(false),
249     HasVOP3Literal(false),
250     HasNoDataDepHazard(false),
251     FlatAddressSpace(false),
252     FlatInstOffsets(false),
253     FlatGlobalInsts(false),
254     FlatScratchInsts(false),
255     ScalarFlatScratchInsts(false),
256     AddNoCarryInsts(false),
257     HasUnpackedD16VMem(false),
258     LDSMisalignedBug(false),
259     HasMFMAInlineLiteralBug(false),
260 
261     ScalarizeGlobal(false),
262 
263     HasVcmpxPermlaneHazard(false),
264     HasVMEMtoScalarWriteHazard(false),
265     HasSMEMtoVectorWriteHazard(false),
266     HasInstFwdPrefetchBug(false),
267     HasVcmpxExecWARHazard(false),
268     HasLdsBranchVmemWARHazard(false),
269     HasNSAtoVMEMBug(false),
270     HasOffset3fBug(false),
271     HasFlatSegmentOffsetBug(false),
272 
273     FeatureDisable(false),
274     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
275     TLInfo(TM, *this),
276     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
277   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
278   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
279   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
280   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
281   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
282   InstSelector.reset(new AMDGPUInstructionSelector(
283   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
284 }
285 
286 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
287   if (getGeneration() < GFX10)
288     return 1;
289 
290   switch (Opcode) {
291   case AMDGPU::V_LSHLREV_B64:
292   case AMDGPU::V_LSHLREV_B64_gfx10:
293   case AMDGPU::V_LSHL_B64:
294   case AMDGPU::V_LSHRREV_B64:
295   case AMDGPU::V_LSHRREV_B64_gfx10:
296   case AMDGPU::V_LSHR_B64:
297   case AMDGPU::V_ASHRREV_I64:
298   case AMDGPU::V_ASHRREV_I64_gfx10:
299   case AMDGPU::V_ASHR_I64:
300     return 1;
301   }
302 
303   return 2;
304 }
305 
306 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
307   const Function &F) const {
308   if (NWaves == 1)
309     return getLocalMemorySize();
310   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
311   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
312   if (!WorkGroupsPerCu)
313     return 0;
314   unsigned MaxWaves = getMaxWavesPerEU();
315   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
316 }
317 
318 // FIXME: Should return min,max range.
319 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
320   const Function &F) const {
321   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
322   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
323   if (!MaxWorkGroupsPerCu)
324     return 0;
325 
326   const unsigned WaveSize = getWavefrontSize();
327 
328   // FIXME: Do we need to account for alignment requirement of LDS rounding the
329   // size up?
330   // Compute restriction based on LDS usage
331   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
332 
333   // This can be queried with more LDS than is possible, so just assume the
334   // worst.
335   if (NumGroups == 0)
336     return 1;
337 
338   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
339 
340   // Round to the number of waves.
341   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
342   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
343 
344   // Clamp to the maximum possible number of waves.
345   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
346 
347   // FIXME: Needs to be a multiple of the group size?
348   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
349 
350   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
351          "computed invalid occupancy");
352   return MaxWaves;
353 }
354 
355 unsigned
356 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
357   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
358   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
359 }
360 
361 std::pair<unsigned, unsigned>
362 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
363   switch (CC) {
364   case CallingConv::AMDGPU_VS:
365   case CallingConv::AMDGPU_LS:
366   case CallingConv::AMDGPU_HS:
367   case CallingConv::AMDGPU_ES:
368   case CallingConv::AMDGPU_GS:
369   case CallingConv::AMDGPU_PS:
370     return std::make_pair(1, getWavefrontSize());
371   default:
372     return std::make_pair(1u, getMaxFlatWorkGroupSize());
373   }
374 }
375 
376 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
377   const Function &F) const {
378   // Default minimum/maximum flat work group sizes.
379   std::pair<unsigned, unsigned> Default =
380     getDefaultFlatWorkGroupSize(F.getCallingConv());
381 
382   // Requested minimum/maximum flat work group sizes.
383   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
384     F, "amdgpu-flat-work-group-size", Default);
385 
386   // Make sure requested minimum is less than requested maximum.
387   if (Requested.first > Requested.second)
388     return Default;
389 
390   // Make sure requested values do not violate subtarget's specifications.
391   if (Requested.first < getMinFlatWorkGroupSize())
392     return Default;
393   if (Requested.second > getMaxFlatWorkGroupSize())
394     return Default;
395 
396   return Requested;
397 }
398 
399 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
400   const Function &F) const {
401   // Default minimum/maximum number of waves per execution unit.
402   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
403 
404   // Default/requested minimum/maximum flat work group sizes.
405   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
406 
407   // If minimum/maximum flat work group sizes were explicitly requested using
408   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
409   // number of waves per execution unit to values implied by requested
410   // minimum/maximum flat work group sizes.
411   unsigned MinImpliedByFlatWorkGroupSize =
412     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
413   Default.first = MinImpliedByFlatWorkGroupSize;
414   bool RequestedFlatWorkGroupSize =
415       F.hasFnAttribute("amdgpu-flat-work-group-size");
416 
417   // Requested minimum/maximum number of waves per execution unit.
418   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
419     F, "amdgpu-waves-per-eu", Default, true);
420 
421   // Make sure requested minimum is less than requested maximum.
422   if (Requested.second && Requested.first > Requested.second)
423     return Default;
424 
425   // Make sure requested values do not violate subtarget's specifications.
426   if (Requested.first < getMinWavesPerEU() ||
427       Requested.second > getMaxWavesPerEU())
428     return Default;
429 
430   // Make sure requested values are compatible with values implied by requested
431   // minimum/maximum flat work group sizes.
432   if (RequestedFlatWorkGroupSize &&
433       Requested.first < MinImpliedByFlatWorkGroupSize)
434     return Default;
435 
436   return Requested;
437 }
438 
439 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
440   Function *Kernel = I->getParent()->getParent();
441   unsigned MinSize = 0;
442   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
443   bool IdQuery = false;
444 
445   // If reqd_work_group_size is present it narrows value down.
446   if (auto *CI = dyn_cast<CallInst>(I)) {
447     const Function *F = CI->getCalledFunction();
448     if (F) {
449       unsigned Dim = UINT_MAX;
450       switch (F->getIntrinsicID()) {
451       case Intrinsic::amdgcn_workitem_id_x:
452       case Intrinsic::r600_read_tidig_x:
453         IdQuery = true;
454         LLVM_FALLTHROUGH;
455       case Intrinsic::r600_read_local_size_x:
456         Dim = 0;
457         break;
458       case Intrinsic::amdgcn_workitem_id_y:
459       case Intrinsic::r600_read_tidig_y:
460         IdQuery = true;
461         LLVM_FALLTHROUGH;
462       case Intrinsic::r600_read_local_size_y:
463         Dim = 1;
464         break;
465       case Intrinsic::amdgcn_workitem_id_z:
466       case Intrinsic::r600_read_tidig_z:
467         IdQuery = true;
468         LLVM_FALLTHROUGH;
469       case Intrinsic::r600_read_local_size_z:
470         Dim = 2;
471         break;
472       default:
473         break;
474       }
475       if (Dim <= 3) {
476         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
477           if (Node->getNumOperands() == 3)
478             MinSize = MaxSize = mdconst::extract<ConstantInt>(
479                                   Node->getOperand(Dim))->getZExtValue();
480       }
481     }
482   }
483 
484   if (!MaxSize)
485     return false;
486 
487   // Range metadata is [Lo, Hi). For ID query we need to pass max size
488   // as Hi. For size query we need to pass Hi + 1.
489   if (IdQuery)
490     MinSize = 0;
491   else
492     ++MaxSize;
493 
494   MDBuilder MDB(I->getContext());
495   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
496                                                   APInt(32, MaxSize));
497   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
498   return true;
499 }
500 
501 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
502                                                  Align &MaxAlign) const {
503   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
504          F.getCallingConv() == CallingConv::SPIR_KERNEL);
505 
506   const DataLayout &DL = F.getParent()->getDataLayout();
507   uint64_t ExplicitArgBytes = 0;
508   MaxAlign = Align(1);
509 
510   for (const Argument &Arg : F.args()) {
511     Type *ArgTy = Arg.getType();
512 
513     const Align Alignment = DL.getABITypeAlign(ArgTy);
514     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
515     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
516     MaxAlign = std::max(MaxAlign, Alignment);
517   }
518 
519   return ExplicitArgBytes;
520 }
521 
522 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
523                                                 Align &MaxAlign) const {
524   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
525 
526   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
527 
528   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
529   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
530   if (ImplicitBytes != 0) {
531     const Align Alignment = getAlignmentForImplicitArgPtr();
532     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
533   }
534 
535   // Being able to dereference past the end is useful for emitting scalar loads.
536   return alignTo(TotalSize, 4);
537 }
538 
539 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
540                              const TargetMachine &TM) :
541   R600GenSubtargetInfo(TT, GPU, FS),
542   AMDGPUSubtarget(TT),
543   InstrInfo(*this),
544   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
545   FMA(false),
546   CaymanISA(false),
547   CFALUBug(false),
548   HasVertexCache(false),
549   R600ALUInst(false),
550   FP64(false),
551   TexVTXClauseSize(0),
552   Gen(R600),
553   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
554   InstrItins(getInstrItineraryForCPU(GPU)) { }
555 
556 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
557                                       unsigned NumRegionInstrs) const {
558   // Track register pressure so the scheduler can try to decrease
559   // pressure once register usage is above the threshold defined by
560   // SIRegisterInfo::getRegPressureSetLimit()
561   Policy.ShouldTrackPressure = true;
562 
563   // Enabling both top down and bottom up scheduling seems to give us less
564   // register spills than just using one of these approaches on its own.
565   Policy.OnlyTopDown = false;
566   Policy.OnlyBottomUp = false;
567 
568   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
569   if (!enableSIScheduler())
570     Policy.ShouldTrackLaneMasks = true;
571 }
572 
573 bool GCNSubtarget::hasMadF16() const {
574   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
575 }
576 
577 bool GCNSubtarget::useVGPRIndexMode() const {
578   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
579 }
580 
581 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
582   if (getGeneration() >= AMDGPUSubtarget::GFX10)
583     return getMaxWavesPerEU();
584 
585   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
586     if (SGPRs <= 80)
587       return 10;
588     if (SGPRs <= 88)
589       return 9;
590     if (SGPRs <= 100)
591       return 8;
592     return 7;
593   }
594   if (SGPRs <= 48)
595     return 10;
596   if (SGPRs <= 56)
597     return 9;
598   if (SGPRs <= 64)
599     return 8;
600   if (SGPRs <= 72)
601     return 7;
602   if (SGPRs <= 80)
603     return 6;
604   return 5;
605 }
606 
607 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
608   unsigned MaxWaves = getMaxWavesPerEU();
609   unsigned Granule = getVGPRAllocGranule();
610   if (VGPRs < Granule)
611     return MaxWaves;
612   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
613   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
614 }
615 
616 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
617   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
618   if (getGeneration() >= AMDGPUSubtarget::GFX10)
619     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
620 
621   if (MFI.hasFlatScratchInit()) {
622     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
623       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
624     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
625       return 4; // FLAT_SCRATCH, VCC (in that order).
626   }
627 
628   if (isXNACKEnabled())
629     return 4; // XNACK, VCC (in that order).
630   return 2; // VCC.
631 }
632 
633 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
634                                         unsigned NumSGPRs,
635                                         unsigned NumVGPRs) const {
636   unsigned Occupancy =
637     std::min(getMaxWavesPerEU(),
638              getOccupancyWithLocalMemSize(LDSSize, F));
639   if (NumSGPRs)
640     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
641   if (NumVGPRs)
642     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
643   return Occupancy;
644 }
645 
646 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
647   const Function &F = MF.getFunction();
648   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
649 
650   // Compute maximum number of SGPRs function can use using default/requested
651   // minimum number of waves per execution unit.
652   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
653   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
654   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
655 
656   // Check if maximum number of SGPRs was explicitly requested using
657   // "amdgpu-num-sgpr" attribute.
658   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
659     unsigned Requested = AMDGPU::getIntegerAttribute(
660       F, "amdgpu-num-sgpr", MaxNumSGPRs);
661 
662     // Make sure requested value does not violate subtarget's specifications.
663     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
664       Requested = 0;
665 
666     // If more SGPRs are required to support the input user/system SGPRs,
667     // increase to accommodate them.
668     //
669     // FIXME: This really ends up using the requested number of SGPRs + number
670     // of reserved special registers in total. Theoretically you could re-use
671     // the last input registers for these special registers, but this would
672     // require a lot of complexity to deal with the weird aliasing.
673     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
674     if (Requested && Requested < InputNumSGPRs)
675       Requested = InputNumSGPRs;
676 
677     // Make sure requested value is compatible with values implied by
678     // default/requested minimum/maximum number of waves per execution unit.
679     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
680       Requested = 0;
681     if (WavesPerEU.second &&
682         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
683       Requested = 0;
684 
685     if (Requested)
686       MaxNumSGPRs = Requested;
687   }
688 
689   if (hasSGPRInitBug())
690     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
691 
692   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
693                   MaxAddressableNumSGPRs);
694 }
695 
696 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
697   const Function &F = MF.getFunction();
698   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
699 
700   // Compute maximum number of VGPRs function can use using default/requested
701   // minimum number of waves per execution unit.
702   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
703   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
704 
705   // Check if maximum number of VGPRs was explicitly requested using
706   // "amdgpu-num-vgpr" attribute.
707   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
708     unsigned Requested = AMDGPU::getIntegerAttribute(
709       F, "amdgpu-num-vgpr", MaxNumVGPRs);
710 
711     // Make sure requested value is compatible with values implied by
712     // default/requested minimum/maximum number of waves per execution unit.
713     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
714       Requested = 0;
715     if (WavesPerEU.second &&
716         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
717       Requested = 0;
718 
719     if (Requested)
720       MaxNumVGPRs = Requested;
721   }
722 
723   return MaxNumVGPRs;
724 }
725 
726 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
727                                          int UseOpIdx, SDep &Dep) const {
728   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
729       !Def->isInstr() || !Use->isInstr())
730     return;
731 
732   MachineInstr *DefI = Def->getInstr();
733   MachineInstr *UseI = Use->getInstr();
734 
735   if (DefI->isBundle()) {
736     const SIRegisterInfo *TRI = getRegisterInfo();
737     auto Reg = Dep.getReg();
738     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
739     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
740     unsigned Lat = 0;
741     for (++I; I != E && I->isBundledWithPred(); ++I) {
742       if (I->modifiesRegister(Reg, TRI))
743         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
744       else if (Lat)
745         --Lat;
746     }
747     Dep.setLatency(Lat);
748   } else if (UseI->isBundle()) {
749     const SIRegisterInfo *TRI = getRegisterInfo();
750     auto Reg = Dep.getReg();
751     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
752     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
753     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
754     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
755       if (I->readsRegister(Reg, TRI))
756         break;
757       --Lat;
758     }
759     Dep.setLatency(Lat);
760   }
761 }
762 
763 namespace {
764 struct FillMFMAShadowMutation : ScheduleDAGMutation {
765   const SIInstrInfo *TII;
766 
767   ScheduleDAGMI *DAG;
768 
769   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
770 
771   bool isSALU(const SUnit *SU) const {
772     const MachineInstr *MI = SU->getInstr();
773     return MI && TII->isSALU(*MI) && !MI->isTerminator();
774   }
775 
776   bool isVALU(const SUnit *SU) const {
777     const MachineInstr *MI = SU->getInstr();
778     return MI && TII->isVALU(*MI);
779   }
780 
781   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
782     if (Pred->NodeNum < Succ->NodeNum)
783       return true;
784 
785     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
786 
787     for (unsigned I = 0; I < Succs.size(); ++I) {
788       for (const SDep &SI : Succs[I]->Succs) {
789         const SUnit *SU = SI.getSUnit();
790         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
791           Succs.push_back(SU);
792       }
793     }
794 
795     SmallPtrSet<const SUnit*, 32> Visited;
796     while (!Preds.empty()) {
797       const SUnit *SU = Preds.pop_back_val();
798       if (llvm::find(Succs, SU) != Succs.end())
799         return false;
800       Visited.insert(SU);
801       for (const SDep &SI : SU->Preds)
802         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
803           Preds.push_back(SI.getSUnit());
804     }
805 
806     return true;
807   }
808 
809   // Link as much SALU intructions in chain as possible. Return the size
810   // of the chain. Links up to MaxChain instructions.
811   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
812                          SmallPtrSetImpl<SUnit *> &Visited) const {
813     SmallVector<SUnit *, 8> Worklist({To});
814     unsigned Linked = 0;
815 
816     while (!Worklist.empty() && MaxChain-- > 0) {
817       SUnit *SU = Worklist.pop_back_val();
818       if (!Visited.insert(SU).second)
819         continue;
820 
821       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
822                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
823 
824       if (SU->addPred(SDep(From, SDep::Artificial), false))
825         ++Linked;
826 
827       for (SDep &SI : From->Succs) {
828         SUnit *SUv = SI.getSUnit();
829         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
830           SUv->addPred(SDep(SU, SDep::Artificial), false);
831       }
832 
833       for (SDep &SI : SU->Succs) {
834         SUnit *Succ = SI.getSUnit();
835         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
836           Worklist.push_back(Succ);
837       }
838     }
839 
840     return Linked;
841   }
842 
843   void apply(ScheduleDAGInstrs *DAGInstrs) override {
844     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
845     if (!ST.hasMAIInsts() || DisablePowerSched)
846       return;
847     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
848     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
849     if (!TSchedModel || DAG->SUnits.empty())
850       return;
851 
852     // Scan for MFMA long latency instructions and try to add a dependency
853     // of available SALU instructions to give them a chance to fill MFMA
854     // shadow. That is desirable to fill MFMA shadow with SALU instructions
855     // rather than VALU to prevent power consumption bursts and throttle.
856     auto LastSALU = DAG->SUnits.begin();
857     auto E = DAG->SUnits.end();
858     SmallPtrSet<SUnit*, 32> Visited;
859     for (SUnit &SU : DAG->SUnits) {
860       MachineInstr &MAI = *SU.getInstr();
861       if (!TII->isMAI(MAI) ||
862            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
863            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
864         continue;
865 
866       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
867 
868       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
869                  dbgs() << "Need " << Lat
870                         << " instructions to cover latency.\n");
871 
872       // Find up to Lat independent scalar instructions as early as
873       // possible such that they can be scheduled after this MFMA.
874       for ( ; Lat && LastSALU != E; ++LastSALU) {
875         if (Visited.count(&*LastSALU))
876           continue;
877 
878         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
879           continue;
880 
881         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
882       }
883     }
884   }
885 };
886 } // namespace
887 
888 void GCNSubtarget::getPostRAMutations(
889     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
890   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
891 }
892 
893 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
894   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
895     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
896   else
897     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
898 }
899 
900 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
901   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
902     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
903   else
904     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
905 }
906