//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
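  // If the user asked for a specific wavefront size, explicitly clear the
  // sizes that were not requested so that only the requested one remains set
  // once FS is appended below.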
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

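// The constant bus is the shared read path for SGPRs and literal constants
// used as VALU operands. Targets before GFX10 allow only one constant bus
// read per instruction; GFX10 raises the limit to two, except for the 64-bit
// shift opcodes listed below, which remain limited to one.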
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

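// Scale the local memory size by the ratio of maximum waves to work groups
// per CU, then divide by the requested wave count. For example, with the
// 32768-byte default LDS size, MaxWaves = 10 and (hypothetically) 10 work
// groups per CU, NWaves = 2 yields 32768 * 10 / 10 / 2 = 16384 bytes.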
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

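// Estimate how many waves fit given an LDS budget: the scaled LDS limit
// (getLocalMemorySize() * MaxWaves / WorkGroupsPerCu) is divided by the
// requested byte count and clamped to the range [1, MaxWavesPerEU]. For
// example, Limit = 65536 and Bytes = 16384 gives 4 waves before clamping.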
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

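// Compute-like calling conventions default to a flat work group size range of
// [2 * wavefront size, max(4 * wavefront size, 256)], the graphics shader
// stages default to a single wavefront, and anything else may use up to
// 16 wavefronts per work group.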
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

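// Validate the "amdgpu-waves-per-eu" attribute against the subtarget limits
// and, when "amdgpu-flat-work-group-size" was also given, against the minimum
// wave count implied by the largest requested work group size; inconsistent
// requests fall back to the default range.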
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
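  // For example, with reqd_work_group_size = <64, 1, 1> and Dim = 0 this
  // becomes the range [0, 64) for a work item ID query and [64, 65) for a
  // local size query.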
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

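// Sum the in-memory sizes of the explicit kernel arguments, padding each
// argument to its ABI alignment. For example, an i32 argument followed by a
// double occupies alignTo(4, 8) + 8 = 16 bytes and reports MaxAlign = 8.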
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

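// Map an SGPR count to the number of waves per EU that can be resident at
// once. The breakpoints below reflect the per-generation SGPR budget; for
// example, on VI and newer a wave using at most 80 SGPRs still permits 10
// waves, while on older generations 80 SGPRs limits the EU to 6 waves.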
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

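// Round the VGPR count up to the allocation granule before computing how many
// waves fit in the register file. For example, with a hypothetical granule of
// 4 and 256 total VGPRs, a wave using 65 VGPRs is charged 68 registers and
// allows 256 / 68 = 3 waves.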
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

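// The overall occupancy is the tightest of the LDS, SGPR and VGPR limits; a
// zero register count means that resource does not constrain the result.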
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
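        // Make the second memory op depend on the first and mirror the
        // surrounding artificial edges so no other instruction can be
        // scheduled between the pair.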
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

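  // Return true if an artificial edge from Pred to Succ can be added without
  // creating a cycle: collect the transitive successors of Succ and verify
  // that no transitive predecessor of Pred is among them.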
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the number
  // of linked instructions. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add dependencies
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU instructions rather than VALU
    // helps prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}