xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define GET_SUBTARGETINFO_HEADER
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
30 namespace llvm {
31 
32 class GCNTargetMachine;
33 
34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35                            public AMDGPUSubtarget {
36 public:
37   using AMDGPUSubtarget::getMaxWavesPerEU;
38 
39   // Following 2 enums are documented at:
40   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
41   enum class TrapHandlerAbi {
42     NONE   = 0x00,
43     AMDHSA = 0x01,
44   };
45 
46   enum class TrapID {
47     LLVMAMDHSATrap      = 0x02,
48     LLVMAMDHSADebugTrap = 0x03,
49   };
50 
51 private:
52   /// GlobalISel related APIs.
53   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55   std::unique_ptr<InstructionSelector> InstSelector;
56   std::unique_ptr<LegalizerInfo> Legalizer;
57   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58 
59 protected:
60   // Basic subtarget description.
61   Triple TargetTriple;
62   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63   unsigned Gen = INVALID;
64   InstrItineraryData InstrItins;
65   int LDSBankCount = 0;
66   unsigned MaxPrivateElementSize = 0;
67 
68   // Possibly statically set by tablegen, but may want to be overridden.
69   bool FastDenormalF32 = false;
70   bool HalfRate64Ops = false;
71   bool FullRate64Ops = false;
72 
73   // Dynamically set bits that enable features.
74   bool FlatForGlobal = false;
75   bool AutoWaitcntBeforeBarrier = false;
76   bool BackOffBarrier = false;
77   bool UnalignedScratchAccess = false;
78   bool UnalignedAccessMode = false;
79   bool HasApertureRegs = false;
80   bool SupportsXNACK = false;
81   bool KernargPreload = false;
82 
83   // This should not be used directly. 'TargetID' tracks the dynamic settings
84   // for XNACK.
85   bool EnableXNACK = false;
86 
87   bool EnableTgSplit = false;
88   bool EnableCuMode = false;
89   bool TrapHandler = false;
90 
91   // Used as options.
92   bool EnableLoadStoreOpt = false;
93   bool EnableUnsafeDSOffsetFolding = false;
94   bool EnableSIScheduler = false;
95   bool EnableDS128 = false;
96   bool EnablePRTStrictNull = false;
97   bool DumpCode = false;
98 
99   // Subtarget properties statically set by tablegen.
100   bool FP64 = false;
101   bool FMA = false;
102   bool MIMG_R128 = false;
103   bool CIInsts = false;
104   bool GFX8Insts = false;
105   bool GFX9Insts = false;
106   bool GFX90AInsts = false;
107   bool GFX940Insts = false;
108   bool GFX10Insts = false;
109   bool GFX11Insts = false;
110   bool GFX12Insts = false;
111   bool GFX10_3Insts = false;
112   bool GFX7GFX8GFX9Insts = false;
113   bool SGPRInitBug = false;
114   bool UserSGPRInit16Bug = false;
115   bool NegativeScratchOffsetBug = false;
116   bool NegativeUnalignedScratchOffsetBug = false;
117   bool HasSMemRealTime = false;
118   bool HasIntClamp = false;
119   bool HasFmaMixInsts = false;
120   bool HasMovrel = false;
121   bool HasVGPRIndexMode = false;
122   bool HasScalarDwordx3Loads = false;
123   bool HasScalarStores = false;
124   bool HasScalarAtomics = false;
125   bool HasSDWAOmod = false;
126   bool HasSDWAScalar = false;
127   bool HasSDWASdst = false;
128   bool HasSDWAMac = false;
129   bool HasSDWAOutModsVOPC = false;
130   bool HasDPP = false;
131   bool HasDPP8 = false;
132   bool HasDPALU_DPP = false;
133   bool HasDPPSrc1SGPR = false;
134   bool HasPackedFP32Ops = false;
135   bool HasImageInsts = false;
136   bool HasExtendedImageInsts = false;
137   bool HasR128A16 = false;
138   bool HasA16 = false;
139   bool HasG16 = false;
140   bool HasNSAEncoding = false;
141   bool HasPartialNSAEncoding = false;
142   bool GFX10_AEncoding = false;
143   bool GFX10_BEncoding = false;
144   bool HasDLInsts = false;
145   bool HasFmacF64Inst = false;
146   bool HasDot1Insts = false;
147   bool HasDot2Insts = false;
148   bool HasDot3Insts = false;
149   bool HasDot4Insts = false;
150   bool HasDot5Insts = false;
151   bool HasDot6Insts = false;
152   bool HasDot7Insts = false;
153   bool HasDot8Insts = false;
154   bool HasDot9Insts = false;
155   bool HasDot10Insts = false;
156   bool HasMAIInsts = false;
157   bool HasFP8Insts = false;
158   bool HasPkFmacF16Inst = false;
159   bool HasAtomicDsPkAdd16Insts = false;
160   bool HasAtomicFlatPkAdd16Insts = false;
161   bool HasAtomicFaddRtnInsts = false;
162   bool HasAtomicFaddNoRtnInsts = false;
163   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
164   bool HasAtomicBufferGlobalPkAddF16Insts = false;
165   bool HasAtomicCSubNoRtnInsts = false;
166   bool HasAtomicGlobalPkAddBF16Inst = false;
167   bool HasFlatAtomicFaddF32Inst = false;
168   bool SupportsSRAMECC = false;
169 
170   // This should not be used directly. 'TargetID' tracks the dynamic settings
171   // for SRAMECC.
172   bool EnableSRAMECC = false;
173 
174   bool HasNoSdstCMPX = false;
175   bool HasVscnt = false;
176   bool HasGetWaveIdInst = false;
177   bool HasSMemTimeInst = false;
178   bool HasShaderCyclesRegister = false;
179   bool HasShaderCyclesHiLoRegisters = false;
180   bool HasVOP3Literal = false;
181   bool HasNoDataDepHazard = false;
182   bool FlatAddressSpace = false;
183   bool FlatInstOffsets = false;
184   bool FlatGlobalInsts = false;
185   bool FlatScratchInsts = false;
186   bool ScalarFlatScratchInsts = false;
187   bool HasArchitectedFlatScratch = false;
188   bool EnableFlatScratch = false;
189   bool HasArchitectedSGPRs = false;
190   bool HasGDS = false;
191   bool HasGWS = false;
192   bool AddNoCarryInsts = false;
193   bool HasUnpackedD16VMem = false;
194   bool LDSMisalignedBug = false;
195   bool HasMFMAInlineLiteralBug = false;
196   bool UnalignedBufferAccess = false;
197   bool UnalignedDSAccess = false;
198   bool HasPackedTID = false;
199   bool ScalarizeGlobal = false;
200   bool HasSALUFloatInsts = false;
201   bool HasVGPRSingleUseHintInsts = false;
202   bool HasPseudoScalarTrans = false;
203   bool HasRestrictedSOffset = false;
204 
205   bool HasVcmpxPermlaneHazard = false;
206   bool HasVMEMtoScalarWriteHazard = false;
207   bool HasSMEMtoVectorWriteHazard = false;
208   bool HasInstFwdPrefetchBug = false;
209   bool HasVcmpxExecWARHazard = false;
210   bool HasLdsBranchVmemWARHazard = false;
211   bool HasNSAtoVMEMBug = false;
212   bool HasNSAClauseBug = false;
213   bool HasOffset3fBug = false;
214   bool HasFlatSegmentOffsetBug = false;
215   bool HasImageStoreD16Bug = false;
216   bool HasImageGather4D16Bug = false;
217   bool HasMSAALoadDstSelBug = false;
218   bool HasGFX11FullVGPRs = false;
219   bool HasMADIntraFwdBug = false;
220   bool HasVOPDInsts = false;
221   bool HasVALUTransUseHazard = false;
222   bool HasForceStoreSC0SC1 = false;
223 
224   // Dummy feature to use for assembler in tablegen.
225   bool FeatureDisable = false;
226 
227   SelectionDAGTargetInfo TSInfo;
228 private:
229   SIInstrInfo InstrInfo;
230   SITargetLowering TLInfo;
231   SIFrameLowering FrameLowering;
232 
233 public:
234   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
235                const GCNTargetMachine &TM);
236   ~GCNSubtarget() override;
237 
238   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
239                                                    StringRef GPU, StringRef FS);
240 
241   const SIInstrInfo *getInstrInfo() const override {
242     return &InstrInfo;
243   }
244 
245   const SIFrameLowering *getFrameLowering() const override {
246     return &FrameLowering;
247   }
248 
249   const SITargetLowering *getTargetLowering() const override {
250     return &TLInfo;
251   }
252 
253   const SIRegisterInfo *getRegisterInfo() const override {
254     return &InstrInfo.getRegisterInfo();
255   }
256 
257   const CallLowering *getCallLowering() const override {
258     return CallLoweringInfo.get();
259   }
260 
261   const InlineAsmLowering *getInlineAsmLowering() const override {
262     return InlineAsmLoweringInfo.get();
263   }
264 
265   InstructionSelector *getInstructionSelector() const override {
266     return InstSelector.get();
267   }
268 
269   const LegalizerInfo *getLegalizerInfo() const override {
270     return Legalizer.get();
271   }
272 
273   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
274     return RegBankInfo.get();
275   }
276 
277   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
278     return TargetID;
279   }
280 
281   // Nothing implemented, just prevent crashes on use.
282   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
283     return &TSInfo;
284   }
285 
286   const InstrItineraryData *getInstrItineraryData() const override {
287     return &InstrItins;
288   }
289 
290   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
291 
292   Generation getGeneration() const {
293     return (Generation)Gen;
294   }
295 
296   unsigned getMaxWaveScratchSize() const {
297     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
298     if (getGeneration() < GFX11) {
299       // 13-bit field in units of 256-dword.
300       return (256 * 4) * ((1 << 13) - 1);
301     }
302     // 15-bit field in units of 64-dword.
303     return (64 * 4) * ((1 << 15) - 1);
304   }
305 
306   /// Return the number of high bits known to be zero for a frame index.
307   unsigned getKnownHighZeroBitsForFrameIndex() const {
308     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
309   }
310 
311   int getLDSBankCount() const {
312     return LDSBankCount;
313   }
314 
315   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
316     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
317   }
318 
319   unsigned getConstantBusLimit(unsigned Opcode) const;
320 
321   /// Returns true if an instruction that produces a 16-bit result in a 32-bit
322   /// register implicitly zeroes the high 16 bits rather than preserving their
323   /// original value.
324   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
325 
326   bool supportsWGP() const { return getGeneration() >= GFX10; }
327 
328   bool hasIntClamp() const {
329     return HasIntClamp;
330   }
331 
332   bool hasFP64() const {
333     return FP64;
334   }
335 
336   bool hasMIMG_R128() const {
337     return MIMG_R128;
338   }
339 
340   bool hasHWFP64() const {
341     return FP64;
342   }
343 
344   bool hasHalfRate64Ops() const {
345     return HalfRate64Ops;
346   }
347 
348   bool hasFullRate64Ops() const {
349     return FullRate64Ops;
350   }
351 
352   bool hasAddr64() const {
353     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
354   }
355 
356   bool hasFlat() const {
357     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
358   }
359 
360   // Return true if the target only has the reverse operand versions of VALU
361   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
362   bool hasOnlyRevVALUShifts() const {
363     return getGeneration() >= VOLCANIC_ISLANDS;
364   }
365 
366   bool hasFractBug() const {
367     return getGeneration() == SOUTHERN_ISLANDS;
368   }
369 
370   bool hasBFE() const {
371     return true;
372   }
373 
374   bool hasBFI() const {
375     return true;
376   }
377 
378   bool hasBFM() const {
379     return hasBFE();
380   }
381 
382   bool hasBCNT(unsigned Size) const {
383     return true;
384   }
385 
386   bool hasFFBL() const {
387     return true;
388   }
389 
390   bool hasFFBH() const {
391     return true;
392   }
393 
394   bool hasMed3_16() const {
395     return getGeneration() >= AMDGPUSubtarget::GFX9;
396   }
397 
398   bool hasMin3Max3_16() const {
399     return getGeneration() >= AMDGPUSubtarget::GFX9;
400   }
401 
402   bool hasFmaMixInsts() const {
403     return HasFmaMixInsts;
404   }
405 
406   bool hasCARRY() const {
407     return true;
408   }
409 
410   bool hasFMA() const {
411     return FMA;
412   }
413 
414   bool hasSwap() const {
415     return GFX9Insts;
416   }
417 
418   bool hasScalarPackInsts() const {
419     return GFX9Insts;
420   }
421 
422   bool hasScalarMulHiInsts() const {
423     return GFX9Insts;
424   }
425 
426   TrapHandlerAbi getTrapHandlerAbi() const {
427     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
428   }
429 
430   bool supportsGetDoorbellID() const {
431     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
432     return getGeneration() >= GFX9;
433   }
434 
435   /// True if the offset field of DS instructions works as expected. On SI, the
436   /// offset uses a 16-bit adder and does not always wrap properly.
437   bool hasUsableDSOffset() const {
438     return getGeneration() >= SEA_ISLANDS;
439   }
440 
441   bool unsafeDSOffsetFoldingEnabled() const {
442     return EnableUnsafeDSOffsetFolding;
443   }
444 
445   /// Condition output from div_scale is usable.
446   bool hasUsableDivScaleConditionOutput() const {
447     return getGeneration() != SOUTHERN_ISLANDS;
448   }
449 
450   /// Extra wait hazard is needed in some cases before
451   /// s_cbranch_vccnz/s_cbranch_vccz.
452   bool hasReadVCCZBug() const {
453     return getGeneration() <= SEA_ISLANDS;
454   }
455 
456   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
457   bool partialVCCWritesUpdateVCCZ() const {
458     return getGeneration() >= GFX10;
459   }
460 
461   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
462   /// was written by a VALU instruction.
463   bool hasSMRDReadVALUDefHazard() const {
464     return getGeneration() == SOUTHERN_ISLANDS;
465   }
466 
467   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
468   /// SGPR was written by a VALU Instruction.
469   bool hasVMEMReadSGPRVALUDefHazard() const {
470     return getGeneration() >= VOLCANIC_ISLANDS;
471   }
472 
473   bool hasRFEHazards() const {
474     return getGeneration() >= VOLCANIC_ISLANDS;
475   }
476 
477   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
478   unsigned getSetRegWaitStates() const {
479     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
480   }
481 
482   bool dumpCode() const {
483     return DumpCode;
484   }
485 
486   /// Return the amount of LDS that can be used that will not restrict the
487   /// occupancy lower than WaveCount.
488   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
489                                            const Function &) const;
490 
491   bool supportsMinMaxDenormModes() const {
492     return getGeneration() >= AMDGPUSubtarget::GFX9;
493   }
494 
495   /// \returns If target supports S_DENORM_MODE.
496   bool hasDenormModeInst() const {
497     return getGeneration() >= AMDGPUSubtarget::GFX10;
498   }
499 
500   bool useFlatForGlobal() const {
501     return FlatForGlobal;
502   }
503 
504   /// \returns If target supports ds_read/write_b128 and user enables generation
505   /// of ds_read/write_b128.
506   bool useDS128() const {
507     return CIInsts && EnableDS128;
508   }
509 
510   /// \return If target supports ds_read/write_b96/128.
511   bool hasDS96AndDS128() const {
512     return CIInsts;
513   }
514 
515   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
516   bool haveRoundOpsF64() const {
517     return CIInsts;
518   }
519 
520   /// \returns If MUBUF instructions always perform range checking, even for
521   /// buffer resources used for private memory access.
522   bool privateMemoryResourceIsRangeChecked() const {
523     return getGeneration() < AMDGPUSubtarget::GFX9;
524   }
525 
526   /// \returns If target requires PRT Struct NULL support (zero result registers
527   /// for sparse texture support).
528   bool usePRTStrictNull() const {
529     return EnablePRTStrictNull;
530   }
531 
532   bool hasAutoWaitcntBeforeBarrier() const {
533     return AutoWaitcntBeforeBarrier;
534   }
535 
536   /// \returns true if the target supports backing off of s_barrier instructions
537   /// when an exception is raised.
538   bool supportsBackOffBarrier() const {
539     return BackOffBarrier;
540   }
541 
542   bool hasUnalignedBufferAccess() const {
543     return UnalignedBufferAccess;
544   }
545 
546   bool hasUnalignedBufferAccessEnabled() const {
547     return UnalignedBufferAccess && UnalignedAccessMode;
548   }
549 
550   bool hasUnalignedDSAccess() const {
551     return UnalignedDSAccess;
552   }
553 
554   bool hasUnalignedDSAccessEnabled() const {
555     return UnalignedDSAccess && UnalignedAccessMode;
556   }
557 
558   bool hasUnalignedScratchAccess() const {
559     return UnalignedScratchAccess;
560   }
561 
562   bool hasUnalignedAccessMode() const {
563     return UnalignedAccessMode;
564   }
565 
566   bool hasApertureRegs() const {
567     return HasApertureRegs;
568   }
569 
570   bool isTrapHandlerEnabled() const {
571     return TrapHandler;
572   }
573 
574   bool isXNACKEnabled() const {
575     return TargetID.isXnackOnOrAny();
576   }
577 
578   bool isTgSplitEnabled() const {
579     return EnableTgSplit;
580   }
581 
582   bool isCuModeEnabled() const {
583     return EnableCuMode;
584   }
585 
586   bool hasFlatAddressSpace() const {
587     return FlatAddressSpace;
588   }
589 
590   bool hasFlatScrRegister() const {
591     return hasFlatAddressSpace();
592   }
593 
594   bool hasFlatInstOffsets() const {
595     return FlatInstOffsets;
596   }
597 
598   bool hasFlatGlobalInsts() const {
599     return FlatGlobalInsts;
600   }
601 
602   bool hasFlatScratchInsts() const {
603     return FlatScratchInsts;
604   }
605 
606   // Check if target supports ST addressing mode with FLAT scratch instructions.
607   // The ST addressing mode means no registers are used, either VGPR or SGPR,
608   // but only immediate offset is swizzled and added to the FLAT scratch base.
609   bool hasFlatScratchSTMode() const {
610     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
611   }
612 
613   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
614 
615   bool hasScalarFlatScratchInsts() const {
616     return ScalarFlatScratchInsts;
617   }
618 
619   bool enableFlatScratch() const {
620     return flatScratchIsArchitected() ||
621            (EnableFlatScratch && hasFlatScratchInsts());
622   }
623 
624   bool hasGlobalAddTidInsts() const {
625     return GFX10_BEncoding;
626   }
627 
628   bool hasAtomicCSub() const {
629     return GFX10_BEncoding;
630   }
631 
632   bool hasMultiDwordFlatScratchAddressing() const {
633     return getGeneration() >= GFX9;
634   }
635 
636   bool hasFlatSegmentOffsetBug() const {
637     return HasFlatSegmentOffsetBug;
638   }
639 
640   bool hasFlatLgkmVMemCountInOrder() const {
641     return getGeneration() > GFX9;
642   }
643 
644   bool hasD16LoadStore() const {
645     return getGeneration() >= GFX9;
646   }
647 
648   bool d16PreservesUnusedBits() const {
649     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
650   }
651 
652   bool hasD16Images() const {
653     return getGeneration() >= VOLCANIC_ISLANDS;
654   }
655 
656   /// Return if most LDS instructions have an m0 use that require m0 to be
657   /// initialized.
658   bool ldsRequiresM0Init() const {
659     return getGeneration() < GFX9;
660   }
661 
662   // True if the hardware rewinds and replays GWS operations if a wave is
663   // preempted.
664   //
665   // If this is false, a GWS operation requires testing if a nack set the
666   // MEM_VIOL bit, and repeating if so.
667   bool hasGWSAutoReplay() const {
668     return getGeneration() >= GFX9;
669   }
670 
671   /// \returns if target has ds_gws_sema_release_all instruction.
672   bool hasGWSSemaReleaseAll() const {
673     return CIInsts;
674   }
675 
676   /// \returns true if the target has integer add/sub instructions that do not
677   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
678   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
679   /// for saturation.
680   bool hasAddNoCarry() const {
681     return AddNoCarryInsts;
682   }
683 
684   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
685 
686   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
687 
688   bool hasUnpackedD16VMem() const {
689     return HasUnpackedD16VMem;
690   }
691 
692   // Covers VS/PS/CS graphics shaders
693   bool isMesaGfxShader(const Function &F) const {
694     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
695   }
696 
697   bool hasMad64_32() const {
698     return getGeneration() >= SEA_ISLANDS;
699   }
700 
701   bool hasSDWAOmod() const {
702     return HasSDWAOmod;
703   }
704 
705   bool hasSDWAScalar() const {
706     return HasSDWAScalar;
707   }
708 
709   bool hasSDWASdst() const {
710     return HasSDWASdst;
711   }
712 
713   bool hasSDWAMac() const {
714     return HasSDWAMac;
715   }
716 
717   bool hasSDWAOutModsVOPC() const {
718     return HasSDWAOutModsVOPC;
719   }
720 
721   bool hasDLInsts() const {
722     return HasDLInsts;
723   }
724 
725   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
726 
727   bool hasDot1Insts() const {
728     return HasDot1Insts;
729   }
730 
731   bool hasDot2Insts() const {
732     return HasDot2Insts;
733   }
734 
735   bool hasDot3Insts() const {
736     return HasDot3Insts;
737   }
738 
739   bool hasDot4Insts() const {
740     return HasDot4Insts;
741   }
742 
743   bool hasDot5Insts() const {
744     return HasDot5Insts;
745   }
746 
747   bool hasDot6Insts() const {
748     return HasDot6Insts;
749   }
750 
751   bool hasDot7Insts() const {
752     return HasDot7Insts;
753   }
754 
755   bool hasDot8Insts() const {
756     return HasDot8Insts;
757   }
758 
759   bool hasDot9Insts() const {
760     return HasDot9Insts;
761   }
762 
763   bool hasDot10Insts() const {
764     return HasDot10Insts;
765   }
766 
767   bool hasMAIInsts() const {
768     return HasMAIInsts;
769   }
770 
771   bool hasFP8Insts() const {
772     return HasFP8Insts;
773   }
774 
775   bool hasPkFmacF16Inst() const {
776     return HasPkFmacF16Inst;
777   }
778 
779   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
780 
781   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
782 
783   bool hasAtomicFaddInsts() const {
784     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
785   }
786 
787   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
788 
789   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
790 
791   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
792     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
793   }
794 
795   bool hasAtomicBufferGlobalPkAddF16Insts() const {
796     return HasAtomicBufferGlobalPkAddF16Insts;
797   }
798 
799   bool hasAtomicGlobalPkAddBF16Inst() const {
800     return HasAtomicGlobalPkAddBF16Inst;
801   }
802 
803   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
804 
805   bool hasNoSdstCMPX() const {
806     return HasNoSdstCMPX;
807   }
808 
809   bool hasVscnt() const {
810     return HasVscnt;
811   }
812 
813   bool hasGetWaveIdInst() const {
814     return HasGetWaveIdInst;
815   }
816 
817   bool hasSMemTimeInst() const {
818     return HasSMemTimeInst;
819   }
820 
821   bool hasShaderCyclesRegister() const {
822     return HasShaderCyclesRegister;
823   }
824 
825   bool hasShaderCyclesHiLoRegisters() const {
826     return HasShaderCyclesHiLoRegisters;
827   }
828 
829   bool hasVOP3Literal() const {
830     return HasVOP3Literal;
831   }
832 
833   bool hasNoDataDepHazard() const {
834     return HasNoDataDepHazard;
835   }
836 
837   bool vmemWriteNeedsExpWaitcnt() const {
838     return getGeneration() < SEA_ISLANDS;
839   }
840 
841   bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
842 
843   bool hasPrefetch() const { return GFX12Insts; }
844 
845   // Has s_cmpk_* instructions.
846   bool hasSCmpK() const { return getGeneration() < GFX12; }
847 
848   // Scratch is allocated in 256 dword per wave blocks for the entire
849   // wavefront. When viewed from the perspective of an arbitrary workitem, this
850   // is 4-byte aligned.
851   //
852   // Only 4-byte alignment is really needed to access anything. Transformations
853   // on the pointer value itself may rely on the alignment / known low bits of
854   // the pointer. Set this to something above the minimum to avoid needing
855   // dynamic realignment in common cases.
856   Align getStackAlignment() const { return Align(16); }
857 
858   bool enableMachineScheduler() const override {
859     return true;
860   }
861 
862   bool useAA() const override;
863 
864   bool enableSubRegLiveness() const override {
865     return true;
866   }
867 
868   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
869   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
870 
871   // static wrappers
872   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
873 
874   // XXX - Why is this here if it isn't in the default pass set?
875   bool enableEarlyIfConversion() const override {
876     return true;
877   }
878 
879   void overrideSchedPolicy(MachineSchedPolicy &Policy,
880                            unsigned NumRegionInstrs) const override;
881 
882   unsigned getMaxNumUserSGPRs() const {
883     return AMDGPU::getMaxNumUserSGPRs(*this);
884   }
885 
886   bool hasSMemRealTime() const {
887     return HasSMemRealTime;
888   }
889 
890   bool hasMovrel() const {
891     return HasMovrel;
892   }
893 
894   bool hasVGPRIndexMode() const {
895     return HasVGPRIndexMode;
896   }
897 
898   bool useVGPRIndexMode() const;
899 
900   bool hasScalarCompareEq64() const {
901     return getGeneration() >= VOLCANIC_ISLANDS;
902   }
903 
904   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
905 
906   bool hasScalarStores() const {
907     return HasScalarStores;
908   }
909 
910   bool hasScalarAtomics() const {
911     return HasScalarAtomics;
912   }
913 
914   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
915 
916   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
917   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
918 
919   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
920   bool hasPermLane64() const { return getGeneration() >= GFX11; }
921 
922   bool hasDPP() const {
923     return HasDPP;
924   }
925 
926   bool hasDPPBroadcasts() const {
927     return HasDPP && getGeneration() < GFX10;
928   }
929 
930   bool hasDPPWavefrontShifts() const {
931     return HasDPP && getGeneration() < GFX10;
932   }
933 
934   bool hasDPP8() const {
935     return HasDPP8;
936   }
937 
938   bool hasDPALU_DPP() const {
939     return HasDPALU_DPP;
940   }
941 
942   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
943 
944   bool hasPackedFP32Ops() const {
945     return HasPackedFP32Ops;
946   }
947 
948   // Has V_PK_MOV_B32 opcode
949   bool hasPkMovB32() const {
950     return GFX90AInsts;
951   }
952 
953   bool hasFmaakFmamkF32Insts() const {
954     return getGeneration() >= GFX10 || hasGFX940Insts();
955   }
956 
957   bool hasImageInsts() const {
958     return HasImageInsts;
959   }
960 
961   bool hasExtendedImageInsts() const {
962     return HasExtendedImageInsts;
963   }
964 
965   bool hasR128A16() const {
966     return HasR128A16;
967   }
968 
969   bool hasA16() const { return HasA16; }
970 
971   bool hasG16() const { return HasG16; }
972 
973   bool hasOffset3fBug() const {
974     return HasOffset3fBug;
975   }
976 
977   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
978 
979   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
980 
981   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
982 
983   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
984 
985   bool hasNSAEncoding() const { return HasNSAEncoding; }
986 
987   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
988 
989   unsigned getNSAMaxSize(bool HasSampler = false) const {
990     return AMDGPU::getNSAMaxSize(*this, HasSampler);
991   }
992 
993   bool hasGFX10_AEncoding() const {
994     return GFX10_AEncoding;
995   }
996 
997   bool hasGFX10_BEncoding() const {
998     return GFX10_BEncoding;
999   }
1000 
1001   bool hasGFX10_3Insts() const {
1002     return GFX10_3Insts;
1003   }
1004 
1005   bool hasMadF16() const;
1006 
1007   bool hasMovB64() const { return GFX940Insts; }
1008 
1009   bool hasLshlAddB64() const { return GFX940Insts; }
1010 
1011   bool enableSIScheduler() const {
1012     return EnableSIScheduler;
1013   }
1014 
1015   bool loadStoreOptEnabled() const {
1016     return EnableLoadStoreOpt;
1017   }
1018 
1019   bool hasSGPRInitBug() const {
1020     return SGPRInitBug;
1021   }
1022 
1023   bool hasUserSGPRInit16Bug() const {
1024     return UserSGPRInit16Bug && isWave32();
1025   }
1026 
1027   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1028 
1029   bool hasNegativeUnalignedScratchOffsetBug() const {
1030     return NegativeUnalignedScratchOffsetBug;
1031   }
1032 
1033   bool hasMFMAInlineLiteralBug() const {
1034     return HasMFMAInlineLiteralBug;
1035   }
1036 
1037   bool has12DWordStoreHazard() const {
1038     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1039   }
1040 
1041   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1042   bool hasDwordx3LoadStores() const {
1043     return CIInsts;
1044   }
1045 
1046   bool hasReadM0MovRelInterpHazard() const {
1047     return getGeneration() == AMDGPUSubtarget::GFX9;
1048   }
1049 
1050   bool hasReadM0SendMsgHazard() const {
1051     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1052            getGeneration() <= AMDGPUSubtarget::GFX9;
1053   }
1054 
1055   bool hasReadM0LdsDmaHazard() const {
1056     return getGeneration() == AMDGPUSubtarget::GFX9;
1057   }
1058 
1059   bool hasReadM0LdsDirectHazard() const {
1060     return getGeneration() == AMDGPUSubtarget::GFX9;
1061   }
1062 
1063   bool hasVcmpxPermlaneHazard() const {
1064     return HasVcmpxPermlaneHazard;
1065   }
1066 
1067   bool hasVMEMtoScalarWriteHazard() const {
1068     return HasVMEMtoScalarWriteHazard;
1069   }
1070 
1071   bool hasSMEMtoVectorWriteHazard() const {
1072     return HasSMEMtoVectorWriteHazard;
1073   }
1074 
1075   bool hasLDSMisalignedBug() const {
1076     return LDSMisalignedBug && !EnableCuMode;
1077   }
1078 
1079   bool hasInstFwdPrefetchBug() const {
1080     return HasInstFwdPrefetchBug;
1081   }
1082 
1083   bool hasVcmpxExecWARHazard() const {
1084     return HasVcmpxExecWARHazard;
1085   }
1086 
1087   bool hasLdsBranchVmemWARHazard() const {
1088     return HasLdsBranchVmemWARHazard;
1089   }
1090 
1091   // Shift amount of a 64 bit shift cannot be a highest allocated register
1092   // if also at the end of the allocation block.
1093   bool hasShift64HighRegBug() const {
1094     return GFX90AInsts && !GFX940Insts;
1095   }
1096 
1097   // Has one cycle hazard on transcendental instruction feeding a
1098   // non transcendental VALU.
1099   bool hasTransForwardingHazard() const { return GFX940Insts; }
1100 
1101   // Has one cycle hazard on a VALU instruction partially writing dst with
1102   // a shift of result bits feeding another VALU instruction.
1103   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1104 
1105   // Cannot use op_sel with v_dot instructions.
1106   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1107 
1108   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1109   bool hasVDecCoExecHazard() const {
1110     return GFX940Insts;
1111   }
1112 
1113   bool hasNSAtoVMEMBug() const {
1114     return HasNSAtoVMEMBug;
1115   }
1116 
1117   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1118 
1119   bool hasHardClauses() const { return getGeneration() >= GFX10; }
1120 
1121   bool hasGFX90AInsts() const { return GFX90AInsts; }
1122 
1123   bool hasFPAtomicToDenormModeHazard() const {
1124     return getGeneration() == GFX10;
1125   }
1126 
1127   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1128 
1129   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1130 
1131   bool hasVALUPartialForwardingHazard() const {
1132     return getGeneration() >= GFX11;
1133   }
1134 
1135   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1136 
1137   bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1138 
1139   bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
1140 
1141   /// Return if operations acting on VGPR tuples require even alignment.
1142   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1143 
1144   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1145   bool hasSPackHL() const { return GFX11Insts; }
1146 
1147   /// Return true if the target's EXP instruction has the COMPR flag, which
1148   /// affects the meaning of the EN (enable) bits.
1149   bool hasCompressedExport() const { return !GFX11Insts; }
1150 
1151   /// Return true if the target's EXP instruction supports the NULL export
1152   /// target.
1153   bool hasNullExportTarget() const { return !GFX11Insts; }
1154 
1155   bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1156 
1157   bool hasVOPDInsts() const { return HasVOPDInsts; }
1158 
1159   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1160 
1161   /// Return true if the target has the S_DELAY_ALU instruction.
1162   bool hasDelayAlu() const { return GFX11Insts; }
1163 
1164   bool hasPackedTID() const { return HasPackedTID; }
1165 
1166   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1167   // hasGFX90AInsts is also true.
1168   bool hasGFX940Insts() const { return GFX940Insts; }
1169 
1170   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1171 
1172   bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1173 
1174   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1175 
1176   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1177 
  /// \returns the maximum number of waves per SIMD for kernels that use
  /// \p SGPRs SGPRs. Only declared here; the body is defined out of line.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1181 
  /// \returns the maximum number of waves per SIMD for kernels that use
  /// \p VGPRs VGPRs. Only declared here; the body is defined out of line.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1185 
  /// \returns the occupancy for function \p F. The LDS size (\p LDSSize) and
  /// the register counts (\p NumSGPRs, \p NumVGPRs) are used if provided; each
  /// of them defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1192 
1193   /// \returns true if the flat_scratch register should be initialized with the
1194   /// pointer to the wave's scratch memory rather than a size and offset.
1195   bool flatScratchIsPointer() const {
1196     return getGeneration() >= AMDGPUSubtarget::GFX9;
1197   }
1198 
1199   /// \returns true if the flat_scratch register is initialized by the HW.
1200   /// In this case it is readonly.
1201   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1202 
1203   /// \returns true if the architected SGPRs are enabled.
1204   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1205 
1206   /// \returns true if Global Data Share is supported.
1207   bool hasGDS() const { return HasGDS; }
1208 
1209   /// \returns true if Global Wave Sync is supported.
1210   bool hasGWS() const { return HasGWS; }
1211 
1212   /// \returns true if the machine has merged shaders in which s0-s7 are
1213   /// reserved by the hardware and user SGPRs start at s8
1214   bool hasMergedShaders() const {
1215     return getGeneration() >= GFX9;
1216   }
1217 
1218   // \returns true if the target supports the pre-NGG legacy geometry path.
1219   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1220 
1221   // \returns true if preloading kernel arguments is supported.
1222   bool hasKernargPreload() const { return KernargPreload; }
1223 
1224   // \returns true if we need to generate backwards compatible code when
1225   // preloading kernel arguments.
1226   bool needsKernargPreloadBackwardsCompatibility() const {
1227     return hasKernargPreload() && !hasGFX940Insts();
1228   }
1229 
1230   // \returns true if the target has split barriers feature
1231   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1232 
1233   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1234   bool hasCvtFP8VOP1Bug() const { return true; }
1235 
1236   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1237   // no-return form.
1238   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1239 
1240   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1241   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1242 
1243   // \returns true if the target has IEEE kernel descriptor mode bit
1244   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1245 
1246   // \returns true if the target has IEEE fminimum/fmaximum instructions
1247   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1248 
1249   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1250   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1251 
1252   /// \returns SGPR allocation granularity supported by the subtarget.
1253   unsigned getSGPRAllocGranule() const {
1254     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1255   }
1256 
1257   /// \returns SGPR encoding granularity supported by the subtarget.
1258   unsigned getSGPREncodingGranule() const {
1259     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1260   }
1261 
1262   /// \returns Total number of SGPRs supported by the subtarget.
1263   unsigned getTotalNumSGPRs() const {
1264     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1265   }
1266 
1267   /// \returns Addressable number of SGPRs supported by the subtarget.
1268   unsigned getAddressableNumSGPRs() const {
1269     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1270   }
1271 
1272   /// \returns Minimum number of SGPRs that meets the given number of waves per
1273   /// execution unit requirement supported by the subtarget.
1274   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1275     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1276   }
1277 
1278   /// \returns Maximum number of SGPRs that meets the given number of waves per
1279   /// execution unit requirement supported by the subtarget.
1280   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1281     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1282   }
1283 
  /// \returns Reserved number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs. Only declared here; the body is defined out of
  /// line.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns Reserved number of SGPRs for the given machine function \p MF.
  /// Only declared here; the body is defined out of line.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1290 
  /// \returns Reserved number of SGPRs for the given function \p F. Only
  /// declared here; the body is defined out of line.
  unsigned getReservedNumSGPRs(const Function &F) const;
1293 
  /// \returns the maximum number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getMaxNumSGPRs. Only declared here; the body is defined out of line.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1301 
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1311 
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1321 
1322   /// \returns VGPR allocation granularity supported by the subtarget.
1323   unsigned getVGPRAllocGranule() const {
1324     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1325   }
1326 
1327   /// \returns VGPR encoding granularity supported by the subtarget.
1328   unsigned getVGPREncodingGranule() const {
1329     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1330   }
1331 
1332   /// \returns Total number of VGPRs supported by the subtarget.
1333   unsigned getTotalNumVGPRs() const {
1334     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1335   }
1336 
1337   /// \returns Addressable number of VGPRs supported by the subtarget.
1338   unsigned getAddressableNumVGPRs() const {
1339     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1340   }
1341 
1342   /// \returns the minimum number of VGPRs that will prevent achieving more than
1343   /// the specified number of waves \p WavesPerEU.
1344   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1345     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1346   }
1347 
1348   /// \returns the maximum number of VGPRs that can be used and still achieved
1349   /// at least the specified number of waves \p WavesPerEU.
1350   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1351     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1352   }
1353 
  /// \returns the maximum number of VGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumVGPRs.
  /// Only declared here; the body is defined out of line.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
1367 
1368   unsigned getMaxNumAGPRs(const Function &F) const {
1369     return getMaxNumVGPRs(F);
1370   }
1371 
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1381 
  /// Override of the base-class hook that is handed the list of scheduling
  /// mutations \p Mutations by reference. Only declared here; the body is
  /// defined out of line.
  /// NOTE(review): presumably appends the subtarget's post-RA mutations to
  /// \p Mutations; confirm against the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1385 
  /// \returns a newly created ScheduleDAGMutation owned by the caller. Only
  /// declared here; the body is defined out of line.
  /// NOTE(review): judging by the name, the mutation concerns MFMA shadow
  /// cycles; confirm against the definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1388 
1389   bool isWave32() const {
1390     return getWavefrontSize() == 32;
1391   }
1392 
1393   bool isWave64() const {
1394     return getWavefrontSize() == 64;
1395   }
1396 
1397   const TargetRegisterClass *getBoolRC() const {
1398     return getRegisterInfo()->getBoolRC();
1399   }
1400 
1401   /// \returns Maximum number of work groups per compute unit supported by the
1402   /// subtarget and limited by given \p FlatWorkGroupSize.
1403   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1404     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1405   }
1406 
1407   /// \returns Minimum flat work group size supported by the subtarget.
1408   unsigned getMinFlatWorkGroupSize() const override {
1409     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1410   }
1411 
1412   /// \returns Maximum flat work group size supported by the subtarget.
1413   unsigned getMaxFlatWorkGroupSize() const override {
1414     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1415   }
1416 
1417   /// \returns Number of waves per execution unit required to support the given
1418   /// \p FlatWorkGroupSize.
1419   unsigned
1420   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1421     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1422   }
1423 
1424   /// \returns Minimum number of waves per execution unit supported by the
1425   /// subtarget.
1426   unsigned getMinWavesPerEU() const override {
1427     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1428   }
1429 
  /// Override of the base-class hook that receives the scheduling dependency
  /// \p Dep together with the units \p Def and \p Use and the operand indices
  /// \p DefOpIdx and \p UseOpIdx. \p Dep is taken by non-const reference.
  /// Only declared here; the body is defined out of line.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1432 
1433   // \returns true if it's beneficial on this subtarget for the scheduler to
1434   // cluster stores as well as loads.
1435   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1436 
  /// \returns the number of address arguments from which to enable MIMG NSA
  /// on supported architectures, for machine function \p MF. Only declared
  /// here; the body is defined out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
1440 
1441   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1442   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1443   bool requiresNopBeforeDeallocVGPRs() const {
1444     // Currently all targets that support the dealloc VGPRs message also require
1445     // the nop.
1446     return true;
1447   }
1448 };
1449 
1450 class GCNUserSGPRUsageInfo {
1451 public:
1452   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1453 
1454   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1455 
1456   bool hasDispatchPtr() const { return DispatchPtr; }
1457 
1458   bool hasQueuePtr() const { return QueuePtr; }
1459 
1460   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1461 
1462   bool hasDispatchID() const { return DispatchID; }
1463 
1464   bool hasFlatScratchInit() const { return FlatScratchInit; }
1465 
1466   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1467 
1468   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1469 
1470   unsigned getNumFreeUserSGPRs();
1471 
1472   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1473 
1474   enum UserSGPRID : unsigned {
1475     ImplicitBufferPtrID = 0,
1476     PrivateSegmentBufferID = 1,
1477     DispatchPtrID = 2,
1478     QueuePtrID = 3,
1479     KernargSegmentPtrID = 4,
1480     DispatchIdID = 5,
1481     FlatScratchInitID = 6,
1482     PrivateSegmentSizeID = 7
1483   };
1484 
1485   // Returns the size in number of SGPRs for preload user SGPR field.
1486   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1487     switch (ID) {
1488     case ImplicitBufferPtrID:
1489       return 2;
1490     case PrivateSegmentBufferID:
1491       return 4;
1492     case DispatchPtrID:
1493       return 2;
1494     case QueuePtrID:
1495       return 2;
1496     case KernargSegmentPtrID:
1497       return 2;
1498     case DispatchIdID:
1499       return 2;
1500     case FlatScratchInitID:
1501       return 2;
1502     case PrivateSegmentSizeID:
1503       return 1;
1504     }
1505     llvm_unreachable("Unknown UserSGPRID.");
1506   }
1507 
1508   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1509 
1510 private:
1511   const GCNSubtarget &ST;
1512 
1513   // Private memory buffer
1514   // Compute directly in sgpr[0:1]
1515   // Other shaders indirect 64-bits at sgpr[0:1]
1516   bool ImplicitBufferPtr = false;
1517 
1518   bool PrivateSegmentBuffer = false;
1519 
1520   bool DispatchPtr = false;
1521 
1522   bool QueuePtr = false;
1523 
1524   bool KernargSegmentPtr = false;
1525 
1526   bool DispatchID = false;
1527 
1528   bool FlatScratchInit = false;
1529 
1530   unsigned NumKernargPreloadSGPRs = 0;
1531 
1532   unsigned NumUsedUserSGPRs = 0;
1533 };
1534 
1535 } // end namespace llvm
1536 
1537 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1538