xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/Support/ErrorHandling.h"
25 
26 #define GET_SUBTARGETINFO_HEADER
27 #include "AMDGPUGenSubtargetInfo.inc"
28 
29 namespace llvm {
30 
31 class GCNTargetMachine;
32 
33 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34                            public AMDGPUSubtarget {
35 public:
36   using AMDGPUSubtarget::getMaxWavesPerEU;
37 
38   // Following 2 enums are documented at:
39   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI variants. The numeric values are assigned explicitly.
enum class TrapHandlerAbi {
  NONE = 0x00,
  AMDHSA = 0x01,
};
44 
/// Trap identifiers. The numeric values are assigned explicitly.
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
49 
50 private:
51   /// SelectionDAGISel related APIs.
52   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53 
54   /// GlobalISel related APIs.
55   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57   std::unique_ptr<InstructionSelector> InstSelector;
58   std::unique_ptr<LegalizerInfo> Legalizer;
59   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60 
61 protected:
62   // Basic subtarget description.
63   Triple TargetTriple;
64   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
65   unsigned Gen = INVALID;
66   InstrItineraryData InstrItins;
67   int LDSBankCount = 0;
68   unsigned MaxPrivateElementSize = 0;
69 
70   // Possibly statically set by tablegen, but may want to be overridden.
71   bool FastDenormalF32 = false;
72   bool HalfRate64Ops = false;
73   bool FullRate64Ops = false;
74 
75   // Dynamically set bits that enable features.
76   bool FlatForGlobal = false;
77   bool AutoWaitcntBeforeBarrier = false;
78   bool BackOffBarrier = false;
79   bool UnalignedScratchAccess = false;
80   bool UnalignedAccessMode = false;
81   bool RelaxedBufferOOBMode = false;
82   bool HasApertureRegs = false;
83   bool SupportsXNACK = false;
84   bool KernargPreload = false;
85 
86   // This should not be used directly. 'TargetID' tracks the dynamic settings
87   // for XNACK.
88   bool EnableXNACK = false;
89 
90   bool EnableTgSplit = false;
91   bool EnableCuMode = false;
92   bool TrapHandler = false;
93   bool EnablePreciseMemory = false;
94 
95   // Used as options.
96   bool EnableLoadStoreOpt = false;
97   bool EnableUnsafeDSOffsetFolding = false;
98   bool EnableSIScheduler = false;
99   bool EnableDS128 = false;
100   bool EnablePRTStrictNull = false;
101   bool DumpCode = false;
102 
103   // Subtarget properties statically set by tablegen.
104   bool FP64 = false;
105   bool FMA = false;
106   bool MIMG_R128 = false;
107   bool CIInsts = false;
108   bool GFX8Insts = false;
109   bool GFX9Insts = false;
110   bool GFX90AInsts = false;
111   bool GFX940Insts = false;
112   bool GFX950Insts = false;
113   bool GFX10Insts = false;
114   bool GFX11Insts = false;
115   bool GFX12Insts = false;
116   bool GFX1250Insts = false;
117   bool GFX10_3Insts = false;
118   bool GFX7GFX8GFX9Insts = false;
119   bool SGPRInitBug = false;
120   bool UserSGPRInit16Bug = false;
121   bool NegativeScratchOffsetBug = false;
122   bool NegativeUnalignedScratchOffsetBug = false;
123   bool HasSMemRealTime = false;
124   bool HasIntClamp = false;
125   bool HasFmaMixInsts = false;
126   bool HasMovrel = false;
127   bool HasVGPRIndexMode = false;
128   bool HasScalarDwordx3Loads = false;
129   bool HasScalarStores = false;
130   bool HasScalarAtomics = false;
131   bool HasSDWAOmod = false;
132   bool HasSDWAScalar = false;
133   bool HasSDWASdst = false;
134   bool HasSDWAMac = false;
135   bool HasSDWAOutModsVOPC = false;
136   bool HasDPP = false;
137   bool HasDPP8 = false;
138   bool HasDPALU_DPP = false;
139   bool HasDPPSrc1SGPR = false;
140   bool HasPackedFP32Ops = false;
141   bool HasImageInsts = false;
142   bool HasExtendedImageInsts = false;
143   bool HasR128A16 = false;
144   bool HasA16 = false;
145   bool HasG16 = false;
146   bool HasNSAEncoding = false;
147   bool HasPartialNSAEncoding = false;
148   bool GFX10_AEncoding = false;
149   bool GFX10_BEncoding = false;
150   bool HasDLInsts = false;
151   bool HasFmacF64Inst = false;
152   bool HasDot1Insts = false;
153   bool HasDot2Insts = false;
154   bool HasDot3Insts = false;
155   bool HasDot4Insts = false;
156   bool HasDot5Insts = false;
157   bool HasDot6Insts = false;
158   bool HasDot7Insts = false;
159   bool HasDot8Insts = false;
160   bool HasDot9Insts = false;
161   bool HasDot10Insts = false;
162   bool HasDot11Insts = false;
163   bool HasDot12Insts = false;
164   bool HasDot13Insts = false;
165   bool HasMAIInsts = false;
166   bool HasFP8Insts = false;
167   bool HasFP8ConversionInsts = false;
168   bool HasFP8E5M3Insts = false;
169   bool HasCvtFP8Vop1Bug = false;
170   bool HasPkFmacF16Inst = false;
171   bool HasAtomicFMinFMaxF32GlobalInsts = false;
172   bool HasAtomicFMinFMaxF64GlobalInsts = false;
173   bool HasAtomicFMinFMaxF32FlatInsts = false;
174   bool HasAtomicFMinFMaxF64FlatInsts = false;
175   bool HasAtomicDsPkAdd16Insts = false;
176   bool HasAtomicFlatPkAdd16Insts = false;
177   bool HasAtomicFaddRtnInsts = false;
178   bool HasAtomicFaddNoRtnInsts = false;
179   bool HasMemoryAtomicFaddF32DenormalSupport = false;
180   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
181   bool HasAtomicBufferGlobalPkAddF16Insts = false;
182   bool HasAtomicCSubNoRtnInsts = false;
183   bool HasAtomicGlobalPkAddBF16Inst = false;
184   bool HasAtomicBufferPkAddBF16Inst = false;
185   bool HasFlatAtomicFaddF32Inst = false;
186   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
187   bool HasDefaultComponentZero = false;
188   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
189   bool HasDefaultComponentBroadcast = false;
190   bool HasXF32Insts = false;
191   /// The maximum number of instructions that may be placed within an S_CLAUSE,
192   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
193   /// indicates a lack of S_CLAUSE support.
194   unsigned MaxHardClauseLength = 0;
195   bool SupportsSRAMECC = false;
196   bool DynamicVGPR = false;
197   bool DynamicVGPRBlockSize32 = false;
198   bool HasVMemToLDSLoad = false;
199 
200   // This should not be used directly. 'TargetID' tracks the dynamic settings
201   // for SRAMECC.
202   bool EnableSRAMECC = false;
203 
204   bool HasNoSdstCMPX = false;
205   bool HasVscnt = false;
206   bool HasWaitXcnt = false;
207   bool HasGetWaveIdInst = false;
208   bool HasSMemTimeInst = false;
209   bool HasShaderCyclesRegister = false;
210   bool HasShaderCyclesHiLoRegisters = false;
211   bool HasVOP3Literal = false;
212   bool HasNoDataDepHazard = false;
213   bool FlatAddressSpace = false;
214   bool FlatInstOffsets = false;
215   bool FlatGlobalInsts = false;
216   bool FlatScratchInsts = false;
217   bool ScalarFlatScratchInsts = false;
218   bool HasArchitectedFlatScratch = false;
219   bool EnableFlatScratch = false;
220   bool HasArchitectedSGPRs = false;
221   bool HasGDS = false;
222   bool HasGWS = false;
223   bool AddNoCarryInsts = false;
224   bool HasUnpackedD16VMem = false;
225   bool LDSMisalignedBug = false;
226   bool HasMFMAInlineLiteralBug = false;
227   bool UnalignedBufferAccess = false;
228   bool UnalignedDSAccess = false;
229   bool HasPackedTID = false;
230   bool ScalarizeGlobal = false;
231   bool HasSALUFloatInsts = false;
232   bool HasPseudoScalarTrans = false;
233   bool HasRestrictedSOffset = false;
234   bool Has64BitLiterals = false;
235   bool HasBitOp3Insts = false;
236   bool HasTransposeLoadF4F6Insts = false;
237   bool HasPrngInst = false;
238   bool HasBVHDualAndBVH8Insts = false;
239   bool HasPermlane16Swap = false;
240   bool HasPermlane32Swap = false;
241   bool HasVcmpxPermlaneHazard = false;
242   bool HasVMEMtoScalarWriteHazard = false;
243   bool HasSMEMtoVectorWriteHazard = false;
244   bool HasInstFwdPrefetchBug = false;
245   bool HasSafeSmemPrefetch = false;
246   bool HasVcmpxExecWARHazard = false;
247   bool HasLdsBranchVmemWARHazard = false;
248   bool HasNSAtoVMEMBug = false;
249   bool HasNSAClauseBug = false;
250   bool HasOffset3fBug = false;
251   bool HasFlatSegmentOffsetBug = false;
252   bool HasImageStoreD16Bug = false;
253   bool HasImageGather4D16Bug = false;
254   bool HasMSAALoadDstSelBug = false;
255   bool HasPrivEnabledTrap2NopBug = false;
256   bool Has1_5xVGPRs = false;
257   bool HasMADIntraFwdBug = false;
258   bool HasVOPDInsts = false;
259   bool HasVALUTransUseHazard = false;
260   bool HasRequiredExportPriority = false;
261   bool HasVmemWriteVgprInOrder = false;
262   bool HasAshrPkInsts = false;
263   bool HasIEEEMinimumMaximumInsts = false;
264   bool HasMinimum3Maximum3F32 = false;
265   bool HasMinimum3Maximum3F16 = false;
266   bool HasMinimum3Maximum3PKF16 = false;
267   bool HasLshlAddU64Inst = false;
268   bool HasPointSampleAccel = false;
269   bool HasLdsBarrierArriveAtomic = false;
270   bool HasSetPrioIncWgInst = false;
271 
272   bool RequiresCOV6 = false;
273   bool UseBlockVGPROpsForCSR = false;
274 
275   // Dummy feature to use for assembler in tablegen.
276   bool FeatureDisable = false;
277 
278 private:
279   SIInstrInfo InstrInfo;
280   SITargetLowering TLInfo;
281   SIFrameLowering FrameLowering;
282 
283 public:
284   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
285                const GCNTargetMachine &TM);
286   ~GCNSubtarget() override;
287 
288   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
289                                                    StringRef GPU, StringRef FS);
290 
291   /// Diagnose inconsistent subtarget features before attempting to codegen
292   /// function \p F.
293   void checkSubtargetFeatures(const Function &F) const;
294 
getInstrInfo()295   const SIInstrInfo *getInstrInfo() const override {
296     return &InstrInfo;
297   }
298 
getFrameLowering()299   const SIFrameLowering *getFrameLowering() const override {
300     return &FrameLowering;
301   }
302 
getTargetLowering()303   const SITargetLowering *getTargetLowering() const override {
304     return &TLInfo;
305   }
306 
getRegisterInfo()307   const SIRegisterInfo *getRegisterInfo() const override {
308     return &InstrInfo.getRegisterInfo();
309   }
310 
311   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
312 
getCallLowering()313   const CallLowering *getCallLowering() const override {
314     return CallLoweringInfo.get();
315   }
316 
getInlineAsmLowering()317   const InlineAsmLowering *getInlineAsmLowering() const override {
318     return InlineAsmLoweringInfo.get();
319   }
320 
getInstructionSelector()321   InstructionSelector *getInstructionSelector() const override {
322     return InstSelector.get();
323   }
324 
getLegalizerInfo()325   const LegalizerInfo *getLegalizerInfo() const override {
326     return Legalizer.get();
327   }
328 
getRegBankInfo()329   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
330     return RegBankInfo.get();
331   }
332 
getTargetID()333   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
334     return TargetID;
335   }
336 
getInstrItineraryData()337   const InstrItineraryData *getInstrItineraryData() const override {
338     return &InstrItins;
339   }
340 
341   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
342 
getGeneration()343   Generation getGeneration() const {
344     return (Generation)Gen;
345   }
346 
getMaxWaveScratchSize()347   unsigned getMaxWaveScratchSize() const {
348     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
349     if (getGeneration() >= GFX12) {
350       // 18-bit field in units of 64-dword.
351       return (64 * 4) * ((1 << 18) - 1);
352     }
353     if (getGeneration() == GFX11) {
354       // 15-bit field in units of 64-dword.
355       return (64 * 4) * ((1 << 15) - 1);
356     }
357     // 13-bit field in units of 256-dword.
358     return (256 * 4) * ((1 << 13) - 1);
359   }
360 
361   /// Return the number of high bits known to be zero for a frame index.
getKnownHighZeroBitsForFrameIndex()362   unsigned getKnownHighZeroBitsForFrameIndex() const {
363     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
364   }
365 
getLDSBankCount()366   int getLDSBankCount() const {
367     return LDSBankCount;
368   }
369 
370   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
371     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
372   }
373 
374   unsigned getConstantBusLimit(unsigned Opcode) const;
375 
376   /// Returns if the result of this instruction with a 16-bit result returned in
377   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
378   /// the original value.
379   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
380 
supportsWGP()381   bool supportsWGP() const { return getGeneration() >= GFX10; }
382 
hasIntClamp()383   bool hasIntClamp() const {
384     return HasIntClamp;
385   }
386 
hasFP64()387   bool hasFP64() const {
388     return FP64;
389   }
390 
hasMIMG_R128()391   bool hasMIMG_R128() const {
392     return MIMG_R128;
393   }
394 
hasHWFP64()395   bool hasHWFP64() const {
396     return FP64;
397   }
398 
hasHalfRate64Ops()399   bool hasHalfRate64Ops() const {
400     return HalfRate64Ops;
401   }
402 
hasFullRate64Ops()403   bool hasFullRate64Ops() const {
404     return FullRate64Ops;
405   }
406 
hasAddr64()407   bool hasAddr64() const {
408     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
409   }
410 
hasFlat()411   bool hasFlat() const {
412     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
413   }
414 
415   // Return true if the target only has the reverse operand versions of VALU
416   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
hasOnlyRevVALUShifts()417   bool hasOnlyRevVALUShifts() const {
418     return getGeneration() >= VOLCANIC_ISLANDS;
419   }
420 
hasFractBug()421   bool hasFractBug() const {
422     return getGeneration() == SOUTHERN_ISLANDS;
423   }
424 
hasBFE()425   bool hasBFE() const {
426     return true;
427   }
428 
hasBFI()429   bool hasBFI() const {
430     return true;
431   }
432 
hasBFM()433   bool hasBFM() const {
434     return hasBFE();
435   }
436 
hasBCNT(unsigned Size)437   bool hasBCNT(unsigned Size) const {
438     return true;
439   }
440 
hasFFBL()441   bool hasFFBL() const {
442     return true;
443   }
444 
hasFFBH()445   bool hasFFBH() const {
446     return true;
447   }
448 
hasMed3_16()449   bool hasMed3_16() const {
450     return getGeneration() >= AMDGPUSubtarget::GFX9;
451   }
452 
hasMin3Max3_16()453   bool hasMin3Max3_16() const {
454     return getGeneration() >= AMDGPUSubtarget::GFX9;
455   }
456 
hasFmaMixInsts()457   bool hasFmaMixInsts() const {
458     return HasFmaMixInsts;
459   }
460 
hasCARRY()461   bool hasCARRY() const {
462     return true;
463   }
464 
hasFMA()465   bool hasFMA() const {
466     return FMA;
467   }
468 
hasSwap()469   bool hasSwap() const {
470     return GFX9Insts;
471   }
472 
hasScalarPackInsts()473   bool hasScalarPackInsts() const {
474     return GFX9Insts;
475   }
476 
hasScalarMulHiInsts()477   bool hasScalarMulHiInsts() const {
478     return GFX9Insts;
479   }
480 
hasScalarSubwordLoads()481   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
482 
getTrapHandlerAbi()483   TrapHandlerAbi getTrapHandlerAbi() const {
484     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
485   }
486 
supportsGetDoorbellID()487   bool supportsGetDoorbellID() const {
488     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
489     return getGeneration() >= GFX9;
490   }
491 
492   /// True if the offset field of DS instructions works as expected. On SI, the
493   /// offset uses a 16-bit adder and does not always wrap properly.
hasUsableDSOffset()494   bool hasUsableDSOffset() const {
495     return getGeneration() >= SEA_ISLANDS;
496   }
497 
unsafeDSOffsetFoldingEnabled()498   bool unsafeDSOffsetFoldingEnabled() const {
499     return EnableUnsafeDSOffsetFolding;
500   }
501 
502   /// Condition output from div_scale is usable.
hasUsableDivScaleConditionOutput()503   bool hasUsableDivScaleConditionOutput() const {
504     return getGeneration() != SOUTHERN_ISLANDS;
505   }
506 
507   /// Extra wait hazard is needed in some cases before
508   /// s_cbranch_vccnz/s_cbranch_vccz.
hasReadVCCZBug()509   bool hasReadVCCZBug() const {
510     return getGeneration() <= SEA_ISLANDS;
511   }
512 
513   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
partialVCCWritesUpdateVCCZ()514   bool partialVCCWritesUpdateVCCZ() const {
515     return getGeneration() >= GFX10;
516   }
517 
518   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
519   /// was written by a VALU instruction.
hasSMRDReadVALUDefHazard()520   bool hasSMRDReadVALUDefHazard() const {
521     return getGeneration() == SOUTHERN_ISLANDS;
522   }
523 
524   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
525   /// SGPR was written by a VALU Instruction.
hasVMEMReadSGPRVALUDefHazard()526   bool hasVMEMReadSGPRVALUDefHazard() const {
527     return getGeneration() >= VOLCANIC_ISLANDS;
528   }
529 
hasRFEHazards()530   bool hasRFEHazards() const {
531     return getGeneration() >= VOLCANIC_ISLANDS;
532   }
533 
534   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
getSetRegWaitStates()535   unsigned getSetRegWaitStates() const {
536     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
537   }
538 
dumpCode()539   bool dumpCode() const {
540     return DumpCode;
541   }
542 
543   /// Return the amount of LDS that can be used that will not restrict the
544   /// occupancy lower than WaveCount.
545   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
546                                            const Function &) const;
547 
supportsMinMaxDenormModes()548   bool supportsMinMaxDenormModes() const {
549     return getGeneration() >= AMDGPUSubtarget::GFX9;
550   }
551 
552   /// \returns If target supports S_DENORM_MODE.
hasDenormModeInst()553   bool hasDenormModeInst() const {
554     return getGeneration() >= AMDGPUSubtarget::GFX10;
555   }
556 
useFlatForGlobal()557   bool useFlatForGlobal() const {
558     return FlatForGlobal;
559   }
560 
561   /// \returns If target supports ds_read/write_b128 and user enables generation
562   /// of ds_read/write_b128.
useDS128()563   bool useDS128() const {
564     return CIInsts && EnableDS128;
565   }
566 
567   /// \return If target supports ds_read/write_b96/128.
hasDS96AndDS128()568   bool hasDS96AndDS128() const {
569     return CIInsts;
570   }
571 
572   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
haveRoundOpsF64()573   bool haveRoundOpsF64() const {
574     return CIInsts;
575   }
576 
577   /// \returns If MUBUF instructions always perform range checking, even for
578   /// buffer resources used for private memory access.
privateMemoryResourceIsRangeChecked()579   bool privateMemoryResourceIsRangeChecked() const {
580     return getGeneration() < AMDGPUSubtarget::GFX9;
581   }
582 
583   /// \returns If target requires PRT Struct NULL support (zero result registers
584   /// for sparse texture support).
usePRTStrictNull()585   bool usePRTStrictNull() const {
586     return EnablePRTStrictNull;
587   }
588 
hasAutoWaitcntBeforeBarrier()589   bool hasAutoWaitcntBeforeBarrier() const {
590     return AutoWaitcntBeforeBarrier;
591   }
592 
593   /// \returns true if the target supports backing off of s_barrier instructions
594   /// when an exception is raised.
supportsBackOffBarrier()595   bool supportsBackOffBarrier() const {
596     return BackOffBarrier;
597   }
598 
hasUnalignedBufferAccess()599   bool hasUnalignedBufferAccess() const {
600     return UnalignedBufferAccess;
601   }
602 
hasUnalignedBufferAccessEnabled()603   bool hasUnalignedBufferAccessEnabled() const {
604     return UnalignedBufferAccess && UnalignedAccessMode;
605   }
606 
hasUnalignedDSAccess()607   bool hasUnalignedDSAccess() const {
608     return UnalignedDSAccess;
609   }
610 
hasUnalignedDSAccessEnabled()611   bool hasUnalignedDSAccessEnabled() const {
612     return UnalignedDSAccess && UnalignedAccessMode;
613   }
614 
hasUnalignedScratchAccess()615   bool hasUnalignedScratchAccess() const {
616     return UnalignedScratchAccess;
617   }
618 
hasUnalignedScratchAccessEnabled()619   bool hasUnalignedScratchAccessEnabled() const {
620     return UnalignedScratchAccess && UnalignedAccessMode;
621   }
622 
hasUnalignedAccessMode()623   bool hasUnalignedAccessMode() const {
624     return UnalignedAccessMode;
625   }
626 
hasRelaxedBufferOOBMode()627   bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
628 
hasApertureRegs()629   bool hasApertureRegs() const {
630     return HasApertureRegs;
631   }
632 
isTrapHandlerEnabled()633   bool isTrapHandlerEnabled() const {
634     return TrapHandler;
635   }
636 
isXNACKEnabled()637   bool isXNACKEnabled() const {
638     return TargetID.isXnackOnOrAny();
639   }
640 
isTgSplitEnabled()641   bool isTgSplitEnabled() const {
642     return EnableTgSplit;
643   }
644 
isCuModeEnabled()645   bool isCuModeEnabled() const {
646     return EnableCuMode;
647   }
648 
isPreciseMemoryEnabled()649   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
650 
hasFlatAddressSpace()651   bool hasFlatAddressSpace() const {
652     return FlatAddressSpace;
653   }
654 
hasFlatScrRegister()655   bool hasFlatScrRegister() const {
656     return hasFlatAddressSpace();
657   }
658 
hasFlatInstOffsets()659   bool hasFlatInstOffsets() const {
660     return FlatInstOffsets;
661   }
662 
hasFlatGlobalInsts()663   bool hasFlatGlobalInsts() const {
664     return FlatGlobalInsts;
665   }
666 
hasFlatScratchInsts()667   bool hasFlatScratchInsts() const {
668     return FlatScratchInsts;
669   }
670 
671   // Check if target supports ST addressing mode with FLAT scratch instructions.
672   // The ST addressing mode means no registers are used, either VGPR or SGPR,
673   // but only immediate offset is swizzled and added to the FLAT scratch base.
hasFlatScratchSTMode()674   bool hasFlatScratchSTMode() const {
675     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
676   }
677 
hasFlatScratchSVSMode()678   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
679 
hasScalarFlatScratchInsts()680   bool hasScalarFlatScratchInsts() const {
681     return ScalarFlatScratchInsts;
682   }
683 
enableFlatScratch()684   bool enableFlatScratch() const {
685     return flatScratchIsArchitected() ||
686            (EnableFlatScratch && hasFlatScratchInsts());
687   }
688 
hasGlobalAddTidInsts()689   bool hasGlobalAddTidInsts() const {
690     return GFX10_BEncoding;
691   }
692 
hasAtomicCSub()693   bool hasAtomicCSub() const {
694     return GFX10_BEncoding;
695   }
696 
hasMTBUFInsts()697   bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
698 
hasFormattedMUBUFInsts()699   bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
700 
hasExportInsts()701   bool hasExportInsts() const {
702     return !hasGFX940Insts() && !hasGFX1250Insts();
703   }
704 
hasVINTERPEncoding()705   bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
706 
707   // DS_ADD_F64/DS_ADD_RTN_F64
hasLdsAtomicAddF64()708   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
709 
hasMultiDwordFlatScratchAddressing()710   bool hasMultiDwordFlatScratchAddressing() const {
711     return getGeneration() >= GFX9;
712   }
713 
hasFlatSegmentOffsetBug()714   bool hasFlatSegmentOffsetBug() const {
715     return HasFlatSegmentOffsetBug;
716   }
717 
hasFlatLgkmVMemCountInOrder()718   bool hasFlatLgkmVMemCountInOrder() const {
719     return getGeneration() > GFX9;
720   }
721 
hasD16LoadStore()722   bool hasD16LoadStore() const {
723     return getGeneration() >= GFX9;
724   }
725 
d16PreservesUnusedBits()726   bool d16PreservesUnusedBits() const {
727     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
728   }
729 
hasD16Images()730   bool hasD16Images() const {
731     return getGeneration() >= VOLCANIC_ISLANDS;
732   }
733 
734   /// Return if most LDS instructions have an m0 use that require m0 to be
735   /// initialized.
ldsRequiresM0Init()736   bool ldsRequiresM0Init() const {
737     return getGeneration() < GFX9;
738   }
739 
740   // True if the hardware rewinds and replays GWS operations if a wave is
741   // preempted.
742   //
743   // If this is false, a GWS operation requires testing if a nack set the
744   // MEM_VIOL bit, and repeating if so.
hasGWSAutoReplay()745   bool hasGWSAutoReplay() const {
746     return getGeneration() >= GFX9;
747   }
748 
749   /// \returns if target has ds_gws_sema_release_all instruction.
hasGWSSemaReleaseAll()750   bool hasGWSSemaReleaseAll() const {
751     return CIInsts;
752   }
753 
754   /// \returns true if the target has integer add/sub instructions that do not
755   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
756   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
757   /// for saturation.
hasAddNoCarry()758   bool hasAddNoCarry() const {
759     return AddNoCarryInsts;
760   }
761 
hasScalarAddSub64()762   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
763 
hasScalarSMulU64()764   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
765 
hasUnpackedD16VMem()766   bool hasUnpackedD16VMem() const {
767     return HasUnpackedD16VMem;
768   }
769 
770   // Covers VS/PS/CS graphics shaders
isMesaGfxShader(const Function & F)771   bool isMesaGfxShader(const Function &F) const {
772     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
773   }
774 
hasMad64_32()775   bool hasMad64_32() const {
776     return getGeneration() >= SEA_ISLANDS;
777   }
778 
hasSDWAOmod()779   bool hasSDWAOmod() const {
780     return HasSDWAOmod;
781   }
782 
hasSDWAScalar()783   bool hasSDWAScalar() const {
784     return HasSDWAScalar;
785   }
786 
hasSDWASdst()787   bool hasSDWASdst() const {
788     return HasSDWASdst;
789   }
790 
hasSDWAMac()791   bool hasSDWAMac() const {
792     return HasSDWAMac;
793   }
794 
hasSDWAOutModsVOPC()795   bool hasSDWAOutModsVOPC() const {
796     return HasSDWAOutModsVOPC;
797   }
798 
hasDLInsts()799   bool hasDLInsts() const {
800     return HasDLInsts;
801   }
802 
hasFmacF64Inst()803   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
804 
hasDot1Insts()805   bool hasDot1Insts() const {
806     return HasDot1Insts;
807   }
808 
hasDot2Insts()809   bool hasDot2Insts() const {
810     return HasDot2Insts;
811   }
812 
hasDot3Insts()813   bool hasDot3Insts() const {
814     return HasDot3Insts;
815   }
816 
hasDot4Insts()817   bool hasDot4Insts() const {
818     return HasDot4Insts;
819   }
820 
hasDot5Insts()821   bool hasDot5Insts() const {
822     return HasDot5Insts;
823   }
824 
hasDot6Insts()825   bool hasDot6Insts() const {
826     return HasDot6Insts;
827   }
828 
hasDot7Insts()829   bool hasDot7Insts() const {
830     return HasDot7Insts;
831   }
832 
hasDot8Insts()833   bool hasDot8Insts() const {
834     return HasDot8Insts;
835   }
836 
hasDot9Insts()837   bool hasDot9Insts() const {
838     return HasDot9Insts;
839   }
840 
hasDot10Insts()841   bool hasDot10Insts() const {
842     return HasDot10Insts;
843   }
844 
hasDot11Insts()845   bool hasDot11Insts() const {
846     return HasDot11Insts;
847   }
848 
hasDot12Insts()849   bool hasDot12Insts() const {
850     return HasDot12Insts;
851   }
852 
hasDot13Insts()853   bool hasDot13Insts() const {
854     return HasDot13Insts;
855   }
856 
hasMAIInsts()857   bool hasMAIInsts() const {
858     return HasMAIInsts;
859   }
860 
hasFP8Insts()861   bool hasFP8Insts() const {
862     return HasFP8Insts;
863   }
864 
hasFP8ConversionInsts()865   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
866 
hasFP8E5M3Insts()867   bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
868 
hasPkFmacF16Inst()869   bool hasPkFmacF16Inst() const {
870     return HasPkFmacF16Inst;
871   }
872 
hasAtomicFMinFMaxF32GlobalInsts()873   bool hasAtomicFMinFMaxF32GlobalInsts() const {
874     return HasAtomicFMinFMaxF32GlobalInsts;
875   }
876 
hasAtomicFMinFMaxF64GlobalInsts()877   bool hasAtomicFMinFMaxF64GlobalInsts() const {
878     return HasAtomicFMinFMaxF64GlobalInsts;
879   }
880 
hasAtomicFMinFMaxF32FlatInsts()881   bool hasAtomicFMinFMaxF32FlatInsts() const {
882     return HasAtomicFMinFMaxF32FlatInsts;
883   }
884 
hasAtomicFMinFMaxF64FlatInsts()885   bool hasAtomicFMinFMaxF64FlatInsts() const {
886     return HasAtomicFMinFMaxF64FlatInsts;
887   }
888 
hasAtomicDsPkAdd16Insts()889   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
890 
hasAtomicFlatPkAdd16Insts()891   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
892 
hasAtomicFaddInsts()893   bool hasAtomicFaddInsts() const {
894     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
895   }
896 
hasAtomicFaddRtnInsts()897   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
898 
hasAtomicFaddNoRtnInsts()899   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
900 
hasAtomicBufferGlobalPkAddF16NoRtnInsts()901   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
902     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
903   }
904 
hasAtomicBufferGlobalPkAddF16Insts()905   bool hasAtomicBufferGlobalPkAddF16Insts() const {
906     return HasAtomicBufferGlobalPkAddF16Insts;
907   }
908 
hasAtomicGlobalPkAddBF16Inst()909   bool hasAtomicGlobalPkAddBF16Inst() const {
910     return HasAtomicGlobalPkAddBF16Inst;
911   }
912 
hasAtomicBufferPkAddBF16Inst()913   bool hasAtomicBufferPkAddBF16Inst() const {
914     return HasAtomicBufferPkAddBF16Inst;
915   }
916 
hasFlatAtomicFaddF32Inst()917   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
918 
919   /// \return true if the target has flat, global, and buffer atomic fadd for
920   /// double.
hasFlatBufferGlobalAtomicFaddF64Inst()921   bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
922     return HasFlatBufferGlobalAtomicFaddF64Inst;
923   }
924 
925   /// \return true if the target's flat, global, and buffer atomic fadd for
926   /// float supports denormal handling.
hasMemoryAtomicFaddF32DenormalSupport()927   bool hasMemoryAtomicFaddF32DenormalSupport() const {
928     return HasMemoryAtomicFaddF32DenormalSupport;
929   }
930 
931   /// \return true if atomic operations targeting fine-grained memory work
932   /// correctly at device scope, in allocations in host or peer PCIe device
933   /// memory.
supportsAgentScopeFineGrainedRemoteMemoryAtomics()934   bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
935     return HasAgentScopeFineGrainedRemoteMemoryAtomics;
936   }
937 
hasDefaultComponentZero()938   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
939 
hasDefaultComponentBroadcast()940   bool hasDefaultComponentBroadcast() const {
941     return HasDefaultComponentBroadcast;
942   }
943 
hasNoSdstCMPX()944   bool hasNoSdstCMPX() const {
945     return HasNoSdstCMPX;
946   }
947 
hasVscnt()948   bool hasVscnt() const {
949     return HasVscnt;
950   }
951 
hasGetWaveIdInst()952   bool hasGetWaveIdInst() const {
953     return HasGetWaveIdInst;
954   }
955 
hasSMemTimeInst()956   bool hasSMemTimeInst() const {
957     return HasSMemTimeInst;
958   }
959 
hasShaderCyclesRegister()960   bool hasShaderCyclesRegister() const {
961     return HasShaderCyclesRegister;
962   }
963 
hasShaderCyclesHiLoRegisters()964   bool hasShaderCyclesHiLoRegisters() const {
965     return HasShaderCyclesHiLoRegisters;
966   }
967 
hasVOP3Literal()968   bool hasVOP3Literal() const {
969     return HasVOP3Literal;
970   }
971 
hasNoDataDepHazard()972   bool hasNoDataDepHazard() const {
973     return HasNoDataDepHazard;
974   }
975 
vmemWriteNeedsExpWaitcnt()976   bool vmemWriteNeedsExpWaitcnt() const {
977     return getGeneration() < SEA_ISLANDS;
978   }
979 
hasInstPrefetch()980   bool hasInstPrefetch() const {
981     return getGeneration() == GFX10 || getGeneration() == GFX11;
982   }
983 
hasPrefetch()984   bool hasPrefetch() const { return GFX12Insts; }
985 
hasSafeSmemPrefetch()986   bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
987 
988   // Has s_cmpk_* instructions.
hasSCmpK()989   bool hasSCmpK() const { return getGeneration() < GFX12; }
990 
991   // Scratch is allocated in 256 dword per wave blocks for the entire
992   // wavefront. When viewed from the perspective of an arbitrary workitem, this
993   // is 4-byte aligned.
994   //
995   // Only 4-byte alignment is really needed to access anything. Transformations
996   // on the pointer value itself may rely on the alignment / known low bits of
997   // the pointer. Set this to something above the minimum to avoid needing
998   // dynamic realignment in common cases.
getStackAlignment()999   Align getStackAlignment() const { return Align(16); }
1000 
enableMachineScheduler()1001   bool enableMachineScheduler() const override {
1002     return true;
1003   }
1004 
1005   bool useAA() const override;
1006 
enableSubRegLiveness()1007   bool enableSubRegLiveness() const override {
1008     return true;
1009   }
1010 
setScalarizeGlobalBehavior(bool b)1011   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
getScalarizeGlobalBehavior()1012   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
1013 
1014   // static wrappers
1015   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
1016 
1017   // XXX - Why is this here if it isn't in the default pass set?
enableEarlyIfConversion()1018   bool enableEarlyIfConversion() const override {
1019     return true;
1020   }
1021 
1022   void overrideSchedPolicy(MachineSchedPolicy &Policy,
1023                            unsigned NumRegionInstrs) const override;
1024 
1025   void mirFileLoaded(MachineFunction &MF) const override;
1026 
getMaxNumUserSGPRs()1027   unsigned getMaxNumUserSGPRs() const {
1028     return AMDGPU::getMaxNumUserSGPRs(*this);
1029   }
1030 
hasSMemRealTime()1031   bool hasSMemRealTime() const {
1032     return HasSMemRealTime;
1033   }
1034 
hasMovrel()1035   bool hasMovrel() const {
1036     return HasMovrel;
1037   }
1038 
hasVGPRIndexMode()1039   bool hasVGPRIndexMode() const {
1040     return HasVGPRIndexMode;
1041   }
1042 
1043   bool useVGPRIndexMode() const;
1044 
hasScalarCompareEq64()1045   bool hasScalarCompareEq64() const {
1046     return getGeneration() >= VOLCANIC_ISLANDS;
1047   }
1048 
hasScalarDwordx3Loads()1049   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1050 
hasScalarStores()1051   bool hasScalarStores() const {
1052     return HasScalarStores;
1053   }
1054 
hasScalarAtomics()1055   bool hasScalarAtomics() const {
1056     return HasScalarAtomics;
1057   }
1058 
hasLDSFPAtomicAddF32()1059   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
hasLDSFPAtomicAddF64()1060   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1061 
1062   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
hasPermLaneX16()1063   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1064 
1065   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
hasPermLane64()1066   bool hasPermLane64() const { return getGeneration() >= GFX11; }
1067 
hasDPP()1068   bool hasDPP() const {
1069     return HasDPP;
1070   }
1071 
hasDPPBroadcasts()1072   bool hasDPPBroadcasts() const {
1073     return HasDPP && getGeneration() < GFX10;
1074   }
1075 
hasDPPWavefrontShifts()1076   bool hasDPPWavefrontShifts() const {
1077     return HasDPP && getGeneration() < GFX10;
1078   }
1079 
hasDPP8()1080   bool hasDPP8() const {
1081     return HasDPP8;
1082   }
1083 
hasDPALU_DPP()1084   bool hasDPALU_DPP() const {
1085     return HasDPALU_DPP;
1086   }
1087 
hasDPPSrc1SGPR()1088   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1089 
hasPackedFP32Ops()1090   bool hasPackedFP32Ops() const {
1091     return HasPackedFP32Ops;
1092   }
1093 
1094   // Has V_PK_MOV_B32 opcode
hasPkMovB32()1095   bool hasPkMovB32() const {
1096     return GFX90AInsts;
1097   }
1098 
hasFmaakFmamkF32Insts()1099   bool hasFmaakFmamkF32Insts() const {
1100     return getGeneration() >= GFX10 || hasGFX940Insts();
1101   }
1102 
hasFmaakFmamkF64Insts()1103   bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
1104 
hasImageInsts()1105   bool hasImageInsts() const {
1106     return HasImageInsts;
1107   }
1108 
hasExtendedImageInsts()1109   bool hasExtendedImageInsts() const {
1110     return HasExtendedImageInsts;
1111   }
1112 
hasR128A16()1113   bool hasR128A16() const {
1114     return HasR128A16;
1115   }
1116 
hasA16()1117   bool hasA16() const { return HasA16; }
1118 
hasG16()1119   bool hasG16() const { return HasG16; }
1120 
hasOffset3fBug()1121   bool hasOffset3fBug() const {
1122     return HasOffset3fBug;
1123   }
1124 
hasImageStoreD16Bug()1125   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1126 
hasImageGather4D16Bug()1127   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1128 
hasMADIntraFwdBug()1129   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1130 
hasMSAALoadDstSelBug()1131   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1132 
hasPrivEnabledTrap2NopBug()1133   bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1134 
hasNSAEncoding()1135   bool hasNSAEncoding() const { return HasNSAEncoding; }
1136 
hasNonNSAEncoding()1137   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1138 
hasPartialNSAEncoding()1139   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1140 
1141   unsigned getNSAMaxSize(bool HasSampler = false) const {
1142     return AMDGPU::getNSAMaxSize(*this, HasSampler);
1143   }
1144 
hasGFX10_AEncoding()1145   bool hasGFX10_AEncoding() const {
1146     return GFX10_AEncoding;
1147   }
1148 
hasGFX10_BEncoding()1149   bool hasGFX10_BEncoding() const {
1150     return GFX10_BEncoding;
1151   }
1152 
hasGFX10_3Insts()1153   bool hasGFX10_3Insts() const {
1154     return GFX10_3Insts;
1155   }
1156 
1157   bool hasMadF16() const;
1158 
hasMovB64()1159   bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
1160 
hasLshlAddU64Inst()1161   bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
1162 
enableSIScheduler()1163   bool enableSIScheduler() const {
1164     return EnableSIScheduler;
1165   }
1166 
loadStoreOptEnabled()1167   bool loadStoreOptEnabled() const {
1168     return EnableLoadStoreOpt;
1169   }
1170 
hasSGPRInitBug()1171   bool hasSGPRInitBug() const {
1172     return SGPRInitBug;
1173   }
1174 
hasUserSGPRInit16Bug()1175   bool hasUserSGPRInit16Bug() const {
1176     return UserSGPRInit16Bug && isWave32();
1177   }
1178 
hasNegativeScratchOffsetBug()1179   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1180 
hasNegativeUnalignedScratchOffsetBug()1181   bool hasNegativeUnalignedScratchOffsetBug() const {
1182     return NegativeUnalignedScratchOffsetBug;
1183   }
1184 
hasMFMAInlineLiteralBug()1185   bool hasMFMAInlineLiteralBug() const {
1186     return HasMFMAInlineLiteralBug;
1187   }
1188 
has12DWordStoreHazard()1189   bool has12DWordStoreHazard() const {
1190     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1191   }
1192 
1193   // \returns true if the subtarget supports DWORDX3 load/store instructions.
hasDwordx3LoadStores()1194   bool hasDwordx3LoadStores() const {
1195     return CIInsts;
1196   }
1197 
hasReadM0MovRelInterpHazard()1198   bool hasReadM0MovRelInterpHazard() const {
1199     return getGeneration() == AMDGPUSubtarget::GFX9;
1200   }
1201 
hasReadM0SendMsgHazard()1202   bool hasReadM0SendMsgHazard() const {
1203     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1204            getGeneration() <= AMDGPUSubtarget::GFX9;
1205   }
1206 
hasReadM0LdsDmaHazard()1207   bool hasReadM0LdsDmaHazard() const {
1208     return getGeneration() == AMDGPUSubtarget::GFX9;
1209   }
1210 
hasReadM0LdsDirectHazard()1211   bool hasReadM0LdsDirectHazard() const {
1212     return getGeneration() == AMDGPUSubtarget::GFX9;
1213   }
1214 
hasVcmpxPermlaneHazard()1215   bool hasVcmpxPermlaneHazard() const {
1216     return HasVcmpxPermlaneHazard;
1217   }
1218 
hasVMEMtoScalarWriteHazard()1219   bool hasVMEMtoScalarWriteHazard() const {
1220     return HasVMEMtoScalarWriteHazard;
1221   }
1222 
hasSMEMtoVectorWriteHazard()1223   bool hasSMEMtoVectorWriteHazard() const {
1224     return HasSMEMtoVectorWriteHazard;
1225   }
1226 
hasLDSMisalignedBug()1227   bool hasLDSMisalignedBug() const {
1228     return LDSMisalignedBug && !EnableCuMode;
1229   }
1230 
hasInstFwdPrefetchBug()1231   bool hasInstFwdPrefetchBug() const {
1232     return HasInstFwdPrefetchBug;
1233   }
1234 
hasVcmpxExecWARHazard()1235   bool hasVcmpxExecWARHazard() const {
1236     return HasVcmpxExecWARHazard;
1237   }
1238 
hasLdsBranchVmemWARHazard()1239   bool hasLdsBranchVmemWARHazard() const {
1240     return HasLdsBranchVmemWARHazard;
1241   }
1242 
1243   // Shift amount of a 64 bit shift cannot be a highest allocated register
1244   // if also at the end of the allocation block.
hasShift64HighRegBug()1245   bool hasShift64HighRegBug() const {
1246     return GFX90AInsts && !GFX940Insts;
1247   }
1248 
1249   // Has one cycle hazard on transcendental instruction feeding a
1250   // non transcendental VALU.
hasTransForwardingHazard()1251   bool hasTransForwardingHazard() const { return GFX940Insts; }
1252 
1253   // Has one cycle hazard on a VALU instruction partially writing dst with
1254   // a shift of result bits feeding another VALU instruction.
hasDstSelForwardingHazard()1255   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1256 
1257   // Cannot use op_sel with v_dot instructions.
hasDOTOpSelHazard()1258   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1259 
1260   // Does not have HW interlocs for VALU writing and then reading SGPRs.
hasVDecCoExecHazard()1261   bool hasVDecCoExecHazard() const {
1262     return GFX940Insts;
1263   }
1264 
hasNSAtoVMEMBug()1265   bool hasNSAtoVMEMBug() const {
1266     return HasNSAtoVMEMBug;
1267   }
1268 
hasNSAClauseBug()1269   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1270 
hasHardClauses()1271   bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1272 
hasGFX90AInsts()1273   bool hasGFX90AInsts() const { return GFX90AInsts; }
1274 
hasFPAtomicToDenormModeHazard()1275   bool hasFPAtomicToDenormModeHazard() const {
1276     return getGeneration() == GFX10;
1277   }
1278 
hasVOP3DPP()1279   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1280 
hasLdsDirect()1281   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1282 
hasLdsWaitVMSRC()1283   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1284 
hasVALUPartialForwardingHazard()1285   bool hasVALUPartialForwardingHazard() const {
1286     return getGeneration() == GFX11;
1287   }
1288 
hasVALUTransUseHazard()1289   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1290 
hasCvtScaleForwardingHazard()1291   bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1292 
requiresCodeObjectV6()1293   bool requiresCodeObjectV6() const { return RequiresCOV6; }
1294 
useVGPRBlockOpsForCSR()1295   bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1296 
hasVALUMaskWriteHazard()1297   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1298 
hasVALUReadSGPRHazard()1299   bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1300 
1301   /// Return if operations acting on VGPR tuples require even alignment.
needsAlignedVGPRs()1302   bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
1303 
1304   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
hasSPackHL()1305   bool hasSPackHL() const { return GFX11Insts; }
1306 
1307   /// Return true if the target's EXP instruction has the COMPR flag, which
1308   /// affects the meaning of the EN (enable) bits.
hasCompressedExport()1309   bool hasCompressedExport() const { return !GFX11Insts; }
1310 
1311   /// Return true if the target's EXP instruction supports the NULL export
1312   /// target.
hasNullExportTarget()1313   bool hasNullExportTarget() const { return !GFX11Insts; }
1314 
has1_5xVGPRs()1315   bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1316 
hasVOPDInsts()1317   bool hasVOPDInsts() const { return HasVOPDInsts; }
1318 
hasFlatScratchSVSSwizzleBug()1319   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1320 
1321   /// Return true if the target has the S_DELAY_ALU instruction.
hasDelayAlu()1322   bool hasDelayAlu() const { return GFX11Insts; }
1323 
hasPackedTID()1324   bool hasPackedTID() const { return HasPackedTID; }
1325 
1326   // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that
1327   // hasGFX90AInsts is also true.
hasGFX940Insts()1328   bool hasGFX940Insts() const { return GFX940Insts; }
1329 
1330   // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
1331   // hasGFX940Insts and hasGFX90AInsts are also true.
hasGFX950Insts()1332   bool hasGFX950Insts() const { return GFX950Insts; }
1333 
1334   /// Returns true if the target supports
1335   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1336   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
hasLDSLoadB96_B128()1337   bool hasLDSLoadB96_B128() const {
1338     return hasGFX950Insts();
1339   }
1340 
hasVMemToLDSLoad()1341   bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
1342 
hasSALUFloatInsts()1343   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1344 
hasPseudoScalarTrans()1345   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1346 
hasRestrictedSOffset()1347   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1348 
hasRequiredExportPriority()1349   bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1350 
hasVmemWriteVgprInOrder()1351   bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1352 
1353   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1354   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
hasExtendedWaitCounts()1355   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1356 
1357   /// \returns true if inline constants are not supported for F16 pseudo
1358   /// scalar transcendentals.
hasNoF16PseudoScalarTransInlineConstants()1359   bool hasNoF16PseudoScalarTransInlineConstants() const {
1360     return getGeneration() == GFX12;
1361   }
1362 
1363   /// \returns true if the target has instructions with xf32 format support.
hasXF32Insts()1364   bool hasXF32Insts() const { return HasXF32Insts; }
1365 
hasBitOp3Insts()1366   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
1367 
hasPermlane16Swap()1368   bool hasPermlane16Swap() const { return HasPermlane16Swap; }
hasPermlane32Swap()1369   bool hasPermlane32Swap() const { return HasPermlane32Swap; }
hasAshrPkInsts()1370   bool hasAshrPkInsts() const { return HasAshrPkInsts; }
1371 
hasMinimum3Maximum3F32()1372   bool hasMinimum3Maximum3F32() const {
1373     return HasMinimum3Maximum3F32;
1374   }
1375 
hasMinimum3Maximum3F16()1376   bool hasMinimum3Maximum3F16() const {
1377     return HasMinimum3Maximum3F16;
1378   }
1379 
hasMinimum3Maximum3PKF16()1380   bool hasMinimum3Maximum3PKF16() const {
1381     return HasMinimum3Maximum3PKF16;
1382   }
1383 
hasTransposeLoadF4F6Insts()1384   bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
1385 
1386   /// \returns true if the target has s_wait_xcnt insertion. Supported for
1387   /// GFX1250.
hasWaitXCnt()1388   bool hasWaitXCnt() const { return HasWaitXcnt; }
1389 
1390   // A single DWORD instructions can use a 64-bit literal.
has64BitLiterals()1391   bool has64BitLiterals() const { return Has64BitLiterals; }
1392 
hasPointSampleAccel()1393   bool hasPointSampleAccel() const { return HasPointSampleAccel; }
1394 
hasLdsBarrierArriveAtomic()1395   bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
1396 
1397   /// \returns The maximum number of instructions that can be enclosed in an
1398   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1399   /// instruction.
maxHardClauseLength()1400   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1401 
hasPrngInst()1402   bool hasPrngInst() const { return HasPrngInst; }
1403 
hasBVHDualAndBVH8Insts()1404   bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
1405 
1406   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1407   /// SGPRs
1408   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1409 
1410   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1411   /// VGPRs
1412   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
1413                                     unsigned DynamicVGPRBlockSize) const;
1414 
1415   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
1416   /// be achieved when the only function running on a CU is \p F, each workgroup
1417   /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
1418   /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
1419   /// range, so this returns a range as well.
1420   ///
1421   /// Note that occupancy can be affected by the scratch allocation as well, but
1422   /// we do not have enough information to compute it.
1423   std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
1424                                                  unsigned LDSSize = 0,
1425                                                  unsigned NumSGPRs = 0,
1426                                                  unsigned NumVGPRs = 0) const;
1427 
1428   /// \returns true if the flat_scratch register should be initialized with the
1429   /// pointer to the wave's scratch memory rather than a size and offset.
flatScratchIsPointer()1430   bool flatScratchIsPointer() const {
1431     return getGeneration() >= AMDGPUSubtarget::GFX9;
1432   }
1433 
1434   /// \returns true if the flat_scratch register is initialized by the HW.
1435   /// In this case it is readonly.
flatScratchIsArchitected()1436   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1437 
1438   /// \returns true if the architected SGPRs are enabled.
hasArchitectedSGPRs()1439   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1440 
1441   /// \returns true if Global Data Share is supported.
hasGDS()1442   bool hasGDS() const { return HasGDS; }
1443 
1444   /// \returns true if Global Wave Sync is supported.
hasGWS()1445   bool hasGWS() const { return HasGWS; }
1446 
1447   /// \returns true if the machine has merged shaders in which s0-s7 are
1448   /// reserved by the hardware and user SGPRs start at s8
hasMergedShaders()1449   bool hasMergedShaders() const {
1450     return getGeneration() >= GFX9;
1451   }
1452 
1453   // \returns true if the target supports the pre-NGG legacy geometry path.
hasLegacyGeometry()1454   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1455 
1456   // \returns true if preloading kernel arguments is supported.
hasKernargPreload()1457   bool hasKernargPreload() const { return KernargPreload; }
1458 
1459   // \returns true if the target has split barriers feature
hasSplitBarriers()1460   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1461 
1462   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
hasCvtFP8VOP1Bug()1463   bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
1464 
1465   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1466   // no-return form.
hasAtomicCSubNoRtnInsts()1467   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1468 
1469   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
hasDX10ClampMode()1470   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1471 
1472   // \returns true if the target has IEEE kernel descriptor mode bit
hasIEEEMode()1473   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1474 
1475   // \returns true if the target has IEEE fminimum/fmaximum instructions
hasIEEEMinimumMaximumInsts()1476   bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; }
1477 
1478   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
hasRrWGMode()1479   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1480 
1481   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1482   /// values.
hasSignedScratchOffsets()1483   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1484 
hasGFX1250Insts()1485   bool hasGFX1250Insts() const { return GFX1250Insts; }
1486 
hasVOPD3()1487   bool hasVOPD3() const { return GFX1250Insts; }
1488 
1489   // \returns true if target has S_SETPRIO_INC_WG instruction.
hasSetPrioIncWgInst()1490   bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
1491 
1492   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1493   // of sign-extending.
hasGetPCZeroExtension()1494   bool hasGetPCZeroExtension() const { return GFX12Insts; }
1495 
1496   /// \returns SGPR allocation granularity supported by the subtarget.
getSGPRAllocGranule()1497   unsigned getSGPRAllocGranule() const {
1498     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1499   }
1500 
1501   /// \returns SGPR encoding granularity supported by the subtarget.
getSGPREncodingGranule()1502   unsigned getSGPREncodingGranule() const {
1503     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1504   }
1505 
1506   /// \returns Total number of SGPRs supported by the subtarget.
getTotalNumSGPRs()1507   unsigned getTotalNumSGPRs() const {
1508     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1509   }
1510 
1511   /// \returns Addressable number of SGPRs supported by the subtarget.
getAddressableNumSGPRs()1512   unsigned getAddressableNumSGPRs() const {
1513     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1514   }
1515 
1516   /// \returns Minimum number of SGPRs that meets the given number of waves per
1517   /// execution unit requirement supported by the subtarget.
getMinNumSGPRs(unsigned WavesPerEU)1518   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1519     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1520   }
1521 
1522   /// \returns Maximum number of SGPRs that meets the given number of waves per
1523   /// execution unit requirement supported by the subtarget.
getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1524   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1525     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1526   }
1527 
1528   /// \returns Reserved number of SGPRs. This is common
1529   /// utility function called by MachineFunction and
1530   /// Function variants of getReservedNumSGPRs.
1531   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1532   /// \returns Reserved number of SGPRs for given machine function \p MF.
1533   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1534 
1535   /// \returns Reserved number of SGPRs for given function \p F.
1536   unsigned getReservedNumSGPRs(const Function &F) const;
1537 
1538   /// \returns Maximum number of preloaded SGPRs for the subtarget.
1539   unsigned getMaxNumPreloadedSGPRs() const;
1540 
1541   /// \returns max num SGPRs. This is the common utility
1542   /// function called by MachineFunction and Function
1543   /// variants of getMaxNumSGPRs.
1544   unsigned getBaseMaxNumSGPRs(const Function &F,
1545                               std::pair<unsigned, unsigned> WavesPerEU,
1546                               unsigned PreloadedSGPRs,
1547                               unsigned ReservedNumSGPRs) const;
1548 
1549   /// \returns Maximum number of SGPRs that meets number of waves per execution
1550   /// unit requirement for function \p MF, or number of SGPRs explicitly
1551   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1552   ///
1553   /// \returns Value that meets number of waves per execution unit requirement
1554   /// if explicitly requested value cannot be converted to integer, violates
1555   /// subtarget's specifications, or does not meet number of waves per execution
1556   /// unit requirement.
1557   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1558 
1559   /// \returns Maximum number of SGPRs that meets number of waves per execution
1560   /// unit requirement for function \p F, or number of SGPRs explicitly
1561   /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1562   ///
1563   /// \returns Value that meets number of waves per execution unit requirement
1564   /// if explicitly requested value cannot be converted to integer, violates
1565   /// subtarget's specifications, or does not meet number of waves per execution
1566   /// unit requirement.
1567   unsigned getMaxNumSGPRs(const Function &F) const;
1568 
1569   /// \returns VGPR allocation granularity supported by the subtarget.
getVGPRAllocGranule(unsigned DynamicVGPRBlockSize)1570   unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
1571     return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
1572   }
1573 
1574   /// \returns VGPR encoding granularity supported by the subtarget.
getVGPREncodingGranule()1575   unsigned getVGPREncodingGranule() const {
1576     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1577   }
1578 
1579   /// \returns Total number of VGPRs supported by the subtarget.
getTotalNumVGPRs()1580   unsigned getTotalNumVGPRs() const {
1581     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1582   }
1583 
1584   /// \returns Addressable number of architectural VGPRs supported by the
1585   /// subtarget.
getAddressableNumArchVGPRs()1586   unsigned getAddressableNumArchVGPRs() const {
1587     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1588   }
1589 
1590   /// \returns Addressable number of VGPRs supported by the subtarget.
getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize)1591   unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
1592     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
1593   }
1594 
1595   /// \returns the minimum number of VGPRs that will prevent achieving more than
1596   /// the specified number of waves \p WavesPerEU.
getMinNumVGPRs(unsigned WavesPerEU,unsigned DynamicVGPRBlockSize)1597   unsigned getMinNumVGPRs(unsigned WavesPerEU,
1598                           unsigned DynamicVGPRBlockSize) const {
1599     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
1600                                            DynamicVGPRBlockSize);
1601   }
1602 
1603   /// \returns the maximum number of VGPRs that can be used and still achieved
1604   /// at least the specified number of waves \p WavesPerEU.
getMaxNumVGPRs(unsigned WavesPerEU,unsigned DynamicVGPRBlockSize)1605   unsigned getMaxNumVGPRs(unsigned WavesPerEU,
1606                           unsigned DynamicVGPRBlockSize) const {
1607     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
1608                                            DynamicVGPRBlockSize);
1609   }
1610 
1611   /// \returns max num VGPRs. This is the common utility function
1612   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1613   unsigned
1614   getBaseMaxNumVGPRs(const Function &F,
1615                      std::pair<unsigned, unsigned> NumVGPRBounds) const;
1616 
1617   /// \returns Maximum number of VGPRs that meets number of waves per execution
1618   /// unit requirement for function \p F, or number of VGPRs explicitly
1619   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1620   ///
1621   /// \returns Value that meets number of waves per execution unit requirement
1622   /// if explicitly requested value cannot be converted to integer, violates
1623   /// subtarget's specifications, or does not meet number of waves per execution
1624   /// unit requirement.
1625   unsigned getMaxNumVGPRs(const Function &F) const;
1626 
getMaxNumAGPRs(const Function & F)1627   unsigned getMaxNumAGPRs(const Function &F) const {
1628     return getMaxNumVGPRs(F);
1629   }
1630 
1631   /// \returns Maximum number of VGPRs that meets number of waves per execution
1632   /// unit requirement for function \p MF, or number of VGPRs explicitly
1633   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1634   ///
1635   /// \returns Value that meets number of waves per execution unit requirement
1636   /// if explicitly requested value cannot be converted to integer, violates
1637   /// subtarget's specifications, or does not meet number of waves per execution
1638   /// unit requirement.
1639   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1640 
isWave32()1641   bool isWave32() const {
1642     return getWavefrontSize() == 32;
1643   }
1644 
isWave64()1645   bool isWave64() const {
1646     return getWavefrontSize() == 64;
1647   }
1648 
1649   /// Returns if the wavesize of this subtarget is known reliable. This is false
1650   /// only for the a default target-cpu that does not have an explicit
1651   /// +wavefrontsize target feature.
isWaveSizeKnown()1652   bool isWaveSizeKnown() const {
1653     return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
1654            hasFeature(AMDGPU::FeatureWavefrontSize64);
1655   }
1656 
getBoolRC()1657   const TargetRegisterClass *getBoolRC() const {
1658     return getRegisterInfo()->getBoolRC();
1659   }
1660 
1661   /// \returns Maximum number of work groups per compute unit supported by the
1662   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1663   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1664     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1665   }
1666 
1667   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()1668   unsigned getMinFlatWorkGroupSize() const override {
1669     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1670   }
1671 
1672   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()1673   unsigned getMaxFlatWorkGroupSize() const override {
1674     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1675   }
1676 
1677   /// \returns Number of waves per execution unit required to support the given
1678   /// \p FlatWorkGroupSize.
1679   unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1680   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1681     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1682   }
1683 
1684   /// \returns Minimum number of waves per execution unit supported by the
1685   /// subtarget.
getMinWavesPerEU()1686   unsigned getMinWavesPerEU() const override {
1687     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1688   }
1689 
  /// Override of the inherited scheduling-dependency hook; only declared here,
  /// the definition lives out of line.
  ///
  /// NOTE(review): presumably updates \p Dep for the dependency from operand
  /// \p DefOpIdx of \p Def to operand \p UseOpIdx of \p Use, optionally using
  /// \p SchedModel -- confirm against the definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;
1693 
1694   // \returns true if it's beneficial on this subtarget for the scheduler to
1695   // cluster stores as well as loads.
shouldClusterStores()1696   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1697 
  /// \returns the number of address arguments from which to enable MIMG NSA
  /// on supported architectures, for function \p MF. Only declared here; the
  /// definition lives out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
1701 
1702   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1703   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
requiresNopBeforeDeallocVGPRs()1704   bool requiresNopBeforeDeallocVGPRs() const {
1705     // Currently all targets that support the dealloc VGPRs message also require
1706     // the nop.
1707     return true;
1708   }
1709 
isDynamicVGPREnabled()1710   bool isDynamicVGPREnabled() const { return DynamicVGPR; }
getDynamicVGPRBlockSize()1711   unsigned getDynamicVGPRBlockSize() const {
1712     return DynamicVGPRBlockSize32 ? 32 : 16;
1713   }
1714 
requiresDisjointEarlyClobberAndUndef()1715   bool requiresDisjointEarlyClobberAndUndef() const override {
1716     // AMDGPU doesn't care if early-clobber and undef operands are allocated
1717     // to the same register.
1718     return false;
1719   }
1720 };
1721 
1722 class GCNUserSGPRUsageInfo {
1723 public:
hasImplicitBufferPtr()1724   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1725 
hasPrivateSegmentBuffer()1726   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1727 
hasDispatchPtr()1728   bool hasDispatchPtr() const { return DispatchPtr; }
1729 
hasQueuePtr()1730   bool hasQueuePtr() const { return QueuePtr; }
1731 
hasKernargSegmentPtr()1732   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1733 
hasDispatchID()1734   bool hasDispatchID() const { return DispatchID; }
1735 
hasFlatScratchInit()1736   bool hasFlatScratchInit() const { return FlatScratchInit; }
1737 
hasPrivateSegmentSize()1738   bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1739 
getNumKernargPreloadSGPRs()1740   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1741 
getNumUsedUserSGPRs()1742   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1743 
1744   unsigned getNumFreeUserSGPRs();
1745 
1746   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1747 
1748   enum UserSGPRID : unsigned {
1749     ImplicitBufferPtrID = 0,
1750     PrivateSegmentBufferID = 1,
1751     DispatchPtrID = 2,
1752     QueuePtrID = 3,
1753     KernargSegmentPtrID = 4,
1754     DispatchIdID = 5,
1755     FlatScratchInitID = 6,
1756     PrivateSegmentSizeID = 7
1757   };
1758 
1759   // Returns the size in number of SGPRs for preload user SGPR field.
getNumUserSGPRForField(UserSGPRID ID)1760   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1761     switch (ID) {
1762     case ImplicitBufferPtrID:
1763       return 2;
1764     case PrivateSegmentBufferID:
1765       return 4;
1766     case DispatchPtrID:
1767       return 2;
1768     case QueuePtrID:
1769       return 2;
1770     case KernargSegmentPtrID:
1771       return 2;
1772     case DispatchIdID:
1773       return 2;
1774     case FlatScratchInitID:
1775       return 2;
1776     case PrivateSegmentSizeID:
1777       return 1;
1778     }
1779     llvm_unreachable("Unknown UserSGPRID.");
1780   }
1781 
1782   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1783 
1784 private:
1785   const GCNSubtarget &ST;
1786 
1787   // Private memory buffer
1788   // Compute directly in sgpr[0:1]
1789   // Other shaders indirect 64-bits at sgpr[0:1]
1790   bool ImplicitBufferPtr = false;
1791 
1792   bool PrivateSegmentBuffer = false;
1793 
1794   bool DispatchPtr = false;
1795 
1796   bool QueuePtr = false;
1797 
1798   bool KernargSegmentPtr = false;
1799 
1800   bool DispatchID = false;
1801 
1802   bool FlatScratchInit = false;
1803 
1804   bool PrivateSegmentSize = false;
1805 
1806   unsigned NumKernargPreloadSGPRs = 0;
1807 
1808   unsigned NumUsedUserSGPRs = 0;
1809 };
1810 
1811 } // end namespace llvm
1812 
1813 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1814