xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 6c4b055cfb6bf549e9145dde6454cc6b178c35e4)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define GET_SUBTARGETINFO_HEADER
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
30 namespace llvm {
31 
32 class GCNTargetMachine;
33 
34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35                            public AMDGPUSubtarget {
36 public:
37   using AMDGPUSubtarget::getMaxWavesPerEU;
38 
39   // Following 2 enums are documented at:
40   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI identifiers. getTrapHandlerAbi() reports AMDHSA when
/// isAmdHsaOS() holds and NONE otherwise.
enum class TrapHandlerAbi {
  /// No trap handler ABI (value 0x00).
  NONE = 0x00,
  /// AMDHSA trap handler ABI (value 0x01).
  AMDHSA = 0x01,
};
45 
/// Numeric trap identifiers.
enum class TrapID {
  /// Value 0x02.
  LLVMAMDHSATrap = 0x02,
  /// Value 0x03.
  LLVMAMDHSADebugTrap = 0x03,
};
50 
51 private:
52   /// GlobalISel related APIs.
53   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55   std::unique_ptr<InstructionSelector> InstSelector;
56   std::unique_ptr<LegalizerInfo> Legalizer;
57   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58 
59 protected:
60   // Basic subtarget description.
61   Triple TargetTriple;
62   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63   unsigned Gen = INVALID;
64   InstrItineraryData InstrItins;
65   int LDSBankCount = 0;
66   unsigned MaxPrivateElementSize = 0;
67 
68   // Possibly statically set by tablegen, but may want to be overridden.
69   bool FastDenormalF32 = false;
70   bool HalfRate64Ops = false;
71   bool FullRate64Ops = false;
72 
73   // Dynamically set bits that enable features.
74   bool FlatForGlobal = false;
75   bool AutoWaitcntBeforeBarrier = false;
76   bool BackOffBarrier = false;
77   bool UnalignedScratchAccess = false;
78   bool UnalignedAccessMode = false;
79   bool HasApertureRegs = false;
80   bool SupportsXNACK = false;
81   bool KernargPreload = false;
82 
83   // This should not be used directly. 'TargetID' tracks the dynamic settings
84   // for XNACK.
85   bool EnableXNACK = false;
86 
87   bool EnableTgSplit = false;
88   bool EnableCuMode = false;
89   bool TrapHandler = false;
90   bool EnablePreciseMemory = false;
91 
92   // Used as options.
93   bool EnableLoadStoreOpt = false;
94   bool EnableUnsafeDSOffsetFolding = false;
95   bool EnableSIScheduler = false;
96   bool EnableDS128 = false;
97   bool EnablePRTStrictNull = false;
98   bool DumpCode = false;
99 
  // Subtarget properties statically set by tablegen.
101   bool FP64 = false;
102   bool FMA = false;
103   bool MIMG_R128 = false;
104   bool CIInsts = false;
105   bool GFX8Insts = false;
106   bool GFX9Insts = false;
107   bool GFX90AInsts = false;
108   bool GFX940Insts = false;
109   bool GFX10Insts = false;
110   bool GFX11Insts = false;
111   bool GFX12Insts = false;
112   bool GFX10_3Insts = false;
113   bool GFX7GFX8GFX9Insts = false;
114   bool SGPRInitBug = false;
115   bool UserSGPRInit16Bug = false;
116   bool NegativeScratchOffsetBug = false;
117   bool NegativeUnalignedScratchOffsetBug = false;
118   bool HasSMemRealTime = false;
119   bool HasIntClamp = false;
120   bool HasFmaMixInsts = false;
121   bool HasMovrel = false;
122   bool HasVGPRIndexMode = false;
123   bool HasScalarDwordx3Loads = false;
124   bool HasScalarStores = false;
125   bool HasScalarAtomics = false;
126   bool HasSDWAOmod = false;
127   bool HasSDWAScalar = false;
128   bool HasSDWASdst = false;
129   bool HasSDWAMac = false;
130   bool HasSDWAOutModsVOPC = false;
131   bool HasDPP = false;
132   bool HasDPP8 = false;
133   bool HasDPALU_DPP = false;
134   bool HasDPPSrc1SGPR = false;
135   bool HasPackedFP32Ops = false;
136   bool HasImageInsts = false;
137   bool HasExtendedImageInsts = false;
138   bool HasR128A16 = false;
139   bool HasA16 = false;
140   bool HasG16 = false;
141   bool HasNSAEncoding = false;
142   bool HasPartialNSAEncoding = false;
143   bool GFX10_AEncoding = false;
144   bool GFX10_BEncoding = false;
145   bool HasDLInsts = false;
146   bool HasFmacF64Inst = false;
147   bool HasDot1Insts = false;
148   bool HasDot2Insts = false;
149   bool HasDot3Insts = false;
150   bool HasDot4Insts = false;
151   bool HasDot5Insts = false;
152   bool HasDot6Insts = false;
153   bool HasDot7Insts = false;
154   bool HasDot8Insts = false;
155   bool HasDot9Insts = false;
156   bool HasDot10Insts = false;
157   bool HasDot11Insts = false;
158   bool HasMAIInsts = false;
159   bool HasFP8Insts = false;
160   bool HasFP8ConversionInsts = false;
161   bool HasPkFmacF16Inst = false;
162   bool HasAtomicFMinFMaxF32GlobalInsts = false;
163   bool HasAtomicFMinFMaxF64GlobalInsts = false;
164   bool HasAtomicFMinFMaxF32FlatInsts = false;
165   bool HasAtomicFMinFMaxF64FlatInsts = false;
166   bool HasAtomicDsPkAdd16Insts = false;
167   bool HasAtomicFlatPkAdd16Insts = false;
168   bool HasAtomicFaddRtnInsts = false;
169   bool HasAtomicFaddNoRtnInsts = false;
170   bool HasMemoryAtomicFaddF32DenormalSupport = false;
171   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
172   bool HasAtomicBufferGlobalPkAddF16Insts = false;
173   bool HasAtomicCSubNoRtnInsts = false;
174   bool HasAtomicGlobalPkAddBF16Inst = false;
175   bool HasAtomicBufferPkAddBF16Inst = false;
176   bool HasFlatAtomicFaddF32Inst = false;
177   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
178   bool HasDefaultComponentZero = false;
179   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
180   bool HasDefaultComponentBroadcast = false;
181   /// The maximum number of instructions that may be placed within an S_CLAUSE,
182   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
183   /// indicates a lack of S_CLAUSE support.
184   unsigned MaxHardClauseLength = 0;
185   bool SupportsSRAMECC = false;
186 
187   // This should not be used directly. 'TargetID' tracks the dynamic settings
188   // for SRAMECC.
189   bool EnableSRAMECC = false;
190 
191   bool HasNoSdstCMPX = false;
192   bool HasVscnt = false;
193   bool HasGetWaveIdInst = false;
194   bool HasSMemTimeInst = false;
195   bool HasShaderCyclesRegister = false;
196   bool HasShaderCyclesHiLoRegisters = false;
197   bool HasVOP3Literal = false;
198   bool HasNoDataDepHazard = false;
199   bool FlatAddressSpace = false;
200   bool FlatInstOffsets = false;
201   bool FlatGlobalInsts = false;
202   bool FlatScratchInsts = false;
203   bool ScalarFlatScratchInsts = false;
204   bool HasArchitectedFlatScratch = false;
205   bool EnableFlatScratch = false;
206   bool HasArchitectedSGPRs = false;
207   bool HasGDS = false;
208   bool HasGWS = false;
209   bool AddNoCarryInsts = false;
210   bool HasUnpackedD16VMem = false;
211   bool LDSMisalignedBug = false;
212   bool HasMFMAInlineLiteralBug = false;
213   bool UnalignedBufferAccess = false;
214   bool UnalignedDSAccess = false;
215   bool HasPackedTID = false;
216   bool ScalarizeGlobal = false;
217   bool HasSALUFloatInsts = false;
218   bool HasVGPRSingleUseHintInsts = false;
219   bool HasPseudoScalarTrans = false;
220   bool HasRestrictedSOffset = false;
221 
222   bool HasVcmpxPermlaneHazard = false;
223   bool HasVMEMtoScalarWriteHazard = false;
224   bool HasSMEMtoVectorWriteHazard = false;
225   bool HasInstFwdPrefetchBug = false;
226   bool HasVcmpxExecWARHazard = false;
227   bool HasLdsBranchVmemWARHazard = false;
228   bool HasNSAtoVMEMBug = false;
229   bool HasNSAClauseBug = false;
230   bool HasOffset3fBug = false;
231   bool HasFlatSegmentOffsetBug = false;
232   bool HasImageStoreD16Bug = false;
233   bool HasImageGather4D16Bug = false;
234   bool HasMSAALoadDstSelBug = false;
235   bool HasPrivEnabledTrap2NopBug = false;
236   bool Has1_5xVGPRs = false;
237   bool HasMADIntraFwdBug = false;
238   bool HasVOPDInsts = false;
239   bool HasVALUTransUseHazard = false;
240   bool HasForceStoreSC0SC1 = false;
241   bool HasRequiredExportPriority = false;
242   bool HasVmemWriteVgprInOrder = false;
243 
244   bool RequiresCOV6 = false;
245 
246   // Dummy feature to use for assembler in tablegen.
247   bool FeatureDisable = false;
248 
249   SelectionDAGTargetInfo TSInfo;
250 private:
251   SIInstrInfo InstrInfo;
252   SITargetLowering TLInfo;
253   SIFrameLowering FrameLowering;
254 
255 public:
256   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
257                const GCNTargetMachine &TM);
258   ~GCNSubtarget() override;
259 
260   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
261                                                    StringRef GPU, StringRef FS);
262 
263   /// Diagnose inconsistent subtarget features before attempting to codegen
264   /// function \p F.
265   void checkSubtargetFeatures(const Function &F) const;
266 
getInstrInfo()267   const SIInstrInfo *getInstrInfo() const override {
268     return &InstrInfo;
269   }
270 
getFrameLowering()271   const SIFrameLowering *getFrameLowering() const override {
272     return &FrameLowering;
273   }
274 
getTargetLowering()275   const SITargetLowering *getTargetLowering() const override {
276     return &TLInfo;
277   }
278 
getRegisterInfo()279   const SIRegisterInfo *getRegisterInfo() const override {
280     return &InstrInfo.getRegisterInfo();
281   }
282 
getCallLowering()283   const CallLowering *getCallLowering() const override {
284     return CallLoweringInfo.get();
285   }
286 
getInlineAsmLowering()287   const InlineAsmLowering *getInlineAsmLowering() const override {
288     return InlineAsmLoweringInfo.get();
289   }
290 
getInstructionSelector()291   InstructionSelector *getInstructionSelector() const override {
292     return InstSelector.get();
293   }
294 
getLegalizerInfo()295   const LegalizerInfo *getLegalizerInfo() const override {
296     return Legalizer.get();
297   }
298 
getRegBankInfo()299   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
300     return RegBankInfo.get();
301   }
302 
getTargetID()303   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
304     return TargetID;
305   }
306 
307   // Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()308   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
309     return &TSInfo;
310   }
311 
getInstrItineraryData()312   const InstrItineraryData *getInstrItineraryData() const override {
313     return &InstrItins;
314   }
315 
316   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
317 
getGeneration()318   Generation getGeneration() const {
319     return (Generation)Gen;
320   }
321 
getMaxWaveScratchSize()322   unsigned getMaxWaveScratchSize() const {
323     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
324     if (getGeneration() >= GFX12) {
325       // 18-bit field in units of 64-dword.
326       return (64 * 4) * ((1 << 18) - 1);
327     }
328     if (getGeneration() == GFX11) {
329       // 15-bit field in units of 64-dword.
330       return (64 * 4) * ((1 << 15) - 1);
331     }
332     // 13-bit field in units of 256-dword.
333     return (256 * 4) * ((1 << 13) - 1);
334   }
335 
336   /// Return the number of high bits known to be zero for a frame index.
getKnownHighZeroBitsForFrameIndex()337   unsigned getKnownHighZeroBitsForFrameIndex() const {
338     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
339   }
340 
getLDSBankCount()341   int getLDSBankCount() const {
342     return LDSBankCount;
343   }
344 
345   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
346     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
347   }
348 
349   unsigned getConstantBusLimit(unsigned Opcode) const;
350 
351   /// Returns if the result of this instruction with a 16-bit result returned in
352   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
353   /// the original value.
354   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
355 
supportsWGP()356   bool supportsWGP() const { return getGeneration() >= GFX10; }
357 
hasIntClamp()358   bool hasIntClamp() const {
359     return HasIntClamp;
360   }
361 
hasFP64()362   bool hasFP64() const {
363     return FP64;
364   }
365 
hasMIMG_R128()366   bool hasMIMG_R128() const {
367     return MIMG_R128;
368   }
369 
hasHWFP64()370   bool hasHWFP64() const {
371     return FP64;
372   }
373 
hasHalfRate64Ops()374   bool hasHalfRate64Ops() const {
375     return HalfRate64Ops;
376   }
377 
hasFullRate64Ops()378   bool hasFullRate64Ops() const {
379     return FullRate64Ops;
380   }
381 
hasAddr64()382   bool hasAddr64() const {
383     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
384   }
385 
hasFlat()386   bool hasFlat() const {
387     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
388   }
389 
390   // Return true if the target only has the reverse operand versions of VALU
391   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
hasOnlyRevVALUShifts()392   bool hasOnlyRevVALUShifts() const {
393     return getGeneration() >= VOLCANIC_ISLANDS;
394   }
395 
hasFractBug()396   bool hasFractBug() const {
397     return getGeneration() == SOUTHERN_ISLANDS;
398   }
399 
hasBFE()400   bool hasBFE() const {
401     return true;
402   }
403 
hasBFI()404   bool hasBFI() const {
405     return true;
406   }
407 
hasBFM()408   bool hasBFM() const {
409     return hasBFE();
410   }
411 
hasBCNT(unsigned Size)412   bool hasBCNT(unsigned Size) const {
413     return true;
414   }
415 
hasFFBL()416   bool hasFFBL() const {
417     return true;
418   }
419 
hasFFBH()420   bool hasFFBH() const {
421     return true;
422   }
423 
hasMed3_16()424   bool hasMed3_16() const {
425     return getGeneration() >= AMDGPUSubtarget::GFX9;
426   }
427 
hasMin3Max3_16()428   bool hasMin3Max3_16() const {
429     return getGeneration() >= AMDGPUSubtarget::GFX9;
430   }
431 
hasFmaMixInsts()432   bool hasFmaMixInsts() const {
433     return HasFmaMixInsts;
434   }
435 
hasCARRY()436   bool hasCARRY() const {
437     return true;
438   }
439 
hasFMA()440   bool hasFMA() const {
441     return FMA;
442   }
443 
hasSwap()444   bool hasSwap() const {
445     return GFX9Insts;
446   }
447 
hasScalarPackInsts()448   bool hasScalarPackInsts() const {
449     return GFX9Insts;
450   }
451 
hasScalarMulHiInsts()452   bool hasScalarMulHiInsts() const {
453     return GFX9Insts;
454   }
455 
hasScalarSubwordLoads()456   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
457 
getTrapHandlerAbi()458   TrapHandlerAbi getTrapHandlerAbi() const {
459     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
460   }
461 
supportsGetDoorbellID()462   bool supportsGetDoorbellID() const {
463     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
464     return getGeneration() >= GFX9;
465   }
466 
467   /// True if the offset field of DS instructions works as expected. On SI, the
468   /// offset uses a 16-bit adder and does not always wrap properly.
hasUsableDSOffset()469   bool hasUsableDSOffset() const {
470     return getGeneration() >= SEA_ISLANDS;
471   }
472 
unsafeDSOffsetFoldingEnabled()473   bool unsafeDSOffsetFoldingEnabled() const {
474     return EnableUnsafeDSOffsetFolding;
475   }
476 
477   /// Condition output from div_scale is usable.
hasUsableDivScaleConditionOutput()478   bool hasUsableDivScaleConditionOutput() const {
479     return getGeneration() != SOUTHERN_ISLANDS;
480   }
481 
482   /// Extra wait hazard is needed in some cases before
483   /// s_cbranch_vccnz/s_cbranch_vccz.
hasReadVCCZBug()484   bool hasReadVCCZBug() const {
485     return getGeneration() <= SEA_ISLANDS;
486   }
487 
488   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
partialVCCWritesUpdateVCCZ()489   bool partialVCCWritesUpdateVCCZ() const {
490     return getGeneration() >= GFX10;
491   }
492 
493   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
494   /// was written by a VALU instruction.
hasSMRDReadVALUDefHazard()495   bool hasSMRDReadVALUDefHazard() const {
496     return getGeneration() == SOUTHERN_ISLANDS;
497   }
498 
499   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
500   /// SGPR was written by a VALU Instruction.
hasVMEMReadSGPRVALUDefHazard()501   bool hasVMEMReadSGPRVALUDefHazard() const {
502     return getGeneration() >= VOLCANIC_ISLANDS;
503   }
504 
hasRFEHazards()505   bool hasRFEHazards() const {
506     return getGeneration() >= VOLCANIC_ISLANDS;
507   }
508 
509   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
getSetRegWaitStates()510   unsigned getSetRegWaitStates() const {
511     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
512   }
513 
dumpCode()514   bool dumpCode() const {
515     return DumpCode;
516   }
517 
518   /// Return the amount of LDS that can be used that will not restrict the
519   /// occupancy lower than WaveCount.
520   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
521                                            const Function &) const;
522 
supportsMinMaxDenormModes()523   bool supportsMinMaxDenormModes() const {
524     return getGeneration() >= AMDGPUSubtarget::GFX9;
525   }
526 
527   /// \returns If target supports S_DENORM_MODE.
hasDenormModeInst()528   bool hasDenormModeInst() const {
529     return getGeneration() >= AMDGPUSubtarget::GFX10;
530   }
531 
useFlatForGlobal()532   bool useFlatForGlobal() const {
533     return FlatForGlobal;
534   }
535 
536   /// \returns If target supports ds_read/write_b128 and user enables generation
537   /// of ds_read/write_b128.
useDS128()538   bool useDS128() const {
539     return CIInsts && EnableDS128;
540   }
541 
542   /// \return If target supports ds_read/write_b96/128.
hasDS96AndDS128()543   bool hasDS96AndDS128() const {
544     return CIInsts;
545   }
546 
547   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
haveRoundOpsF64()548   bool haveRoundOpsF64() const {
549     return CIInsts;
550   }
551 
552   /// \returns If MUBUF instructions always perform range checking, even for
553   /// buffer resources used for private memory access.
privateMemoryResourceIsRangeChecked()554   bool privateMemoryResourceIsRangeChecked() const {
555     return getGeneration() < AMDGPUSubtarget::GFX9;
556   }
557 
558   /// \returns If target requires PRT Struct NULL support (zero result registers
559   /// for sparse texture support).
usePRTStrictNull()560   bool usePRTStrictNull() const {
561     return EnablePRTStrictNull;
562   }
563 
hasAutoWaitcntBeforeBarrier()564   bool hasAutoWaitcntBeforeBarrier() const {
565     return AutoWaitcntBeforeBarrier;
566   }
567 
568   /// \returns true if the target supports backing off of s_barrier instructions
569   /// when an exception is raised.
supportsBackOffBarrier()570   bool supportsBackOffBarrier() const {
571     return BackOffBarrier;
572   }
573 
hasUnalignedBufferAccess()574   bool hasUnalignedBufferAccess() const {
575     return UnalignedBufferAccess;
576   }
577 
hasUnalignedBufferAccessEnabled()578   bool hasUnalignedBufferAccessEnabled() const {
579     return UnalignedBufferAccess && UnalignedAccessMode;
580   }
581 
hasUnalignedDSAccess()582   bool hasUnalignedDSAccess() const {
583     return UnalignedDSAccess;
584   }
585 
hasUnalignedDSAccessEnabled()586   bool hasUnalignedDSAccessEnabled() const {
587     return UnalignedDSAccess && UnalignedAccessMode;
588   }
589 
hasUnalignedScratchAccess()590   bool hasUnalignedScratchAccess() const {
591     return UnalignedScratchAccess;
592   }
593 
hasUnalignedAccessMode()594   bool hasUnalignedAccessMode() const {
595     return UnalignedAccessMode;
596   }
597 
hasApertureRegs()598   bool hasApertureRegs() const {
599     return HasApertureRegs;
600   }
601 
isTrapHandlerEnabled()602   bool isTrapHandlerEnabled() const {
603     return TrapHandler;
604   }
605 
isXNACKEnabled()606   bool isXNACKEnabled() const {
607     return TargetID.isXnackOnOrAny();
608   }
609 
isTgSplitEnabled()610   bool isTgSplitEnabled() const {
611     return EnableTgSplit;
612   }
613 
isCuModeEnabled()614   bool isCuModeEnabled() const {
615     return EnableCuMode;
616   }
617 
isPreciseMemoryEnabled()618   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
619 
hasFlatAddressSpace()620   bool hasFlatAddressSpace() const {
621     return FlatAddressSpace;
622   }
623 
hasFlatScrRegister()624   bool hasFlatScrRegister() const {
625     return hasFlatAddressSpace();
626   }
627 
hasFlatInstOffsets()628   bool hasFlatInstOffsets() const {
629     return FlatInstOffsets;
630   }
631 
hasFlatGlobalInsts()632   bool hasFlatGlobalInsts() const {
633     return FlatGlobalInsts;
634   }
635 
hasFlatScratchInsts()636   bool hasFlatScratchInsts() const {
637     return FlatScratchInsts;
638   }
639 
640   // Check if target supports ST addressing mode with FLAT scratch instructions.
641   // The ST addressing mode means no registers are used, either VGPR or SGPR,
642   // but only immediate offset is swizzled and added to the FLAT scratch base.
hasFlatScratchSTMode()643   bool hasFlatScratchSTMode() const {
644     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
645   }
646 
hasFlatScratchSVSMode()647   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
648 
hasScalarFlatScratchInsts()649   bool hasScalarFlatScratchInsts() const {
650     return ScalarFlatScratchInsts;
651   }
652 
enableFlatScratch()653   bool enableFlatScratch() const {
654     return flatScratchIsArchitected() ||
655            (EnableFlatScratch && hasFlatScratchInsts());
656   }
657 
hasGlobalAddTidInsts()658   bool hasGlobalAddTidInsts() const {
659     return GFX10_BEncoding;
660   }
661 
hasAtomicCSub()662   bool hasAtomicCSub() const {
663     return GFX10_BEncoding;
664   }
665 
hasExportInsts()666   bool hasExportInsts() const {
667     return !hasGFX940Insts();
668   }
669 
hasVINTERPEncoding()670   bool hasVINTERPEncoding() const {
671     return GFX11Insts;
672   }
673 
674   // DS_ADD_F64/DS_ADD_RTN_F64
hasLdsAtomicAddF64()675   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
676 
hasMultiDwordFlatScratchAddressing()677   bool hasMultiDwordFlatScratchAddressing() const {
678     return getGeneration() >= GFX9;
679   }
680 
hasFlatSegmentOffsetBug()681   bool hasFlatSegmentOffsetBug() const {
682     return HasFlatSegmentOffsetBug;
683   }
684 
hasFlatLgkmVMemCountInOrder()685   bool hasFlatLgkmVMemCountInOrder() const {
686     return getGeneration() > GFX9;
687   }
688 
hasD16LoadStore()689   bool hasD16LoadStore() const {
690     return getGeneration() >= GFX9;
691   }
692 
d16PreservesUnusedBits()693   bool d16PreservesUnusedBits() const {
694     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
695   }
696 
hasD16Images()697   bool hasD16Images() const {
698     return getGeneration() >= VOLCANIC_ISLANDS;
699   }
700 
701   /// Return if most LDS instructions have an m0 use that require m0 to be
702   /// initialized.
ldsRequiresM0Init()703   bool ldsRequiresM0Init() const {
704     return getGeneration() < GFX9;
705   }
706 
707   // True if the hardware rewinds and replays GWS operations if a wave is
708   // preempted.
709   //
710   // If this is false, a GWS operation requires testing if a nack set the
711   // MEM_VIOL bit, and repeating if so.
hasGWSAutoReplay()712   bool hasGWSAutoReplay() const {
713     return getGeneration() >= GFX9;
714   }
715 
716   /// \returns if target has ds_gws_sema_release_all instruction.
hasGWSSemaReleaseAll()717   bool hasGWSSemaReleaseAll() const {
718     return CIInsts;
719   }
720 
721   /// \returns true if the target has integer add/sub instructions that do not
722   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
723   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
724   /// for saturation.
hasAddNoCarry()725   bool hasAddNoCarry() const {
726     return AddNoCarryInsts;
727   }
728 
hasScalarAddSub64()729   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
730 
hasScalarSMulU64()731   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
732 
hasUnpackedD16VMem()733   bool hasUnpackedD16VMem() const {
734     return HasUnpackedD16VMem;
735   }
736 
737   // Covers VS/PS/CS graphics shaders
isMesaGfxShader(const Function & F)738   bool isMesaGfxShader(const Function &F) const {
739     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
740   }
741 
hasMad64_32()742   bool hasMad64_32() const {
743     return getGeneration() >= SEA_ISLANDS;
744   }
745 
hasSDWAOmod()746   bool hasSDWAOmod() const {
747     return HasSDWAOmod;
748   }
749 
hasSDWAScalar()750   bool hasSDWAScalar() const {
751     return HasSDWAScalar;
752   }
753 
hasSDWASdst()754   bool hasSDWASdst() const {
755     return HasSDWASdst;
756   }
757 
hasSDWAMac()758   bool hasSDWAMac() const {
759     return HasSDWAMac;
760   }
761 
hasSDWAOutModsVOPC()762   bool hasSDWAOutModsVOPC() const {
763     return HasSDWAOutModsVOPC;
764   }
765 
hasDLInsts()766   bool hasDLInsts() const {
767     return HasDLInsts;
768   }
769 
hasFmacF64Inst()770   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
771 
hasDot1Insts()772   bool hasDot1Insts() const {
773     return HasDot1Insts;
774   }
775 
hasDot2Insts()776   bool hasDot2Insts() const {
777     return HasDot2Insts;
778   }
779 
hasDot3Insts()780   bool hasDot3Insts() const {
781     return HasDot3Insts;
782   }
783 
hasDot4Insts()784   bool hasDot4Insts() const {
785     return HasDot4Insts;
786   }
787 
hasDot5Insts()788   bool hasDot5Insts() const {
789     return HasDot5Insts;
790   }
791 
hasDot6Insts()792   bool hasDot6Insts() const {
793     return HasDot6Insts;
794   }
795 
hasDot7Insts()796   bool hasDot7Insts() const {
797     return HasDot7Insts;
798   }
799 
hasDot8Insts()800   bool hasDot8Insts() const {
801     return HasDot8Insts;
802   }
803 
hasDot9Insts()804   bool hasDot9Insts() const {
805     return HasDot9Insts;
806   }
807 
hasDot10Insts()808   bool hasDot10Insts() const {
809     return HasDot10Insts;
810   }
811 
hasDot11Insts()812   bool hasDot11Insts() const {
813     return HasDot11Insts;
814   }
815 
hasMAIInsts()816   bool hasMAIInsts() const {
817     return HasMAIInsts;
818   }
819 
hasFP8Insts()820   bool hasFP8Insts() const {
821     return HasFP8Insts;
822   }
823 
hasFP8ConversionInsts()824   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
825 
hasPkFmacF16Inst()826   bool hasPkFmacF16Inst() const {
827     return HasPkFmacF16Inst;
828   }
829 
hasAtomicFMinFMaxF32GlobalInsts()830   bool hasAtomicFMinFMaxF32GlobalInsts() const {
831     return HasAtomicFMinFMaxF32GlobalInsts;
832   }
833 
hasAtomicFMinFMaxF64GlobalInsts()834   bool hasAtomicFMinFMaxF64GlobalInsts() const {
835     return HasAtomicFMinFMaxF64GlobalInsts;
836   }
837 
hasAtomicFMinFMaxF32FlatInsts()838   bool hasAtomicFMinFMaxF32FlatInsts() const {
839     return HasAtomicFMinFMaxF32FlatInsts;
840   }
841 
hasAtomicFMinFMaxF64FlatInsts()842   bool hasAtomicFMinFMaxF64FlatInsts() const {
843     return HasAtomicFMinFMaxF64FlatInsts;
844   }
845 
hasAtomicDsPkAdd16Insts()846   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
847 
hasAtomicFlatPkAdd16Insts()848   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
849 
hasAtomicFaddInsts()850   bool hasAtomicFaddInsts() const {
851     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
852   }
853 
hasAtomicFaddRtnInsts()854   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
855 
hasAtomicFaddNoRtnInsts()856   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
857 
hasAtomicBufferGlobalPkAddF16NoRtnInsts()858   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
859     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
860   }
861 
hasAtomicBufferGlobalPkAddF16Insts()862   bool hasAtomicBufferGlobalPkAddF16Insts() const {
863     return HasAtomicBufferGlobalPkAddF16Insts;
864   }
865 
hasAtomicGlobalPkAddBF16Inst()866   bool hasAtomicGlobalPkAddBF16Inst() const {
867     return HasAtomicGlobalPkAddBF16Inst;
868   }
869 
hasAtomicBufferPkAddBF16Inst()870   bool hasAtomicBufferPkAddBF16Inst() const {
871     return HasAtomicBufferPkAddBF16Inst;
872   }
873 
hasFlatAtomicFaddF32Inst()874   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
875 
876   /// \return true if the target has flat, global, and buffer atomic fadd for
877   /// double.
hasFlatBufferGlobalAtomicFaddF64Inst()878   bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
879     return HasFlatBufferGlobalAtomicFaddF64Inst;
880   }
881 
882   /// \return true if the target's flat, global, and buffer atomic fadd for
883   /// float supports denormal handling.
hasMemoryAtomicFaddF32DenormalSupport()884   bool hasMemoryAtomicFaddF32DenormalSupport() const {
885     return HasMemoryAtomicFaddF32DenormalSupport;
886   }
887 
888   /// \return true if atomic operations targeting fine-grained memory work
889   /// correctly at device scope, in allocations in host or peer PCIe device
890   /// memory.
supportsAgentScopeFineGrainedRemoteMemoryAtomics()891   bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
892     return HasAgentScopeFineGrainedRemoteMemoryAtomics;
893   }
894 
hasDefaultComponentZero()895   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
896 
hasDefaultComponentBroadcast()897   bool hasDefaultComponentBroadcast() const {
898     return HasDefaultComponentBroadcast;
899   }
900 
hasNoSdstCMPX()901   bool hasNoSdstCMPX() const {
902     return HasNoSdstCMPX;
903   }
904 
hasVscnt()905   bool hasVscnt() const {
906     return HasVscnt;
907   }
908 
hasGetWaveIdInst()909   bool hasGetWaveIdInst() const {
910     return HasGetWaveIdInst;
911   }
912 
hasSMemTimeInst()913   bool hasSMemTimeInst() const {
914     return HasSMemTimeInst;
915   }
916 
hasShaderCyclesRegister()917   bool hasShaderCyclesRegister() const {
918     return HasShaderCyclesRegister;
919   }
920 
hasShaderCyclesHiLoRegisters()921   bool hasShaderCyclesHiLoRegisters() const {
922     return HasShaderCyclesHiLoRegisters;
923   }
924 
hasVOP3Literal()925   bool hasVOP3Literal() const {
926     return HasVOP3Literal;
927   }
928 
hasNoDataDepHazard()929   bool hasNoDataDepHazard() const {
930     return HasNoDataDepHazard;
931   }
932 
vmemWriteNeedsExpWaitcnt()933   bool vmemWriteNeedsExpWaitcnt() const {
934     return getGeneration() < SEA_ISLANDS;
935   }
936 
hasInstPrefetch()937   bool hasInstPrefetch() const {
938     return getGeneration() == GFX10 || getGeneration() == GFX11;
939   }
940 
hasPrefetch()941   bool hasPrefetch() const { return GFX12Insts; }
942 
943   // Has s_cmpk_* instructions.
hasSCmpK()944   bool hasSCmpK() const { return getGeneration() < GFX12; }
945 
946   // Scratch is allocated in 256 dword per wave blocks for the entire
947   // wavefront. When viewed from the perspective of an arbitrary workitem, this
948   // is 4-byte aligned.
949   //
950   // Only 4-byte alignment is really needed to access anything. Transformations
951   // on the pointer value itself may rely on the alignment / known low bits of
952   // the pointer. Set this to something above the minimum to avoid needing
953   // dynamic realignment in common cases.
getStackAlignment()954   Align getStackAlignment() const { return Align(16); }
955 
enableMachineScheduler()956   bool enableMachineScheduler() const override {
957     return true;
958   }
959 
960   bool useAA() const override;
961 
enableSubRegLiveness()962   bool enableSubRegLiveness() const override {
963     return true;
964   }
965 
setScalarizeGlobalBehavior(bool b)966   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
getScalarizeGlobalBehavior()967   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
968 
969   // static wrappers
970   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
971 
972   // XXX - Why is this here if it isn't in the default pass set?
enableEarlyIfConversion()973   bool enableEarlyIfConversion() const override {
974     return true;
975   }
976 
977   void overrideSchedPolicy(MachineSchedPolicy &Policy,
978                            unsigned NumRegionInstrs) const override;
979 
980   void mirFileLoaded(MachineFunction &MF) const override;
981 
getMaxNumUserSGPRs()982   unsigned getMaxNumUserSGPRs() const {
983     return AMDGPU::getMaxNumUserSGPRs(*this);
984   }
985 
hasSMemRealTime()986   bool hasSMemRealTime() const {
987     return HasSMemRealTime;
988   }
989 
hasMovrel()990   bool hasMovrel() const {
991     return HasMovrel;
992   }
993 
hasVGPRIndexMode()994   bool hasVGPRIndexMode() const {
995     return HasVGPRIndexMode;
996   }
997 
998   bool useVGPRIndexMode() const;
999 
hasScalarCompareEq64()1000   bool hasScalarCompareEq64() const {
1001     return getGeneration() >= VOLCANIC_ISLANDS;
1002   }
1003 
hasScalarDwordx3Loads()1004   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1005 
hasScalarStores()1006   bool hasScalarStores() const {
1007     return HasScalarStores;
1008   }
1009 
hasScalarAtomics()1010   bool hasScalarAtomics() const {
1011     return HasScalarAtomics;
1012   }
1013 
hasLDSFPAtomicAddF32()1014   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
hasLDSFPAtomicAddF64()1015   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1016 
1017   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
hasPermLaneX16()1018   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1019 
1020   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
hasPermLane64()1021   bool hasPermLane64() const { return getGeneration() >= GFX11; }
1022 
hasDPP()1023   bool hasDPP() const {
1024     return HasDPP;
1025   }
1026 
hasDPPBroadcasts()1027   bool hasDPPBroadcasts() const {
1028     return HasDPP && getGeneration() < GFX10;
1029   }
1030 
hasDPPWavefrontShifts()1031   bool hasDPPWavefrontShifts() const {
1032     return HasDPP && getGeneration() < GFX10;
1033   }
1034 
hasDPP8()1035   bool hasDPP8() const {
1036     return HasDPP8;
1037   }
1038 
hasDPALU_DPP()1039   bool hasDPALU_DPP() const {
1040     return HasDPALU_DPP;
1041   }
1042 
hasDPPSrc1SGPR()1043   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1044 
hasPackedFP32Ops()1045   bool hasPackedFP32Ops() const {
1046     return HasPackedFP32Ops;
1047   }
1048 
1049   // Has V_PK_MOV_B32 opcode
hasPkMovB32()1050   bool hasPkMovB32() const {
1051     return GFX90AInsts;
1052   }
1053 
hasFmaakFmamkF32Insts()1054   bool hasFmaakFmamkF32Insts() const {
1055     return getGeneration() >= GFX10 || hasGFX940Insts();
1056   }
1057 
hasImageInsts()1058   bool hasImageInsts() const {
1059     return HasImageInsts;
1060   }
1061 
hasExtendedImageInsts()1062   bool hasExtendedImageInsts() const {
1063     return HasExtendedImageInsts;
1064   }
1065 
hasR128A16()1066   bool hasR128A16() const {
1067     return HasR128A16;
1068   }
1069 
hasA16()1070   bool hasA16() const { return HasA16; }
1071 
hasG16()1072   bool hasG16() const { return HasG16; }
1073 
hasOffset3fBug()1074   bool hasOffset3fBug() const {
1075     return HasOffset3fBug;
1076   }
1077 
hasImageStoreD16Bug()1078   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1079 
hasImageGather4D16Bug()1080   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1081 
hasMADIntraFwdBug()1082   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1083 
hasMSAALoadDstSelBug()1084   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1085 
hasPrivEnabledTrap2NopBug()1086   bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1087 
hasNSAEncoding()1088   bool hasNSAEncoding() const { return HasNSAEncoding; }
1089 
hasNonNSAEncoding()1090   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1091 
hasPartialNSAEncoding()1092   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1093 
1094   unsigned getNSAMaxSize(bool HasSampler = false) const {
1095     return AMDGPU::getNSAMaxSize(*this, HasSampler);
1096   }
1097 
hasGFX10_AEncoding()1098   bool hasGFX10_AEncoding() const {
1099     return GFX10_AEncoding;
1100   }
1101 
hasGFX10_BEncoding()1102   bool hasGFX10_BEncoding() const {
1103     return GFX10_BEncoding;
1104   }
1105 
hasGFX10_3Insts()1106   bool hasGFX10_3Insts() const {
1107     return GFX10_3Insts;
1108   }
1109 
1110   bool hasMadF16() const;
1111 
hasMovB64()1112   bool hasMovB64() const { return GFX940Insts; }
1113 
hasLshlAddB64()1114   bool hasLshlAddB64() const { return GFX940Insts; }
1115 
enableSIScheduler()1116   bool enableSIScheduler() const {
1117     return EnableSIScheduler;
1118   }
1119 
loadStoreOptEnabled()1120   bool loadStoreOptEnabled() const {
1121     return EnableLoadStoreOpt;
1122   }
1123 
hasSGPRInitBug()1124   bool hasSGPRInitBug() const {
1125     return SGPRInitBug;
1126   }
1127 
hasUserSGPRInit16Bug()1128   bool hasUserSGPRInit16Bug() const {
1129     return UserSGPRInit16Bug && isWave32();
1130   }
1131 
hasNegativeScratchOffsetBug()1132   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1133 
hasNegativeUnalignedScratchOffsetBug()1134   bool hasNegativeUnalignedScratchOffsetBug() const {
1135     return NegativeUnalignedScratchOffsetBug;
1136   }
1137 
hasMFMAInlineLiteralBug()1138   bool hasMFMAInlineLiteralBug() const {
1139     return HasMFMAInlineLiteralBug;
1140   }
1141 
has12DWordStoreHazard()1142   bool has12DWordStoreHazard() const {
1143     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1144   }
1145 
1146   // \returns true if the subtarget supports DWORDX3 load/store instructions.
hasDwordx3LoadStores()1147   bool hasDwordx3LoadStores() const {
1148     return CIInsts;
1149   }
1150 
hasReadM0MovRelInterpHazard()1151   bool hasReadM0MovRelInterpHazard() const {
1152     return getGeneration() == AMDGPUSubtarget::GFX9;
1153   }
1154 
hasReadM0SendMsgHazard()1155   bool hasReadM0SendMsgHazard() const {
1156     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1157            getGeneration() <= AMDGPUSubtarget::GFX9;
1158   }
1159 
hasReadM0LdsDmaHazard()1160   bool hasReadM0LdsDmaHazard() const {
1161     return getGeneration() == AMDGPUSubtarget::GFX9;
1162   }
1163 
hasReadM0LdsDirectHazard()1164   bool hasReadM0LdsDirectHazard() const {
1165     return getGeneration() == AMDGPUSubtarget::GFX9;
1166   }
1167 
hasVcmpxPermlaneHazard()1168   bool hasVcmpxPermlaneHazard() const {
1169     return HasVcmpxPermlaneHazard;
1170   }
1171 
hasVMEMtoScalarWriteHazard()1172   bool hasVMEMtoScalarWriteHazard() const {
1173     return HasVMEMtoScalarWriteHazard;
1174   }
1175 
hasSMEMtoVectorWriteHazard()1176   bool hasSMEMtoVectorWriteHazard() const {
1177     return HasSMEMtoVectorWriteHazard;
1178   }
1179 
hasLDSMisalignedBug()1180   bool hasLDSMisalignedBug() const {
1181     return LDSMisalignedBug && !EnableCuMode;
1182   }
1183 
hasInstFwdPrefetchBug()1184   bool hasInstFwdPrefetchBug() const {
1185     return HasInstFwdPrefetchBug;
1186   }
1187 
hasVcmpxExecWARHazard()1188   bool hasVcmpxExecWARHazard() const {
1189     return HasVcmpxExecWARHazard;
1190   }
1191 
hasLdsBranchVmemWARHazard()1192   bool hasLdsBranchVmemWARHazard() const {
1193     return HasLdsBranchVmemWARHazard;
1194   }
1195 
1196   // Shift amount of a 64 bit shift cannot be a highest allocated register
1197   // if also at the end of the allocation block.
hasShift64HighRegBug()1198   bool hasShift64HighRegBug() const {
1199     return GFX90AInsts && !GFX940Insts;
1200   }
1201 
1202   // Has one cycle hazard on transcendental instruction feeding a
1203   // non transcendental VALU.
hasTransForwardingHazard()1204   bool hasTransForwardingHazard() const { return GFX940Insts; }
1205 
1206   // Has one cycle hazard on a VALU instruction partially writing dst with
1207   // a shift of result bits feeding another VALU instruction.
hasDstSelForwardingHazard()1208   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1209 
1210   // Cannot use op_sel with v_dot instructions.
hasDOTOpSelHazard()1211   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1212 
1213   // Does not have HW interlocs for VALU writing and then reading SGPRs.
hasVDecCoExecHazard()1214   bool hasVDecCoExecHazard() const {
1215     return GFX940Insts;
1216   }
1217 
hasNSAtoVMEMBug()1218   bool hasNSAtoVMEMBug() const {
1219     return HasNSAtoVMEMBug;
1220   }
1221 
hasNSAClauseBug()1222   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1223 
hasHardClauses()1224   bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1225 
hasGFX90AInsts()1226   bool hasGFX90AInsts() const { return GFX90AInsts; }
1227 
hasFPAtomicToDenormModeHazard()1228   bool hasFPAtomicToDenormModeHazard() const {
1229     return getGeneration() == GFX10;
1230   }
1231 
hasVOP3DPP()1232   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1233 
hasLdsDirect()1234   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1235 
hasLdsWaitVMSRC()1236   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1237 
hasVALUPartialForwardingHazard()1238   bool hasVALUPartialForwardingHazard() const {
1239     return getGeneration() == GFX11;
1240   }
1241 
hasVALUTransUseHazard()1242   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1243 
hasForceStoreSC0SC1()1244   bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1245 
requiresCodeObjectV6()1246   bool requiresCodeObjectV6() const { return RequiresCOV6; }
1247 
hasVALUMaskWriteHazard()1248   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1249 
1250   /// Return if operations acting on VGPR tuples require even alignment.
needsAlignedVGPRs()1251   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1252 
1253   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
hasSPackHL()1254   bool hasSPackHL() const { return GFX11Insts; }
1255 
1256   /// Return true if the target's EXP instruction has the COMPR flag, which
1257   /// affects the meaning of the EN (enable) bits.
hasCompressedExport()1258   bool hasCompressedExport() const { return !GFX11Insts; }
1259 
1260   /// Return true if the target's EXP instruction supports the NULL export
1261   /// target.
hasNullExportTarget()1262   bool hasNullExportTarget() const { return !GFX11Insts; }
1263 
has1_5xVGPRs()1264   bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1265 
hasVOPDInsts()1266   bool hasVOPDInsts() const { return HasVOPDInsts; }
1267 
hasFlatScratchSVSSwizzleBug()1268   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1269 
1270   /// Return true if the target has the S_DELAY_ALU instruction.
hasDelayAlu()1271   bool hasDelayAlu() const { return GFX11Insts; }
1272 
hasPackedTID()1273   bool hasPackedTID() const { return HasPackedTID; }
1274 
1275   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1276   // hasGFX90AInsts is also true.
hasGFX940Insts()1277   bool hasGFX940Insts() const { return GFX940Insts; }
1278 
hasSALUFloatInsts()1279   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1280 
hasVGPRSingleUseHintInsts()1281   bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1282 
hasPseudoScalarTrans()1283   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1284 
hasRestrictedSOffset()1285   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1286 
hasRequiredExportPriority()1287   bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1288 
hasVmemWriteVgprInOrder()1289   bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1290 
1291   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1292   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
hasExtendedWaitCounts()1293   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1294 
1295   /// \returns true if inline constants are not supported for F16 pseudo
1296   /// scalar transcendentals.
hasNoF16PseudoScalarTransInlineConstants()1297   bool hasNoF16PseudoScalarTransInlineConstants() const {
1298     return getGeneration() == GFX12;
1299   }
1300 
1301   /// \returns The maximum number of instructions that can be enclosed in an
1302   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1303   /// instruction.
maxHardClauseLength()1304   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1305 
  /// \returns the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

  /// \returns the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;

  /// \returns the occupancy for the given function \p F. The used LDS size
  /// (\p LDSSize) and the register counts (\p NumSGPRs, \p NumVGPRs) are taken
  /// into account if provided; each defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1320 
1321   /// \returns true if the flat_scratch register should be initialized with the
1322   /// pointer to the wave's scratch memory rather than a size and offset.
flatScratchIsPointer()1323   bool flatScratchIsPointer() const {
1324     return getGeneration() >= AMDGPUSubtarget::GFX9;
1325   }
1326 
1327   /// \returns true if the flat_scratch register is initialized by the HW.
1328   /// In this case it is readonly.
flatScratchIsArchitected()1329   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1330 
1331   /// \returns true if the architected SGPRs are enabled.
hasArchitectedSGPRs()1332   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1333 
1334   /// \returns true if Global Data Share is supported.
hasGDS()1335   bool hasGDS() const { return HasGDS; }
1336 
1337   /// \returns true if Global Wave Sync is supported.
hasGWS()1338   bool hasGWS() const { return HasGWS; }
1339 
1340   /// \returns true if the machine has merged shaders in which s0-s7 are
1341   /// reserved by the hardware and user SGPRs start at s8
hasMergedShaders()1342   bool hasMergedShaders() const {
1343     return getGeneration() >= GFX9;
1344   }
1345 
1346   // \returns true if the target supports the pre-NGG legacy geometry path.
hasLegacyGeometry()1347   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1348 
1349   // \returns true if preloading kernel arguments is supported.
hasKernargPreload()1350   bool hasKernargPreload() const { return KernargPreload; }
1351 
1352   // \returns true if the target has split barriers feature
hasSplitBarriers()1353   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1354 
1355   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
hasCvtFP8VOP1Bug()1356   bool hasCvtFP8VOP1Bug() const { return true; }
1357 
1358   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1359   // no-return form.
hasAtomicCSubNoRtnInsts()1360   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1361 
1362   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
hasDX10ClampMode()1363   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1364 
1365   // \returns true if the target has IEEE kernel descriptor mode bit
hasIEEEMode()1366   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1367 
1368   // \returns true if the target has IEEE fminimum/fmaximum instructions
hasIEEEMinMax()1369   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1370 
1371   // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
hasIEEEMinMax3()1372   bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1373 
1374   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
hasRrWGMode()1375   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1376 
1377   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1378   /// values.
hasSignedScratchOffsets()1379   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1380 
1381   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1382   // of sign-extending.
hasGetPCZeroExtension()1383   bool hasGetPCZeroExtension() const { return GFX12Insts; }
1384 
1385   /// \returns SGPR allocation granularity supported by the subtarget.
getSGPRAllocGranule()1386   unsigned getSGPRAllocGranule() const {
1387     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1388   }
1389 
1390   /// \returns SGPR encoding granularity supported by the subtarget.
getSGPREncodingGranule()1391   unsigned getSGPREncodingGranule() const {
1392     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1393   }
1394 
1395   /// \returns Total number of SGPRs supported by the subtarget.
getTotalNumSGPRs()1396   unsigned getTotalNumSGPRs() const {
1397     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1398   }
1399 
1400   /// \returns Addressable number of SGPRs supported by the subtarget.
getAddressableNumSGPRs()1401   unsigned getAddressableNumSGPRs() const {
1402     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1403   }
1404 
1405   /// \returns Minimum number of SGPRs that meets the given number of waves per
1406   /// execution unit requirement supported by the subtarget.
getMinNumSGPRs(unsigned WavesPerEU)1407   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1408     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1409   }
1410 
1411   /// \returns Maximum number of SGPRs that meets the given number of waves per
1412   /// execution unit requirement supported by the subtarget.
getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1413   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1414     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1415   }
1416 
  /// \returns Reserved number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;

  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

  /// \returns Reserved number of SGPRs for given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;

  /// \returns max num SGPRs. This is the common utility function called by
  /// the MachineFunction and Function variants of getMaxNumSGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// Falls back to the value that meets number of waves per execution unit
  /// requirement if explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// Falls back to the value that meets number of waves per execution unit
  /// requirement if explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1454 
1455   /// \returns VGPR allocation granularity supported by the subtarget.
getVGPRAllocGranule()1456   unsigned getVGPRAllocGranule() const {
1457     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1458   }
1459 
1460   /// \returns VGPR encoding granularity supported by the subtarget.
getVGPREncodingGranule()1461   unsigned getVGPREncodingGranule() const {
1462     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1463   }
1464 
1465   /// \returns Total number of VGPRs supported by the subtarget.
getTotalNumVGPRs()1466   unsigned getTotalNumVGPRs() const {
1467     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1468   }
1469 
1470   /// \returns Addressable number of architectural VGPRs supported by the
1471   /// subtarget.
getAddressableNumArchVGPRs()1472   unsigned getAddressableNumArchVGPRs() const {
1473     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1474   }
1475 
1476   /// \returns Addressable number of VGPRs supported by the subtarget.
getAddressableNumVGPRs()1477   unsigned getAddressableNumVGPRs() const {
1478     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1479   }
1480 
1481   /// \returns the minimum number of VGPRs that will prevent achieving more than
1482   /// the specified number of waves \p WavesPerEU.
getMinNumVGPRs(unsigned WavesPerEU)1483   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1484     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1485   }
1486 
1487   /// \returns the maximum number of VGPRs that can be used and still achieved
1488   /// at least the specified number of waves \p WavesPerEU.
getMaxNumVGPRs(unsigned WavesPerEU)1489   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1490     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1491   }
1492 
1493   /// \returns max num VGPRs. This is the common utility function
1494   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1495   unsigned getBaseMaxNumVGPRs(const Function &F,
1496                               std::pair<unsigned, unsigned> WavesPerEU) const;
1497   /// \returns Maximum number of VGPRs that meets number of waves per execution
1498   /// unit requirement for function \p F, or number of VGPRs explicitly
1499   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1500   ///
1501   /// \returns Value that meets number of waves per execution unit requirement
1502   /// if explicitly requested value cannot be converted to integer, violates
1503   /// subtarget's specifications, or does not meet number of waves per execution
1504   /// unit requirement.
1505   unsigned getMaxNumVGPRs(const Function &F) const;
1506 
getMaxNumAGPRs(const Function & F)1507   unsigned getMaxNumAGPRs(const Function &F) const {
1508     return getMaxNumVGPRs(F);
1509   }
1510 
1511   /// \returns Maximum number of VGPRs that meets number of waves per execution
1512   /// unit requirement for function \p MF, or number of VGPRs explicitly
1513   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1514   ///
1515   /// \returns Value that meets number of waves per execution unit requirement
1516   /// if explicitly requested value cannot be converted to integer, violates
1517   /// subtarget's specifications, or does not meet number of waves per execution
1518   /// unit requirement.
1519   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1520 
1521   void getPostRAMutations(
1522       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1523       const override;
1524 
1525   std::unique_ptr<ScheduleDAGMutation>
1526   createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1527 
isWave32()1528   bool isWave32() const {
1529     return getWavefrontSize() == 32;
1530   }
1531 
isWave64()1532   bool isWave64() const {
1533     return getWavefrontSize() == 64;
1534   }
1535 
getBoolRC()1536   const TargetRegisterClass *getBoolRC() const {
1537     return getRegisterInfo()->getBoolRC();
1538   }
1539 
1540   /// \returns Maximum number of work groups per compute unit supported by the
1541   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1542   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1543     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1544   }
1545 
1546   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()1547   unsigned getMinFlatWorkGroupSize() const override {
1548     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1549   }
1550 
1551   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()1552   unsigned getMaxFlatWorkGroupSize() const override {
1553     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1554   }
1555 
1556   /// \returns Number of waves per execution unit required to support the given
1557   /// \p FlatWorkGroupSize.
1558   unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1559   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1560     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1561   }
1562 
1563   /// \returns Minimum number of waves per execution unit supported by the
1564   /// subtarget.
getMinWavesPerEU()1565   unsigned getMinWavesPerEU() const override {
1566     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1567   }
1568 
1569   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1570                              SDep &Dep,
1571                              const TargetSchedModel *SchedModel) const override;
1572 
1573   // \returns true if it's beneficial on this subtarget for the scheduler to
1574   // cluster stores as well as loads.
shouldClusterStores()1575   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1576 
1577   // \returns the number of address arguments from which to enable MIMG NSA
1578   // on supported architectures.
1579   unsigned getNSAThreshold(const MachineFunction &MF) const;
1580 
1581   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1582   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
requiresNopBeforeDeallocVGPRs()1583   bool requiresNopBeforeDeallocVGPRs() const {
1584     // Currently all targets that support the dealloc VGPRs message also require
1585     // the nop.
1586     return true;
1587   }
1588 };
1589 
1590 class GCNUserSGPRUsageInfo {
1591 public:
hasImplicitBufferPtr()1592   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1593 
hasPrivateSegmentBuffer()1594   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1595 
hasDispatchPtr()1596   bool hasDispatchPtr() const { return DispatchPtr; }
1597 
hasQueuePtr()1598   bool hasQueuePtr() const { return QueuePtr; }
1599 
hasKernargSegmentPtr()1600   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1601 
hasDispatchID()1602   bool hasDispatchID() const { return DispatchID; }
1603 
hasFlatScratchInit()1604   bool hasFlatScratchInit() const { return FlatScratchInit; }
1605 
hasPrivateSegmentSize()1606   bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1607 
getNumKernargPreloadSGPRs()1608   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1609 
getNumUsedUserSGPRs()1610   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1611 
1612   unsigned getNumFreeUserSGPRs();
1613 
1614   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1615 
1616   enum UserSGPRID : unsigned {
1617     ImplicitBufferPtrID = 0,
1618     PrivateSegmentBufferID = 1,
1619     DispatchPtrID = 2,
1620     QueuePtrID = 3,
1621     KernargSegmentPtrID = 4,
1622     DispatchIdID = 5,
1623     FlatScratchInitID = 6,
1624     PrivateSegmentSizeID = 7
1625   };
1626 
1627   // Returns the size in number of SGPRs for preload user SGPR field.
getNumUserSGPRForField(UserSGPRID ID)1628   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1629     switch (ID) {
1630     case ImplicitBufferPtrID:
1631       return 2;
1632     case PrivateSegmentBufferID:
1633       return 4;
1634     case DispatchPtrID:
1635       return 2;
1636     case QueuePtrID:
1637       return 2;
1638     case KernargSegmentPtrID:
1639       return 2;
1640     case DispatchIdID:
1641       return 2;
1642     case FlatScratchInitID:
1643       return 2;
1644     case PrivateSegmentSizeID:
1645       return 1;
1646     }
1647     llvm_unreachable("Unknown UserSGPRID.");
1648   }
1649 
1650   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1651 
1652 private:
1653   const GCNSubtarget &ST;
1654 
1655   // Private memory buffer
1656   // Compute directly in sgpr[0:1]
1657   // Other shaders indirect 64-bits at sgpr[0:1]
1658   bool ImplicitBufferPtr = false;
1659 
1660   bool PrivateSegmentBuffer = false;
1661 
1662   bool DispatchPtr = false;
1663 
1664   bool QueuePtr = false;
1665 
1666   bool KernargSegmentPtr = false;
1667 
1668   bool DispatchID = false;
1669 
1670   bool FlatScratchInit = false;
1671 
1672   bool PrivateSegmentSize = false;
1673 
1674   unsigned NumKernargPreloadSGPRs = 0;
1675 
1676   unsigned NumUsedUserSGPRs = 0;
1677 };
1678 
1679 } // end namespace llvm
1680 
1681 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1682