xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define GET_SUBTARGETINFO_HEADER
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
30 namespace llvm {
31 
32 class GCNTargetMachine;
33 
34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35                            public AMDGPUSubtarget {
36 public:
37   using AMDGPUSubtarget::getMaxWavesPerEU;
38 
39   // Following 2 enums are documented at:
40   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
41   enum class TrapHandlerAbi {
42     NONE   = 0x00,
43     AMDHSA = 0x01,
44   };
45 
46   enum class TrapID {
47     LLVMAMDHSATrap      = 0x02,
48     LLVMAMDHSADebugTrap = 0x03,
49   };
50 
51 private:
52   /// GlobalISel related APIs.
53   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55   std::unique_ptr<InstructionSelector> InstSelector;
56   std::unique_ptr<LegalizerInfo> Legalizer;
57   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58 
59 protected:
60   // Basic subtarget description.
61   Triple TargetTriple;
62   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63   unsigned Gen = INVALID;
64   InstrItineraryData InstrItins;
65   int LDSBankCount = 0;
66   unsigned MaxPrivateElementSize = 0;
67 
68   // Possibly statically set by tablegen, but may want to be overridden.
69   bool FastDenormalF32 = false;
70   bool HalfRate64Ops = false;
71   bool FullRate64Ops = false;
72 
73   // Dynamically set bits that enable features.
74   bool FlatForGlobal = false;
75   bool AutoWaitcntBeforeBarrier = false;
76   bool BackOffBarrier = false;
77   bool UnalignedScratchAccess = false;
78   bool UnalignedAccessMode = false;
79   bool HasApertureRegs = false;
80   bool SupportsXNACK = false;
81   bool KernargPreload = false;
82 
83   // This should not be used directly. 'TargetID' tracks the dynamic settings
84   // for XNACK.
85   bool EnableXNACK = false;
86 
87   bool EnableTgSplit = false;
88   bool EnableCuMode = false;
89   bool TrapHandler = false;
90   bool EnablePreciseMemory = false;
91 
92   // Used as options.
93   bool EnableLoadStoreOpt = false;
94   bool EnableUnsafeDSOffsetFolding = false;
95   bool EnableSIScheduler = false;
96   bool EnableDS128 = false;
97   bool EnablePRTStrictNull = false;
98   bool DumpCode = false;
99 
  // Subtarget properties statically set by tablegen.
101   bool FP64 = false;
102   bool FMA = false;
103   bool MIMG_R128 = false;
104   bool CIInsts = false;
105   bool GFX8Insts = false;
106   bool GFX9Insts = false;
107   bool GFX90AInsts = false;
108   bool GFX940Insts = false;
109   bool GFX10Insts = false;
110   bool GFX11Insts = false;
111   bool GFX12Insts = false;
112   bool GFX10_3Insts = false;
113   bool GFX7GFX8GFX9Insts = false;
114   bool SGPRInitBug = false;
115   bool UserSGPRInit16Bug = false;
116   bool NegativeScratchOffsetBug = false;
117   bool NegativeUnalignedScratchOffsetBug = false;
118   bool HasSMemRealTime = false;
119   bool HasIntClamp = false;
120   bool HasFmaMixInsts = false;
121   bool HasMovrel = false;
122   bool HasVGPRIndexMode = false;
123   bool HasScalarDwordx3Loads = false;
124   bool HasScalarStores = false;
125   bool HasScalarAtomics = false;
126   bool HasSDWAOmod = false;
127   bool HasSDWAScalar = false;
128   bool HasSDWASdst = false;
129   bool HasSDWAMac = false;
130   bool HasSDWAOutModsVOPC = false;
131   bool HasDPP = false;
132   bool HasDPP8 = false;
133   bool HasDPALU_DPP = false;
134   bool HasDPPSrc1SGPR = false;
135   bool HasPackedFP32Ops = false;
136   bool HasImageInsts = false;
137   bool HasExtendedImageInsts = false;
138   bool HasR128A16 = false;
139   bool HasA16 = false;
140   bool HasG16 = false;
141   bool HasNSAEncoding = false;
142   bool HasPartialNSAEncoding = false;
143   bool GFX10_AEncoding = false;
144   bool GFX10_BEncoding = false;
145   bool HasDLInsts = false;
146   bool HasFmacF64Inst = false;
147   bool HasDot1Insts = false;
148   bool HasDot2Insts = false;
149   bool HasDot3Insts = false;
150   bool HasDot4Insts = false;
151   bool HasDot5Insts = false;
152   bool HasDot6Insts = false;
153   bool HasDot7Insts = false;
154   bool HasDot8Insts = false;
155   bool HasDot9Insts = false;
156   bool HasDot10Insts = false;
157   bool HasDot11Insts = false;
158   bool HasMAIInsts = false;
159   bool HasFP8Insts = false;
160   bool HasFP8ConversionInsts = false;
161   bool HasPkFmacF16Inst = false;
162   bool HasAtomicFMinFMaxF32GlobalInsts = false;
163   bool HasAtomicFMinFMaxF64GlobalInsts = false;
164   bool HasAtomicFMinFMaxF32FlatInsts = false;
165   bool HasAtomicFMinFMaxF64FlatInsts = false;
166   bool HasAtomicDsPkAdd16Insts = false;
167   bool HasAtomicFlatPkAdd16Insts = false;
168   bool HasAtomicFaddRtnInsts = false;
169   bool HasAtomicFaddNoRtnInsts = false;
170   bool HasMemoryAtomicFaddF32DenormalSupport = false;
171   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
172   bool HasAtomicBufferGlobalPkAddF16Insts = false;
173   bool HasAtomicCSubNoRtnInsts = false;
174   bool HasAtomicGlobalPkAddBF16Inst = false;
175   bool HasAtomicBufferPkAddBF16Inst = false;
176   bool HasFlatAtomicFaddF32Inst = false;
177   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
178   bool HasDefaultComponentZero = false;
179   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
180   bool HasDefaultComponentBroadcast = false;
181   /// The maximum number of instructions that may be placed within an S_CLAUSE,
182   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
183   /// indicates a lack of S_CLAUSE support.
184   unsigned MaxHardClauseLength = 0;
185   bool SupportsSRAMECC = false;
186 
187   // This should not be used directly. 'TargetID' tracks the dynamic settings
188   // for SRAMECC.
189   bool EnableSRAMECC = false;
190 
191   bool HasNoSdstCMPX = false;
192   bool HasVscnt = false;
193   bool HasGetWaveIdInst = false;
194   bool HasSMemTimeInst = false;
195   bool HasShaderCyclesRegister = false;
196   bool HasShaderCyclesHiLoRegisters = false;
197   bool HasVOP3Literal = false;
198   bool HasNoDataDepHazard = false;
199   bool FlatAddressSpace = false;
200   bool FlatInstOffsets = false;
201   bool FlatGlobalInsts = false;
202   bool FlatScratchInsts = false;
203   bool ScalarFlatScratchInsts = false;
204   bool HasArchitectedFlatScratch = false;
205   bool EnableFlatScratch = false;
206   bool HasArchitectedSGPRs = false;
207   bool HasGDS = false;
208   bool HasGWS = false;
209   bool AddNoCarryInsts = false;
210   bool HasUnpackedD16VMem = false;
211   bool LDSMisalignedBug = false;
212   bool HasMFMAInlineLiteralBug = false;
213   bool UnalignedBufferAccess = false;
214   bool UnalignedDSAccess = false;
215   bool HasPackedTID = false;
216   bool ScalarizeGlobal = false;
217   bool HasSALUFloatInsts = false;
218   bool HasVGPRSingleUseHintInsts = false;
219   bool HasPseudoScalarTrans = false;
220   bool HasRestrictedSOffset = false;
221 
222   bool HasVcmpxPermlaneHazard = false;
223   bool HasVMEMtoScalarWriteHazard = false;
224   bool HasSMEMtoVectorWriteHazard = false;
225   bool HasInstFwdPrefetchBug = false;
226   bool HasVcmpxExecWARHazard = false;
227   bool HasLdsBranchVmemWARHazard = false;
228   bool HasNSAtoVMEMBug = false;
229   bool HasNSAClauseBug = false;
230   bool HasOffset3fBug = false;
231   bool HasFlatSegmentOffsetBug = false;
232   bool HasImageStoreD16Bug = false;
233   bool HasImageGather4D16Bug = false;
234   bool HasMSAALoadDstSelBug = false;
235   bool HasPrivEnabledTrap2NopBug = false;
236   bool Has1_5xVGPRs = false;
237   bool HasMADIntraFwdBug = false;
238   bool HasVOPDInsts = false;
239   bool HasVALUTransUseHazard = false;
240   bool HasForceStoreSC0SC1 = false;
241   bool HasRequiredExportPriority = false;
242   bool HasVmemWriteVgprInOrder = false;
243 
244   bool RequiresCOV6 = false;
245 
246   // Dummy feature to use for assembler in tablegen.
247   bool FeatureDisable = false;
248 
249   SelectionDAGTargetInfo TSInfo;
250 private:
251   SIInstrInfo InstrInfo;
252   SITargetLowering TLInfo;
253   SIFrameLowering FrameLowering;
254 
255 public:
256   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
257                const GCNTargetMachine &TM);
258   ~GCNSubtarget() override;
259 
260   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
261                                                    StringRef GPU, StringRef FS);
262 
263   /// Diagnose inconsistent subtarget features before attempting to codegen
264   /// function \p F.
265   void checkSubtargetFeatures(const Function &F) const;
266 
267   const SIInstrInfo *getInstrInfo() const override {
268     return &InstrInfo;
269   }
270 
  /// \returns the address of this subtarget's SIFrameLowering member.
  const SIFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }
274 
275   const SITargetLowering *getTargetLowering() const override {
276     return &TLInfo;
277   }
278 
279   const SIRegisterInfo *getRegisterInfo() const override {
280     return &InstrInfo.getRegisterInfo();
281   }
282 
  /// GlobalISel accessor: \returns the raw pointer held by CallLoweringInfo.
  const CallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
286 
  /// GlobalISel accessor: \returns the raw pointer held by
  /// InlineAsmLoweringInfo.
  const InlineAsmLowering *getInlineAsmLowering() const override {
    return InlineAsmLoweringInfo.get();
  }
290 
  /// GlobalISel accessor: \returns the raw pointer held by InstSelector.
  InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
294 
  /// GlobalISel accessor: \returns the raw pointer held by Legalizer.
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
298 
  /// GlobalISel accessor: \returns the raw pointer held by RegBankInfo.
  const AMDGPURegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
302 
  /// \returns a const reference to the TargetID member, which (per the member
  /// comments in this class) tracks the dynamic XNACK and SRAMECC settings.
  const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
    return TargetID;
  }
306 
  // Nothing implemented, just prevent crashes on use.
  /// \returns the address of the TSInfo member.
  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
    return &TSInfo;
  }
311 
  /// \returns the address of the InstrItins member.
  const InstrItineraryData *getInstrItineraryData() const override {
    return &InstrItins;
  }
315 
316   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
317 
318   Generation getGeneration() const {
319     return (Generation)Gen;
320   }
321 
322   unsigned getMaxWaveScratchSize() const {
323     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
324     if (getGeneration() >= GFX12) {
325       // 18-bit field in units of 64-dword.
326       return (64 * 4) * ((1 << 18) - 1);
327     }
328     if (getGeneration() == GFX11) {
329       // 15-bit field in units of 64-dword.
330       return (64 * 4) * ((1 << 15) - 1);
331     }
332     // 13-bit field in units of 256-dword.
333     return (256 * 4) * ((1 << 13) - 1);
334   }
335 
336   /// Return the number of high bits known to be zero for a frame index.
337   unsigned getKnownHighZeroBitsForFrameIndex() const {
338     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
339   }
340 
341   int getLDSBankCount() const {
342     return LDSBankCount;
343   }
344 
345   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
346     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
347   }
348 
349   unsigned getConstantBusLimit(unsigned Opcode) const;
350 
351   /// Returns if the result of this instruction with a 16-bit result returned in
352   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
353   /// the original value.
354   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
355 
356   bool supportsWGP() const { return getGeneration() >= GFX10; }
357 
358   bool hasIntClamp() const {
359     return HasIntClamp;
360   }
361 
362   bool hasFP64() const {
363     return FP64;
364   }
365 
366   bool hasMIMG_R128() const {
367     return MIMG_R128;
368   }
369 
370   bool hasHWFP64() const {
371     return FP64;
372   }
373 
374   bool hasHalfRate64Ops() const {
375     return HalfRate64Ops;
376   }
377 
378   bool hasFullRate64Ops() const {
379     return FullRate64Ops;
380   }
381 
382   bool hasAddr64() const {
383     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
384   }
385 
386   bool hasFlat() const {
387     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
388   }
389 
390   // Return true if the target only has the reverse operand versions of VALU
391   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
392   bool hasOnlyRevVALUShifts() const {
393     return getGeneration() >= VOLCANIC_ISLANDS;
394   }
395 
396   bool hasFractBug() const {
397     return getGeneration() == SOUTHERN_ISLANDS;
398   }
399 
400   bool hasBFE() const {
401     return true;
402   }
403 
404   bool hasBFI() const {
405     return true;
406   }
407 
408   bool hasBFM() const {
409     return hasBFE();
410   }
411 
412   bool hasBCNT(unsigned Size) const {
413     return true;
414   }
415 
416   bool hasFFBL() const {
417     return true;
418   }
419 
420   bool hasFFBH() const {
421     return true;
422   }
423 
424   bool hasMed3_16() const {
425     return getGeneration() >= AMDGPUSubtarget::GFX9;
426   }
427 
428   bool hasMin3Max3_16() const {
429     return getGeneration() >= AMDGPUSubtarget::GFX9;
430   }
431 
432   bool hasFmaMixInsts() const {
433     return HasFmaMixInsts;
434   }
435 
436   bool hasCARRY() const {
437     return true;
438   }
439 
440   bool hasFMA() const {
441     return FMA;
442   }
443 
444   bool hasSwap() const {
445     return GFX9Insts;
446   }
447 
448   bool hasScalarPackInsts() const {
449     return GFX9Insts;
450   }
451 
452   bool hasScalarMulHiInsts() const {
453     return GFX9Insts;
454   }
455 
456   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
457 
458   TrapHandlerAbi getTrapHandlerAbi() const {
459     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
460   }
461 
462   bool supportsGetDoorbellID() const {
463     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
464     return getGeneration() >= GFX9;
465   }
466 
467   /// True if the offset field of DS instructions works as expected. On SI, the
468   /// offset uses a 16-bit adder and does not always wrap properly.
469   bool hasUsableDSOffset() const {
470     return getGeneration() >= SEA_ISLANDS;
471   }
472 
  /// \returns the EnableUnsafeDSOffsetFolding option flag.
  bool unsafeDSOffsetFoldingEnabled() const {
    return EnableUnsafeDSOffsetFolding;
  }
476 
477   /// Condition output from div_scale is usable.
478   bool hasUsableDivScaleConditionOutput() const {
479     return getGeneration() != SOUTHERN_ISLANDS;
480   }
481 
482   /// Extra wait hazard is needed in some cases before
483   /// s_cbranch_vccnz/s_cbranch_vccz.
484   bool hasReadVCCZBug() const {
485     return getGeneration() <= SEA_ISLANDS;
486   }
487 
488   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
489   bool partialVCCWritesUpdateVCCZ() const {
490     return getGeneration() >= GFX10;
491   }
492 
493   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
494   /// was written by a VALU instruction.
495   bool hasSMRDReadVALUDefHazard() const {
496     return getGeneration() == SOUTHERN_ISLANDS;
497   }
498 
499   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
500   /// SGPR was written by a VALU Instruction.
501   bool hasVMEMReadSGPRVALUDefHazard() const {
502     return getGeneration() >= VOLCANIC_ISLANDS;
503   }
504 
505   bool hasRFEHazards() const {
506     return getGeneration() >= VOLCANIC_ISLANDS;
507   }
508 
509   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
510   unsigned getSetRegWaitStates() const {
511     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
512   }
513 
514   bool dumpCode() const {
515     return DumpCode;
516   }
517 
518   /// Return the amount of LDS that can be used that will not restrict the
519   /// occupancy lower than WaveCount.
520   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
521                                            const Function &) const;
522 
523   bool supportsMinMaxDenormModes() const {
524     return getGeneration() >= AMDGPUSubtarget::GFX9;
525   }
526 
527   /// \returns If target supports S_DENORM_MODE.
528   bool hasDenormModeInst() const {
529     return getGeneration() >= AMDGPUSubtarget::GFX10;
530   }
531 
532   bool useFlatForGlobal() const {
533     return FlatForGlobal;
534   }
535 
536   /// \returns If target supports ds_read/write_b128 and user enables generation
537   /// of ds_read/write_b128.
538   bool useDS128() const {
539     return CIInsts && EnableDS128;
540   }
541 
542   /// \return If target supports ds_read/write_b96/128.
543   bool hasDS96AndDS128() const {
544     return CIInsts;
545   }
546 
547   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
548   bool haveRoundOpsF64() const {
549     return CIInsts;
550   }
551 
552   /// \returns If MUBUF instructions always perform range checking, even for
553   /// buffer resources used for private memory access.
554   bool privateMemoryResourceIsRangeChecked() const {
555     return getGeneration() < AMDGPUSubtarget::GFX9;
556   }
557 
558   /// \returns If target requires PRT Struct NULL support (zero result registers
559   /// for sparse texture support).
560   bool usePRTStrictNull() const {
561     return EnablePRTStrictNull;
562   }
563 
564   bool hasAutoWaitcntBeforeBarrier() const {
565     return AutoWaitcntBeforeBarrier;
566   }
567 
568   /// \returns true if the target supports backing off of s_barrier instructions
569   /// when an exception is raised.
570   bool supportsBackOffBarrier() const {
571     return BackOffBarrier;
572   }
573 
574   bool hasUnalignedBufferAccess() const {
575     return UnalignedBufferAccess;
576   }
577 
578   bool hasUnalignedBufferAccessEnabled() const {
579     return UnalignedBufferAccess && UnalignedAccessMode;
580   }
581 
582   bool hasUnalignedDSAccess() const {
583     return UnalignedDSAccess;
584   }
585 
586   bool hasUnalignedDSAccessEnabled() const {
587     return UnalignedDSAccess && UnalignedAccessMode;
588   }
589 
590   bool hasUnalignedScratchAccess() const {
591     return UnalignedScratchAccess;
592   }
593 
594   bool hasUnalignedAccessMode() const {
595     return UnalignedAccessMode;
596   }
597 
598   bool hasApertureRegs() const {
599     return HasApertureRegs;
600   }
601 
602   bool isTrapHandlerEnabled() const {
603     return TrapHandler;
604   }
605 
606   bool isXNACKEnabled() const {
607     return TargetID.isXnackOnOrAny();
608   }
609 
610   bool isTgSplitEnabled() const {
611     return EnableTgSplit;
612   }
613 
614   bool isCuModeEnabled() const {
615     return EnableCuMode;
616   }
617 
618   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
619 
620   bool hasFlatAddressSpace() const {
621     return FlatAddressSpace;
622   }
623 
624   bool hasFlatScrRegister() const {
625     return hasFlatAddressSpace();
626   }
627 
628   bool hasFlatInstOffsets() const {
629     return FlatInstOffsets;
630   }
631 
632   bool hasFlatGlobalInsts() const {
633     return FlatGlobalInsts;
634   }
635 
636   bool hasFlatScratchInsts() const {
637     return FlatScratchInsts;
638   }
639 
640   // Check if target supports ST addressing mode with FLAT scratch instructions.
641   // The ST addressing mode means no registers are used, either VGPR or SGPR,
642   // but only immediate offset is swizzled and added to the FLAT scratch base.
643   bool hasFlatScratchSTMode() const {
644     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
645   }
646 
647   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
648 
649   bool hasScalarFlatScratchInsts() const {
650     return ScalarFlatScratchInsts;
651   }
652 
653   bool enableFlatScratch() const {
654     return flatScratchIsArchitected() ||
655            (EnableFlatScratch && hasFlatScratchInsts());
656   }
657 
658   bool hasGlobalAddTidInsts() const {
659     return GFX10_BEncoding;
660   }
661 
662   bool hasAtomicCSub() const {
663     return GFX10_BEncoding;
664   }
665 
666   bool hasExportInsts() const {
667     return !hasGFX940Insts();
668   }
669 
670   bool hasVINTERPEncoding() const {
671     return GFX11Insts;
672   }
673 
674   // DS_ADD_F64/DS_ADD_RTN_F64
675   bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
676 
677   bool hasMultiDwordFlatScratchAddressing() const {
678     return getGeneration() >= GFX9;
679   }
680 
681   bool hasFlatSegmentOffsetBug() const {
682     return HasFlatSegmentOffsetBug;
683   }
684 
685   bool hasFlatLgkmVMemCountInOrder() const {
686     return getGeneration() > GFX9;
687   }
688 
689   bool hasD16LoadStore() const {
690     return getGeneration() >= GFX9;
691   }
692 
693   bool d16PreservesUnusedBits() const {
694     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
695   }
696 
697   bool hasD16Images() const {
698     return getGeneration() >= VOLCANIC_ISLANDS;
699   }
700 
701   /// Return if most LDS instructions have an m0 use that require m0 to be
702   /// initialized.
703   bool ldsRequiresM0Init() const {
704     return getGeneration() < GFX9;
705   }
706 
707   // True if the hardware rewinds and replays GWS operations if a wave is
708   // preempted.
709   //
710   // If this is false, a GWS operation requires testing if a nack set the
711   // MEM_VIOL bit, and repeating if so.
712   bool hasGWSAutoReplay() const {
713     return getGeneration() >= GFX9;
714   }
715 
716   /// \returns if target has ds_gws_sema_release_all instruction.
717   bool hasGWSSemaReleaseAll() const {
718     return CIInsts;
719   }
720 
721   /// \returns true if the target has integer add/sub instructions that do not
722   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
723   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
724   /// for saturation.
725   bool hasAddNoCarry() const {
726     return AddNoCarryInsts;
727   }
728 
729   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
730 
731   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
732 
733   bool hasUnpackedD16VMem() const {
734     return HasUnpackedD16VMem;
735   }
736 
737   // Covers VS/PS/CS graphics shaders
738   bool isMesaGfxShader(const Function &F) const {
739     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
740   }
741 
742   bool hasMad64_32() const {
743     return getGeneration() >= SEA_ISLANDS;
744   }
745 
746   bool hasSDWAOmod() const {
747     return HasSDWAOmod;
748   }
749 
750   bool hasSDWAScalar() const {
751     return HasSDWAScalar;
752   }
753 
754   bool hasSDWASdst() const {
755     return HasSDWASdst;
756   }
757 
758   bool hasSDWAMac() const {
759     return HasSDWAMac;
760   }
761 
762   bool hasSDWAOutModsVOPC() const {
763     return HasSDWAOutModsVOPC;
764   }
765 
766   bool hasDLInsts() const {
767     return HasDLInsts;
768   }
769 
770   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
771 
772   bool hasDot1Insts() const {
773     return HasDot1Insts;
774   }
775 
776   bool hasDot2Insts() const {
777     return HasDot2Insts;
778   }
779 
780   bool hasDot3Insts() const {
781     return HasDot3Insts;
782   }
783 
784   bool hasDot4Insts() const {
785     return HasDot4Insts;
786   }
787 
788   bool hasDot5Insts() const {
789     return HasDot5Insts;
790   }
791 
792   bool hasDot6Insts() const {
793     return HasDot6Insts;
794   }
795 
796   bool hasDot7Insts() const {
797     return HasDot7Insts;
798   }
799 
800   bool hasDot8Insts() const {
801     return HasDot8Insts;
802   }
803 
804   bool hasDot9Insts() const {
805     return HasDot9Insts;
806   }
807 
808   bool hasDot10Insts() const {
809     return HasDot10Insts;
810   }
811 
812   bool hasDot11Insts() const {
813     return HasDot11Insts;
814   }
815 
816   bool hasMAIInsts() const {
817     return HasMAIInsts;
818   }
819 
820   bool hasFP8Insts() const {
821     return HasFP8Insts;
822   }
823 
824   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
825 
826   bool hasPkFmacF16Inst() const {
827     return HasPkFmacF16Inst;
828   }
829 
  /// \returns the HasAtomicFMinFMaxF32GlobalInsts subtarget flag.
  bool hasAtomicFMinFMaxF32GlobalInsts() const {
    return HasAtomicFMinFMaxF32GlobalInsts;
  }
833 
  /// \returns the HasAtomicFMinFMaxF64GlobalInsts subtarget flag.
  bool hasAtomicFMinFMaxF64GlobalInsts() const {
    return HasAtomicFMinFMaxF64GlobalInsts;
  }
837 
  /// \returns the HasAtomicFMinFMaxF32FlatInsts subtarget flag.
  bool hasAtomicFMinFMaxF32FlatInsts() const {
    return HasAtomicFMinFMaxF32FlatInsts;
  }
841 
  /// \returns the HasAtomicFMinFMaxF64FlatInsts subtarget flag.
  bool hasAtomicFMinFMaxF64FlatInsts() const {
    return HasAtomicFMinFMaxF64FlatInsts;
  }
845 
846   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
847 
848   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
849 
850   bool hasAtomicFaddInsts() const {
851     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
852   }
853 
854   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
855 
856   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
857 
  /// \returns the HasAtomicBufferGlobalPkAddF16NoRtnInsts subtarget flag.
  bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
    return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
  }
861 
  /// \returns the HasAtomicBufferGlobalPkAddF16Insts subtarget flag.
  bool hasAtomicBufferGlobalPkAddF16Insts() const {
    return HasAtomicBufferGlobalPkAddF16Insts;
  }
865 
  /// \returns the HasAtomicGlobalPkAddBF16Inst subtarget flag.
  bool hasAtomicGlobalPkAddBF16Inst() const {
    return HasAtomicGlobalPkAddBF16Inst;
  }
869 
  /// \returns the HasAtomicBufferPkAddBF16Inst subtarget flag.
  bool hasAtomicBufferPkAddBF16Inst() const {
    return HasAtomicBufferPkAddBF16Inst;
  }
873 
874   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
875 
  /// \return true if the target has flat, global, and buffer atomic fadd for
  /// double. Reads the HasFlatBufferGlobalAtomicFaddF64Inst flag.
  bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
    return HasFlatBufferGlobalAtomicFaddF64Inst;
  }
881 
  /// \return true if the target's flat, global, and buffer atomic fadd for
  /// float supports denormal handling. Reads the
  /// HasMemoryAtomicFaddF32DenormalSupport flag.
  bool hasMemoryAtomicFaddF32DenormalSupport() const {
    return HasMemoryAtomicFaddF32DenormalSupport;
  }
887 
  /// \return true if atomic operations targeting fine-grained memory work
  /// correctly at device scope, in allocations in host or peer PCIe device
  /// memory. Reads the HasAgentScopeFineGrainedRemoteMemoryAtomics flag.
  bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
    return HasAgentScopeFineGrainedRemoteMemoryAtomics;
  }
894 
895   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
896 
  /// \returns the HasDefaultComponentBroadcast subtarget flag.
  bool hasDefaultComponentBroadcast() const {
    return HasDefaultComponentBroadcast;
  }
900 
901   bool hasNoSdstCMPX() const {
902     return HasNoSdstCMPX;
903   }
904 
905   bool hasVscnt() const {
906     return HasVscnt;
907   }
908 
909   bool hasGetWaveIdInst() const {
910     return HasGetWaveIdInst;
911   }
912 
913   bool hasSMemTimeInst() const {
914     return HasSMemTimeInst;
915   }
916 
917   bool hasShaderCyclesRegister() const {
918     return HasShaderCyclesRegister;
919   }
920 
  /// \returns the HasShaderCyclesHiLoRegisters subtarget flag.
  bool hasShaderCyclesHiLoRegisters() const {
    return HasShaderCyclesHiLoRegisters;
  }
924 
925   bool hasVOP3Literal() const {
926     return HasVOP3Literal;
927   }
928 
929   bool hasNoDataDepHazard() const {
930     return HasNoDataDepHazard;
931   }
932 
933   bool vmemWriteNeedsExpWaitcnt() const {
934     return getGeneration() < SEA_ISLANDS;
935   }
936 
937   bool hasInstPrefetch() const {
938     return getGeneration() == GFX10 || getGeneration() == GFX11;
939   }
940 
941   bool hasPrefetch() const { return GFX12Insts; }
942 
943   // Has s_cmpk_* instructions.
944   bool hasSCmpK() const { return getGeneration() < GFX12; }
945 
946   // Scratch is allocated in 256 dword per wave blocks for the entire
947   // wavefront. When viewed from the perspective of an arbitrary workitem, this
948   // is 4-byte aligned.
949   //
950   // Only 4-byte alignment is really needed to access anything. Transformations
951   // on the pointer value itself may rely on the alignment / known low bits of
952   // the pointer. Set this to something above the minimum to avoid needing
953   // dynamic realignment in common cases.
954   Align getStackAlignment() const { return Align(16); }
955 
956   bool enableMachineScheduler() const override {
957     return true;
958   }
959 
960   bool useAA() const override;
961 
962   bool enableSubRegLiveness() const override {
963     return true;
964   }
965 
966   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
967   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
968 
969   // static wrappers
970   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
971 
972   // XXX - Why is this here if it isn't in the default pass set?
973   bool enableEarlyIfConversion() const override {
974     return true;
975   }
976 
977   void overrideSchedPolicy(MachineSchedPolicy &Policy,
978                            unsigned NumRegionInstrs) const override;
979 
980   void mirFileLoaded(MachineFunction &MF) const override;
981 
982   unsigned getMaxNumUserSGPRs() const {
983     return AMDGPU::getMaxNumUserSGPRs(*this);
984   }
985 
986   bool hasSMemRealTime() const {
987     return HasSMemRealTime;
988   }
989 
990   bool hasMovrel() const {
991     return HasMovrel;
992   }
993 
994   bool hasVGPRIndexMode() const {
995     return HasVGPRIndexMode;
996   }
997 
998   bool useVGPRIndexMode() const;
999 
1000   bool hasScalarCompareEq64() const {
1001     return getGeneration() >= VOLCANIC_ISLANDS;
1002   }
1003 
1004   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1005 
1006   bool hasScalarStores() const {
1007     return HasScalarStores;
1008   }
1009 
1010   bool hasScalarAtomics() const {
1011     return HasScalarAtomics;
1012   }
1013 
1014   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1015   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1016 
1017   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1018   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1019 
1020   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
1021   bool hasPermLane64() const { return getGeneration() >= GFX11; }
1022 
1023   bool hasDPP() const {
1024     return HasDPP;
1025   }
1026 
1027   bool hasDPPBroadcasts() const {
1028     return HasDPP && getGeneration() < GFX10;
1029   }
1030 
1031   bool hasDPPWavefrontShifts() const {
1032     return HasDPP && getGeneration() < GFX10;
1033   }
1034 
1035   bool hasDPP8() const {
1036     return HasDPP8;
1037   }
1038 
1039   bool hasDPALU_DPP() const {
1040     return HasDPALU_DPP;
1041   }
1042 
1043   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1044 
1045   bool hasPackedFP32Ops() const {
1046     return HasPackedFP32Ops;
1047   }
1048 
1049   // Has V_PK_MOV_B32 opcode
1050   bool hasPkMovB32() const {
1051     return GFX90AInsts;
1052   }
1053 
1054   bool hasFmaakFmamkF32Insts() const {
1055     return getGeneration() >= GFX10 || hasGFX940Insts();
1056   }
1057 
1058   bool hasImageInsts() const {
1059     return HasImageInsts;
1060   }
1061 
1062   bool hasExtendedImageInsts() const {
1063     return HasExtendedImageInsts;
1064   }
1065 
1066   bool hasR128A16() const {
1067     return HasR128A16;
1068   }
1069 
1070   bool hasA16() const { return HasA16; }
1071 
1072   bool hasG16() const { return HasG16; }
1073 
1074   bool hasOffset3fBug() const {
1075     return HasOffset3fBug;
1076   }
1077 
1078   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1079 
1080   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1081 
1082   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1083 
1084   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1085 
1086   bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1087 
1088   bool hasNSAEncoding() const { return HasNSAEncoding; }
1089 
1090   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1091 
1092   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1093 
1094   unsigned getNSAMaxSize(bool HasSampler = false) const {
1095     return AMDGPU::getNSAMaxSize(*this, HasSampler);
1096   }
1097 
1098   bool hasGFX10_AEncoding() const {
1099     return GFX10_AEncoding;
1100   }
1101 
1102   bool hasGFX10_BEncoding() const {
1103     return GFX10_BEncoding;
1104   }
1105 
1106   bool hasGFX10_3Insts() const {
1107     return GFX10_3Insts;
1108   }
1109 
  /// Declaration only. NOTE(review): presumably reports availability of an
  /// f16 MAD instruction — confirm against the definition.
  bool hasMadF16() const;
1111 
1112   bool hasMovB64() const { return GFX940Insts; }
1113 
1114   bool hasLshlAddB64() const { return GFX940Insts; }
1115 
1116   bool enableSIScheduler() const {
1117     return EnableSIScheduler;
1118   }
1119 
1120   bool loadStoreOptEnabled() const {
1121     return EnableLoadStoreOpt;
1122   }
1123 
1124   bool hasSGPRInitBug() const {
1125     return SGPRInitBug;
1126   }
1127 
1128   bool hasUserSGPRInit16Bug() const {
1129     return UserSGPRInit16Bug && isWave32();
1130   }
1131 
1132   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1133 
1134   bool hasNegativeUnalignedScratchOffsetBug() const {
1135     return NegativeUnalignedScratchOffsetBug;
1136   }
1137 
1138   bool hasMFMAInlineLiteralBug() const {
1139     return HasMFMAInlineLiteralBug;
1140   }
1141 
1142   bool has12DWordStoreHazard() const {
1143     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1144   }
1145 
1146   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1147   bool hasDwordx3LoadStores() const {
1148     return CIInsts;
1149   }
1150 
1151   bool hasReadM0MovRelInterpHazard() const {
1152     return getGeneration() == AMDGPUSubtarget::GFX9;
1153   }
1154 
1155   bool hasReadM0SendMsgHazard() const {
1156     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1157            getGeneration() <= AMDGPUSubtarget::GFX9;
1158   }
1159 
1160   bool hasReadM0LdsDmaHazard() const {
1161     return getGeneration() == AMDGPUSubtarget::GFX9;
1162   }
1163 
1164   bool hasReadM0LdsDirectHazard() const {
1165     return getGeneration() == AMDGPUSubtarget::GFX9;
1166   }
1167 
1168   bool hasVcmpxPermlaneHazard() const {
1169     return HasVcmpxPermlaneHazard;
1170   }
1171 
1172   bool hasVMEMtoScalarWriteHazard() const {
1173     return HasVMEMtoScalarWriteHazard;
1174   }
1175 
1176   bool hasSMEMtoVectorWriteHazard() const {
1177     return HasSMEMtoVectorWriteHazard;
1178   }
1179 
1180   bool hasLDSMisalignedBug() const {
1181     return LDSMisalignedBug && !EnableCuMode;
1182   }
1183 
1184   bool hasInstFwdPrefetchBug() const {
1185     return HasInstFwdPrefetchBug;
1186   }
1187 
1188   bool hasVcmpxExecWARHazard() const {
1189     return HasVcmpxExecWARHazard;
1190   }
1191 
1192   bool hasLdsBranchVmemWARHazard() const {
1193     return HasLdsBranchVmemWARHazard;
1194   }
1195 
1196   // Shift amount of a 64 bit shift cannot be a highest allocated register
1197   // if also at the end of the allocation block.
1198   bool hasShift64HighRegBug() const {
1199     return GFX90AInsts && !GFX940Insts;
1200   }
1201 
1202   // Has one cycle hazard on transcendental instruction feeding a
1203   // non transcendental VALU.
1204   bool hasTransForwardingHazard() const { return GFX940Insts; }
1205 
1206   // Has one cycle hazard on a VALU instruction partially writing dst with
1207   // a shift of result bits feeding another VALU instruction.
1208   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1209 
1210   // Cannot use op_sel with v_dot instructions.
1211   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1212 
1213   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1214   bool hasVDecCoExecHazard() const {
1215     return GFX940Insts;
1216   }
1217 
1218   bool hasNSAtoVMEMBug() const {
1219     return HasNSAtoVMEMBug;
1220   }
1221 
1222   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1223 
1224   bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1225 
1226   bool hasGFX90AInsts() const { return GFX90AInsts; }
1227 
1228   bool hasFPAtomicToDenormModeHazard() const {
1229     return getGeneration() == GFX10;
1230   }
1231 
1232   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1233 
1234   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1235 
1236   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1237 
1238   bool hasVALUPartialForwardingHazard() const {
1239     return getGeneration() == GFX11;
1240   }
1241 
1242   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1243 
1244   bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1245 
1246   bool requiresCodeObjectV6() const { return RequiresCOV6; }
1247 
1248   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1249 
1250   /// Return if operations acting on VGPR tuples require even alignment.
1251   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1252 
1253   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1254   bool hasSPackHL() const { return GFX11Insts; }
1255 
1256   /// Return true if the target's EXP instruction has the COMPR flag, which
1257   /// affects the meaning of the EN (enable) bits.
1258   bool hasCompressedExport() const { return !GFX11Insts; }
1259 
1260   /// Return true if the target's EXP instruction supports the NULL export
1261   /// target.
1262   bool hasNullExportTarget() const { return !GFX11Insts; }
1263 
1264   bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1265 
1266   bool hasVOPDInsts() const { return HasVOPDInsts; }
1267 
1268   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1269 
1270   /// Return true if the target has the S_DELAY_ALU instruction.
1271   bool hasDelayAlu() const { return GFX11Insts; }
1272 
1273   bool hasPackedTID() const { return HasPackedTID; }
1274 
1275   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1276   // hasGFX90AInsts is also true.
1277   bool hasGFX940Insts() const { return GFX940Insts; }
1278 
1279   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1280 
1281   bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1282 
1283   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1284 
1285   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1286 
1287   bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1288 
1289   bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1290 
1291   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1292   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1293   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1294 
1295   /// \returns true if inline constants are not supported for F16 pseudo
1296   /// scalar transcendentals.
1297   bool hasNoF16PseudoScalarTransInlineConstants() const {
1298     return getGeneration() == GFX12;
1299   }
1300 
1301   /// \returns The maximum number of instructions that can be enclosed in an
1302   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1303   /// instruction.
1304   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1305 
  /// \returns the maximum number of waves per SIMD for kernels using
  /// \p SGPRs SGPRs.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1309 
  /// \returns the maximum number of waves per SIMD for kernels using
  /// \p VGPRs VGPRs.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1313 
  /// \returns the occupancy for the given function \p F. Uses \p LDSSize and
  /// the register counts \p NumSGPRs and \p NumVGPRs if provided; each of
  /// them defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1320 
1321   /// \returns true if the flat_scratch register should be initialized with the
1322   /// pointer to the wave's scratch memory rather than a size and offset.
1323   bool flatScratchIsPointer() const {
1324     return getGeneration() >= AMDGPUSubtarget::GFX9;
1325   }
1326 
1327   /// \returns true if the flat_scratch register is initialized by the HW.
1328   /// In this case it is readonly.
1329   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1330 
1331   /// \returns true if the architected SGPRs are enabled.
1332   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1333 
1334   /// \returns true if Global Data Share is supported.
1335   bool hasGDS() const { return HasGDS; }
1336 
1337   /// \returns true if Global Wave Sync is supported.
1338   bool hasGWS() const { return HasGWS; }
1339 
1340   /// \returns true if the machine has merged shaders in which s0-s7 are
1341   /// reserved by the hardware and user SGPRs start at s8
1342   bool hasMergedShaders() const {
1343     return getGeneration() >= GFX9;
1344   }
1345 
1346   // \returns true if the target supports the pre-NGG legacy geometry path.
1347   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1348 
1349   // \returns true if preloading kernel arguments is supported.
1350   bool hasKernargPreload() const { return KernargPreload; }
1351 
1352   // \returns true if the target has split barriers feature
1353   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1354 
1355   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1356   bool hasCvtFP8VOP1Bug() const { return true; }
1357 
1358   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1359   // no-return form.
1360   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1361 
1362   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1363   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1364 
1365   // \returns true if the target has IEEE kernel descriptor mode bit
1366   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1367 
1368   // \returns true if the target has IEEE fminimum/fmaximum instructions
1369   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1370 
1371   // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1372   bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1373 
1374   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1375   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1376 
1377   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1378   /// values.
1379   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1380 
1381   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1382   // of sign-extending.
1383   bool hasGetPCZeroExtension() const { return GFX12Insts; }
1384 
1385   /// \returns SGPR allocation granularity supported by the subtarget.
1386   unsigned getSGPRAllocGranule() const {
1387     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1388   }
1389 
1390   /// \returns SGPR encoding granularity supported by the subtarget.
1391   unsigned getSGPREncodingGranule() const {
1392     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1393   }
1394 
1395   /// \returns Total number of SGPRs supported by the subtarget.
1396   unsigned getTotalNumSGPRs() const {
1397     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1398   }
1399 
1400   /// \returns Addressable number of SGPRs supported by the subtarget.
1401   unsigned getAddressableNumSGPRs() const {
1402     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1403   }
1404 
1405   /// \returns Minimum number of SGPRs that meets the given number of waves per
1406   /// execution unit requirement supported by the subtarget.
1407   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1408     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1409   }
1410 
1411   /// \returns Maximum number of SGPRs that meets the given number of waves per
1412   /// execution unit requirement supported by the subtarget.
1413   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1414     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1415   }
1416 
  /// \returns the reserved number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs(). \p HasFlatScratch is supplied by the caller.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns the reserved number of SGPRs for the given machine function
  /// \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1423 
  /// \returns the reserved number of SGPRs for the given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;
1426 
  /// \returns the maximum number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getMaxNumSGPRs(); the callers supply \p WavesPerEU, \p PreloadedSGPRs
  /// and \p ReservedNumSGPRs for function \p F.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1434 
  /// \returns the maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned instead if the explicitly requested value cannot be
  /// converted to an integer, violates the subtarget's specifications, or
  /// does not meet the number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1444 
  /// \returns the maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p F.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned instead if the explicitly requested value cannot be
  /// converted to an integer, violates the subtarget's specifications, or
  /// does not meet the number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1454 
1455   /// \returns VGPR allocation granularity supported by the subtarget.
1456   unsigned getVGPRAllocGranule() const {
1457     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1458   }
1459 
1460   /// \returns VGPR encoding granularity supported by the subtarget.
1461   unsigned getVGPREncodingGranule() const {
1462     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1463   }
1464 
1465   /// \returns Total number of VGPRs supported by the subtarget.
1466   unsigned getTotalNumVGPRs() const {
1467     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1468   }
1469 
1470   /// \returns Addressable number of architectural VGPRs supported by the
1471   /// subtarget.
1472   unsigned getAddressableNumArchVGPRs() const {
1473     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1474   }
1475 
1476   /// \returns Addressable number of VGPRs supported by the subtarget.
1477   unsigned getAddressableNumVGPRs() const {
1478     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1479   }
1480 
1481   /// \returns the minimum number of VGPRs that will prevent achieving more than
1482   /// the specified number of waves \p WavesPerEU.
1483   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1484     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1485   }
1486 
1487   /// \returns the maximum number of VGPRs that can be used and still achieved
1488   /// at least the specified number of waves \p WavesPerEU.
1489   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1490     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1491   }
1492 
  /// \returns the maximum number of VGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getMaxNumVGPRs(); the callers supply \p WavesPerEU for function \p F.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns the maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned instead if the explicitly requested value cannot be
  /// converted to an integer, violates the subtarget's specifications, or
  /// does not meet the number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
1506 
1507   unsigned getMaxNumAGPRs(const Function &F) const {
1508     return getMaxNumVGPRs(F);
1509   }
1510 
  /// \returns the maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the number of waves per execution unit requirement
  /// is returned instead if the explicitly requested value cannot be
  /// converted to an integer, violates the subtarget's specifications, or
  /// does not meet the number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1520 
  /// Override of the inherited getPostRAMutations() hook (declaration only).
  /// \p Mutations is taken by non-const reference. NOTE(review): presumably
  /// the implementation appends this subtarget's post-RA scheduling
  /// mutations to it — confirm against the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1524 
  /// Declaration only. \returns an owning pointer to a ScheduleDAGMutation
  /// created for \p TII. NOTE(review): the name suggests the mutation fills
  /// MFMA shadow slots — confirm against the definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1527 
1528   bool isWave32() const {
1529     return getWavefrontSize() == 32;
1530   }
1531 
1532   bool isWave64() const {
1533     return getWavefrontSize() == 64;
1534   }
1535 
1536   const TargetRegisterClass *getBoolRC() const {
1537     return getRegisterInfo()->getBoolRC();
1538   }
1539 
1540   /// \returns Maximum number of work groups per compute unit supported by the
1541   /// subtarget and limited by given \p FlatWorkGroupSize.
1542   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1543     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1544   }
1545 
1546   /// \returns Minimum flat work group size supported by the subtarget.
1547   unsigned getMinFlatWorkGroupSize() const override {
1548     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1549   }
1550 
1551   /// \returns Maximum flat work group size supported by the subtarget.
1552   unsigned getMaxFlatWorkGroupSize() const override {
1553     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1554   }
1555 
1556   /// \returns Number of waves per execution unit required to support the given
1557   /// \p FlatWorkGroupSize.
1558   unsigned
1559   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1560     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1561   }
1562 
1563   /// \returns Minimum number of waves per execution unit supported by the
1564   /// subtarget.
1565   unsigned getMinWavesPerEU() const override {
1566     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1567   }
1568 
  /// Override of the inherited adjustSchedDependency() hook (declaration
  /// only). NOTE(review): presumably adjusts \p Dep, the dependency between
  /// operand \p DefOpIdx of \p Def and operand \p UseOpIdx of \p Use, with
  /// the help of \p SchedModel — confirm against the definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;
1572 
1573   // \returns true if it's beneficial on this subtarget for the scheduler to
1574   // cluster stores as well as loads.
1575   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1576 
  /// \returns the number of address arguments from which to enable MIMG NSA
  /// on supported architectures, for machine function \p MF.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
1580 
1581   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1582   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1583   bool requiresNopBeforeDeallocVGPRs() const {
1584     // Currently all targets that support the dealloc VGPRs message also require
1585     // the nop.
1586     return true;
1587   }
1588 };
1589 
1590 class GCNUserSGPRUsageInfo {
1591 public:
1592   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1593 
1594   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1595 
1596   bool hasDispatchPtr() const { return DispatchPtr; }
1597 
1598   bool hasQueuePtr() const { return QueuePtr; }
1599 
1600   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1601 
1602   bool hasDispatchID() const { return DispatchID; }
1603 
1604   bool hasFlatScratchInit() const { return FlatScratchInit; }
1605 
1606   bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1607 
1608   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1609 
1610   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1611 
1612   unsigned getNumFreeUserSGPRs();
1613 
1614   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1615 
1616   enum UserSGPRID : unsigned {
1617     ImplicitBufferPtrID = 0,
1618     PrivateSegmentBufferID = 1,
1619     DispatchPtrID = 2,
1620     QueuePtrID = 3,
1621     KernargSegmentPtrID = 4,
1622     DispatchIdID = 5,
1623     FlatScratchInitID = 6,
1624     PrivateSegmentSizeID = 7
1625   };
1626 
1627   // Returns the size in number of SGPRs for preload user SGPR field.
1628   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1629     switch (ID) {
1630     case ImplicitBufferPtrID:
1631       return 2;
1632     case PrivateSegmentBufferID:
1633       return 4;
1634     case DispatchPtrID:
1635       return 2;
1636     case QueuePtrID:
1637       return 2;
1638     case KernargSegmentPtrID:
1639       return 2;
1640     case DispatchIdID:
1641       return 2;
1642     case FlatScratchInitID:
1643       return 2;
1644     case PrivateSegmentSizeID:
1645       return 1;
1646     }
1647     llvm_unreachable("Unknown UserSGPRID.");
1648   }
1649 
1650   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1651 
1652 private:
1653   const GCNSubtarget &ST;
1654 
1655   // Private memory buffer
1656   // Compute directly in sgpr[0:1]
1657   // Other shaders indirect 64-bits at sgpr[0:1]
1658   bool ImplicitBufferPtr = false;
1659 
1660   bool PrivateSegmentBuffer = false;
1661 
1662   bool DispatchPtr = false;
1663 
1664   bool QueuePtr = false;
1665 
1666   bool KernargSegmentPtr = false;
1667 
1668   bool DispatchID = false;
1669 
1670   bool FlatScratchInit = false;
1671 
1672   bool PrivateSegmentSize = false;
1673 
1674   unsigned NumKernargPreloadSGPRs = 0;
1675 
1676   unsigned NumUsedUserSGPRs = 0;
1677 };
1678 
1679 } // end namespace llvm
1680 
1681 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1682