xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 53120fbb68952b7d620c2c0e1cf05c5017fc1b27)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "SIFrameLowering.h"
21 #include "SIISelLowering.h"
22 #include "SIInstrInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define GET_SUBTARGETINFO_HEADER
28 #include "AMDGPUGenSubtargetInfo.inc"
29 
30 namespace llvm {
31 
32 class GCNTargetMachine;
33 
34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35                            public AMDGPUSubtarget {
36 public:
37   using AMDGPUSubtarget::getMaxWavesPerEU;
38 
39   // Following 2 enums are documented at:
40   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
41   enum class TrapHandlerAbi {
42     NONE   = 0x00,
43     AMDHSA = 0x01,
44   };
45 
46   enum class TrapID {
47     LLVMAMDHSATrap      = 0x02,
48     LLVMAMDHSADebugTrap = 0x03,
49   };
50 
51 private:
52   /// GlobalISel related APIs.
53   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55   std::unique_ptr<InstructionSelector> InstSelector;
56   std::unique_ptr<LegalizerInfo> Legalizer;
57   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58 
59 protected:
60   // Basic subtarget description.
61   Triple TargetTriple;
62   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63   unsigned Gen = INVALID;
64   InstrItineraryData InstrItins;
65   int LDSBankCount = 0;
66   unsigned MaxPrivateElementSize = 0;
67 
68   // Possibly statically set by tablegen, but may want to be overridden.
69   bool FastDenormalF32 = false;
70   bool HalfRate64Ops = false;
71   bool FullRate64Ops = false;
72 
73   // Dynamically set bits that enable features.
74   bool FlatForGlobal = false;
75   bool AutoWaitcntBeforeBarrier = false;
76   bool BackOffBarrier = false;
77   bool UnalignedScratchAccess = false;
78   bool UnalignedAccessMode = false;
79   bool HasApertureRegs = false;
80   bool SupportsXNACK = false;
81   bool KernargPreload = false;
82 
83   // This should not be used directly. 'TargetID' tracks the dynamic settings
84   // for XNACK.
85   bool EnableXNACK = false;
86 
87   bool EnableTgSplit = false;
88   bool EnableCuMode = false;
89   bool TrapHandler = false;
90 
91   // Used as options.
92   bool EnableLoadStoreOpt = false;
93   bool EnableUnsafeDSOffsetFolding = false;
94   bool EnableSIScheduler = false;
95   bool EnableDS128 = false;
96   bool EnablePRTStrictNull = false;
97   bool DumpCode = false;
98 
99   // Subtarget properties statically set by tablegen
100   bool FP64 = false;
101   bool FMA = false;
102   bool MIMG_R128 = false;
103   bool CIInsts = false;
104   bool GFX8Insts = false;
105   bool GFX9Insts = false;
106   bool GFX90AInsts = false;
107   bool GFX940Insts = false;
108   bool GFX10Insts = false;
109   bool GFX11Insts = false;
110   bool GFX12Insts = false;
111   bool GFX10_3Insts = false;
112   bool GFX7GFX8GFX9Insts = false;
113   bool SGPRInitBug = false;
114   bool UserSGPRInit16Bug = false;
115   bool NegativeScratchOffsetBug = false;
116   bool NegativeUnalignedScratchOffsetBug = false;
117   bool HasSMemRealTime = false;
118   bool HasIntClamp = false;
119   bool HasFmaMixInsts = false;
120   bool HasMovrel = false;
121   bool HasVGPRIndexMode = false;
122   bool HasScalarDwordx3Loads = false;
123   bool HasScalarStores = false;
124   bool HasScalarAtomics = false;
125   bool HasSDWAOmod = false;
126   bool HasSDWAScalar = false;
127   bool HasSDWASdst = false;
128   bool HasSDWAMac = false;
129   bool HasSDWAOutModsVOPC = false;
130   bool HasDPP = false;
131   bool HasDPP8 = false;
132   bool HasDPALU_DPP = false;
133   bool HasDPPSrc1SGPR = false;
134   bool HasPackedFP32Ops = false;
135   bool HasImageInsts = false;
136   bool HasExtendedImageInsts = false;
137   bool HasR128A16 = false;
138   bool HasA16 = false;
139   bool HasG16 = false;
140   bool HasNSAEncoding = false;
141   bool HasPartialNSAEncoding = false;
142   bool GFX10_AEncoding = false;
143   bool GFX10_BEncoding = false;
144   bool HasDLInsts = false;
145   bool HasFmacF64Inst = false;
146   bool HasDot1Insts = false;
147   bool HasDot2Insts = false;
148   bool HasDot3Insts = false;
149   bool HasDot4Insts = false;
150   bool HasDot5Insts = false;
151   bool HasDot6Insts = false;
152   bool HasDot7Insts = false;
153   bool HasDot8Insts = false;
154   bool HasDot9Insts = false;
155   bool HasDot10Insts = false;
156   bool HasMAIInsts = false;
157   bool HasFP8Insts = false;
158   bool HasFP8ConversionInsts = false;
159   bool HasPkFmacF16Inst = false;
160   bool HasAtomicDsPkAdd16Insts = false;
161   bool HasAtomicFlatPkAdd16Insts = false;
162   bool HasAtomicFaddRtnInsts = false;
163   bool HasAtomicFaddNoRtnInsts = false;
164   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
165   bool HasAtomicBufferGlobalPkAddF16Insts = false;
166   bool HasAtomicCSubNoRtnInsts = false;
167   bool HasAtomicGlobalPkAddBF16Inst = false;
168   bool HasFlatAtomicFaddF32Inst = false;
169   bool HasDefaultComponentZero = false;
170   bool HasDefaultComponentBroadcast = false;
171   bool SupportsSRAMECC = false;
172 
173   // This should not be used directly. 'TargetID' tracks the dynamic settings
174   // for SRAMECC.
175   bool EnableSRAMECC = false;
176 
177   bool HasNoSdstCMPX = false;
178   bool HasVscnt = false;
179   bool HasGetWaveIdInst = false;
180   bool HasSMemTimeInst = false;
181   bool HasShaderCyclesRegister = false;
182   bool HasShaderCyclesHiLoRegisters = false;
183   bool HasVOP3Literal = false;
184   bool HasNoDataDepHazard = false;
185   bool FlatAddressSpace = false;
186   bool FlatInstOffsets = false;
187   bool FlatGlobalInsts = false;
188   bool FlatScratchInsts = false;
189   bool ScalarFlatScratchInsts = false;
190   bool HasArchitectedFlatScratch = false;
191   bool EnableFlatScratch = false;
192   bool HasArchitectedSGPRs = false;
193   bool HasGDS = false;
194   bool HasGWS = false;
195   bool AddNoCarryInsts = false;
196   bool HasUnpackedD16VMem = false;
197   bool LDSMisalignedBug = false;
198   bool HasMFMAInlineLiteralBug = false;
199   bool UnalignedBufferAccess = false;
200   bool UnalignedDSAccess = false;
201   bool HasPackedTID = false;
202   bool ScalarizeGlobal = false;
203   bool HasSALUFloatInsts = false;
204   bool HasVGPRSingleUseHintInsts = false;
205   bool HasPseudoScalarTrans = false;
206   bool HasRestrictedSOffset = false;
207 
208   bool HasVcmpxPermlaneHazard = false;
209   bool HasVMEMtoScalarWriteHazard = false;
210   bool HasSMEMtoVectorWriteHazard = false;
211   bool HasInstFwdPrefetchBug = false;
212   bool HasVcmpxExecWARHazard = false;
213   bool HasLdsBranchVmemWARHazard = false;
214   bool HasNSAtoVMEMBug = false;
215   bool HasNSAClauseBug = false;
216   bool HasOffset3fBug = false;
217   bool HasFlatSegmentOffsetBug = false;
218   bool HasImageStoreD16Bug = false;
219   bool HasImageGather4D16Bug = false;
220   bool HasMSAALoadDstSelBug = false;
221   bool HasGFX11FullVGPRs = false;
222   bool HasMADIntraFwdBug = false;
223   bool HasVOPDInsts = false;
224   bool HasVALUTransUseHazard = false;
225   bool HasForceStoreSC0SC1 = false;
226 
227   // Dummy feature to use for assembler in tablegen.
228   bool FeatureDisable = false;
229 
230   SelectionDAGTargetInfo TSInfo;
231 private:
232   SIInstrInfo InstrInfo;
233   SITargetLowering TLInfo;
234   SIFrameLowering FrameLowering;
235 
236 public:
237   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
238                const GCNTargetMachine &TM);
239   ~GCNSubtarget() override;
240 
241   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
242                                                    StringRef GPU, StringRef FS);
243 
244   const SIInstrInfo *getInstrInfo() const override {
245     return &InstrInfo;
246   }
247 
248   const SIFrameLowering *getFrameLowering() const override {
249     return &FrameLowering;
250   }
251 
252   const SITargetLowering *getTargetLowering() const override {
253     return &TLInfo;
254   }
255 
256   const SIRegisterInfo *getRegisterInfo() const override {
257     return &InstrInfo.getRegisterInfo();
258   }
259 
260   const CallLowering *getCallLowering() const override {
261     return CallLoweringInfo.get();
262   }
263 
264   const InlineAsmLowering *getInlineAsmLowering() const override {
265     return InlineAsmLoweringInfo.get();
266   }
267 
268   InstructionSelector *getInstructionSelector() const override {
269     return InstSelector.get();
270   }
271 
272   const LegalizerInfo *getLegalizerInfo() const override {
273     return Legalizer.get();
274   }
275 
276   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
277     return RegBankInfo.get();
278   }
279 
280   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
281     return TargetID;
282   }
283 
284   // Nothing implemented, just prevent crashes on use.
285   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
286     return &TSInfo;
287   }
288 
289   const InstrItineraryData *getInstrItineraryData() const override {
290     return &InstrItins;
291   }
292 
293   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
294 
295   Generation getGeneration() const {
296     return (Generation)Gen;
297   }
298 
299   unsigned getMaxWaveScratchSize() const {
300     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
301     if (getGeneration() >= GFX12) {
302       // 18-bit field in units of 64-dword.
303       return (64 * 4) * ((1 << 18) - 1);
304     }
305     if (getGeneration() == GFX11) {
306       // 15-bit field in units of 64-dword.
307       return (64 * 4) * ((1 << 15) - 1);
308     }
309     // 13-bit field in units of 256-dword.
310     return (256 * 4) * ((1 << 13) - 1);
311   }
312 
313   /// Return the number of high bits known to be zero for a frame index.
314   unsigned getKnownHighZeroBitsForFrameIndex() const {
315     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
316   }
317 
318   int getLDSBankCount() const {
319     return LDSBankCount;
320   }
321 
322   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
323     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
324   }
325 
326   unsigned getConstantBusLimit(unsigned Opcode) const;
327 
328   /// Returns if the result of this instruction with a 16-bit result returned in
329   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
330   /// the original value.
331   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
332 
333   bool supportsWGP() const { return getGeneration() >= GFX10; }
334 
335   bool hasIntClamp() const {
336     return HasIntClamp;
337   }
338 
339   bool hasFP64() const {
340     return FP64;
341   }
342 
343   bool hasMIMG_R128() const {
344     return MIMG_R128;
345   }
346 
347   bool hasHWFP64() const {
348     return FP64;
349   }
350 
351   bool hasHalfRate64Ops() const {
352     return HalfRate64Ops;
353   }
354 
355   bool hasFullRate64Ops() const {
356     return FullRate64Ops;
357   }
358 
359   bool hasAddr64() const {
360     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
361   }
362 
363   bool hasFlat() const {
364     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
365   }
366 
367   // Return true if the target only has the reverse operand versions of VALU
368   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
369   bool hasOnlyRevVALUShifts() const {
370     return getGeneration() >= VOLCANIC_ISLANDS;
371   }
372 
373   bool hasFractBug() const {
374     return getGeneration() == SOUTHERN_ISLANDS;
375   }
376 
377   bool hasBFE() const {
378     return true;
379   }
380 
381   bool hasBFI() const {
382     return true;
383   }
384 
385   bool hasBFM() const {
386     return hasBFE();
387   }
388 
389   bool hasBCNT(unsigned Size) const {
390     return true;
391   }
392 
393   bool hasFFBL() const {
394     return true;
395   }
396 
397   bool hasFFBH() const {
398     return true;
399   }
400 
401   bool hasMed3_16() const {
402     return getGeneration() >= AMDGPUSubtarget::GFX9;
403   }
404 
405   bool hasMin3Max3_16() const {
406     return getGeneration() >= AMDGPUSubtarget::GFX9;
407   }
408 
409   bool hasFmaMixInsts() const {
410     return HasFmaMixInsts;
411   }
412 
413   bool hasCARRY() const {
414     return true;
415   }
416 
417   bool hasFMA() const {
418     return FMA;
419   }
420 
421   bool hasSwap() const {
422     return GFX9Insts;
423   }
424 
425   bool hasScalarPackInsts() const {
426     return GFX9Insts;
427   }
428 
429   bool hasScalarMulHiInsts() const {
430     return GFX9Insts;
431   }
432 
433   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
434 
435   TrapHandlerAbi getTrapHandlerAbi() const {
436     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
437   }
438 
439   bool supportsGetDoorbellID() const {
440     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
441     return getGeneration() >= GFX9;
442   }
443 
444   /// True if the offset field of DS instructions works as expected. On SI, the
445   /// offset uses a 16-bit adder and does not always wrap properly.
446   bool hasUsableDSOffset() const {
447     return getGeneration() >= SEA_ISLANDS;
448   }
449 
450   bool unsafeDSOffsetFoldingEnabled() const {
451     return EnableUnsafeDSOffsetFolding;
452   }
453 
454   /// Condition output from div_scale is usable.
455   bool hasUsableDivScaleConditionOutput() const {
456     return getGeneration() != SOUTHERN_ISLANDS;
457   }
458 
459   /// Extra wait hazard is needed in some cases before
460   /// s_cbranch_vccnz/s_cbranch_vccz.
461   bool hasReadVCCZBug() const {
462     return getGeneration() <= SEA_ISLANDS;
463   }
464 
465   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
466   bool partialVCCWritesUpdateVCCZ() const {
467     return getGeneration() >= GFX10;
468   }
469 
470   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
471   /// was written by a VALU instruction.
472   bool hasSMRDReadVALUDefHazard() const {
473     return getGeneration() == SOUTHERN_ISLANDS;
474   }
475 
476   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
477   /// SGPR was written by a VALU Instruction.
478   bool hasVMEMReadSGPRVALUDefHazard() const {
479     return getGeneration() >= VOLCANIC_ISLANDS;
480   }
481 
482   bool hasRFEHazards() const {
483     return getGeneration() >= VOLCANIC_ISLANDS;
484   }
485 
486   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
487   unsigned getSetRegWaitStates() const {
488     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
489   }
490 
491   bool dumpCode() const {
492     return DumpCode;
493   }
494 
495   /// Return the amount of LDS that can be used that will not restrict the
496   /// occupancy lower than WaveCount.
497   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
498                                            const Function &) const;
499 
500   bool supportsMinMaxDenormModes() const {
501     return getGeneration() >= AMDGPUSubtarget::GFX9;
502   }
503 
504   /// \returns If target supports S_DENORM_MODE.
505   bool hasDenormModeInst() const {
506     return getGeneration() >= AMDGPUSubtarget::GFX10;
507   }
508 
509   bool useFlatForGlobal() const {
510     return FlatForGlobal;
511   }
512 
513   /// \returns If target supports ds_read/write_b128 and user enables generation
514   /// of ds_read/write_b128.
515   bool useDS128() const {
516     return CIInsts && EnableDS128;
517   }
518 
519   /// \return If target supports ds_read/write_b96/128.
520   bool hasDS96AndDS128() const {
521     return CIInsts;
522   }
523 
524   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
525   bool haveRoundOpsF64() const {
526     return CIInsts;
527   }
528 
529   /// \returns If MUBUF instructions always perform range checking, even for
530   /// buffer resources used for private memory access.
531   bool privateMemoryResourceIsRangeChecked() const {
532     return getGeneration() < AMDGPUSubtarget::GFX9;
533   }
534 
535   /// \returns If target requires PRT Strict NULL support (zero result registers
536   /// for sparse texture support).
537   bool usePRTStrictNull() const {
538     return EnablePRTStrictNull;
539   }
540 
541   bool hasAutoWaitcntBeforeBarrier() const {
542     return AutoWaitcntBeforeBarrier;
543   }
544 
545   /// \returns true if the target supports backing off of s_barrier instructions
546   /// when an exception is raised.
547   bool supportsBackOffBarrier() const {
548     return BackOffBarrier;
549   }
550 
551   bool hasUnalignedBufferAccess() const {
552     return UnalignedBufferAccess;
553   }
554 
555   bool hasUnalignedBufferAccessEnabled() const {
556     return UnalignedBufferAccess && UnalignedAccessMode;
557   }
558 
559   bool hasUnalignedDSAccess() const {
560     return UnalignedDSAccess;
561   }
562 
563   bool hasUnalignedDSAccessEnabled() const {
564     return UnalignedDSAccess && UnalignedAccessMode;
565   }
566 
567   bool hasUnalignedScratchAccess() const {
568     return UnalignedScratchAccess;
569   }
570 
571   bool hasUnalignedAccessMode() const {
572     return UnalignedAccessMode;
573   }
574 
575   bool hasApertureRegs() const {
576     return HasApertureRegs;
577   }
578 
579   bool isTrapHandlerEnabled() const {
580     return TrapHandler;
581   }
582 
583   bool isXNACKEnabled() const {
584     return TargetID.isXnackOnOrAny();
585   }
586 
587   bool isTgSplitEnabled() const {
588     return EnableTgSplit;
589   }
590 
591   bool isCuModeEnabled() const {
592     return EnableCuMode;
593   }
594 
595   bool hasFlatAddressSpace() const {
596     return FlatAddressSpace;
597   }
598 
599   bool hasFlatScrRegister() const {
600     return hasFlatAddressSpace();
601   }
602 
603   bool hasFlatInstOffsets() const {
604     return FlatInstOffsets;
605   }
606 
607   bool hasFlatGlobalInsts() const {
608     return FlatGlobalInsts;
609   }
610 
611   bool hasFlatScratchInsts() const {
612     return FlatScratchInsts;
613   }
614 
615   // Check if target supports ST addressing mode with FLAT scratch instructions.
616   // The ST addressing mode means no registers are used, either VGPR or SGPR,
617   // but only immediate offset is swizzled and added to the FLAT scratch base.
618   bool hasFlatScratchSTMode() const {
619     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
620   }
621 
622   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
623 
624   bool hasScalarFlatScratchInsts() const {
625     return ScalarFlatScratchInsts;
626   }
627 
628   bool enableFlatScratch() const {
629     return flatScratchIsArchitected() ||
630            (EnableFlatScratch && hasFlatScratchInsts());
631   }
632 
633   bool hasGlobalAddTidInsts() const {
634     return GFX10_BEncoding;
635   }
636 
637   bool hasAtomicCSub() const {
638     return GFX10_BEncoding;
639   }
640 
641   bool hasMultiDwordFlatScratchAddressing() const {
642     return getGeneration() >= GFX9;
643   }
644 
645   bool hasFlatSegmentOffsetBug() const {
646     return HasFlatSegmentOffsetBug;
647   }
648 
649   bool hasFlatLgkmVMemCountInOrder() const {
650     return getGeneration() > GFX9;
651   }
652 
653   bool hasD16LoadStore() const {
654     return getGeneration() >= GFX9;
655   }
656 
657   bool d16PreservesUnusedBits() const {
658     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
659   }
660 
661   bool hasD16Images() const {
662     return getGeneration() >= VOLCANIC_ISLANDS;
663   }
664 
665   /// Returns true if most LDS instructions have an m0 use that requires m0
666   /// to be initialized.
667   bool ldsRequiresM0Init() const {
668     return getGeneration() < GFX9;
669   }
670 
671   // True if the hardware rewinds and replays GWS operations if a wave is
672   // preempted.
673   //
674   // If this is false, a GWS operation requires testing if a nack set the
675   // MEM_VIOL bit, and repeating if so.
676   bool hasGWSAutoReplay() const {
677     return getGeneration() >= GFX9;
678   }
679 
680   /// \returns if target has ds_gws_sema_release_all instruction.
681   bool hasGWSSemaReleaseAll() const {
682     return CIInsts;
683   }
684 
685   /// \returns true if the target has integer add/sub instructions that do not
686   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
687   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
688   /// for saturation.
689   bool hasAddNoCarry() const {
690     return AddNoCarryInsts;
691   }
692 
693   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
694 
695   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
696 
697   bool hasUnpackedD16VMem() const {
698     return HasUnpackedD16VMem;
699   }
700 
701   // Covers VS/PS/CS graphics shaders
702   bool isMesaGfxShader(const Function &F) const {
703     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
704   }
705 
706   bool hasMad64_32() const {
707     return getGeneration() >= SEA_ISLANDS;
708   }
709 
710   bool hasSDWAOmod() const {
711     return HasSDWAOmod;
712   }
713 
714   bool hasSDWAScalar() const {
715     return HasSDWAScalar;
716   }
717 
718   bool hasSDWASdst() const {
719     return HasSDWASdst;
720   }
721 
722   bool hasSDWAMac() const {
723     return HasSDWAMac;
724   }
725 
726   bool hasSDWAOutModsVOPC() const {
727     return HasSDWAOutModsVOPC;
728   }
729 
730   bool hasDLInsts() const {
731     return HasDLInsts;
732   }
733 
734   bool hasFmacF64Inst() const { return HasFmacF64Inst; }
735 
736   bool hasDot1Insts() const {
737     return HasDot1Insts;
738   }
739 
740   bool hasDot2Insts() const {
741     return HasDot2Insts;
742   }
743 
744   bool hasDot3Insts() const {
745     return HasDot3Insts;
746   }
747 
748   bool hasDot4Insts() const {
749     return HasDot4Insts;
750   }
751 
752   bool hasDot5Insts() const {
753     return HasDot5Insts;
754   }
755 
756   bool hasDot6Insts() const {
757     return HasDot6Insts;
758   }
759 
760   bool hasDot7Insts() const {
761     return HasDot7Insts;
762   }
763 
764   bool hasDot8Insts() const {
765     return HasDot8Insts;
766   }
767 
768   bool hasDot9Insts() const {
769     return HasDot9Insts;
770   }
771 
772   bool hasDot10Insts() const {
773     return HasDot10Insts;
774   }
775 
776   bool hasMAIInsts() const {
777     return HasMAIInsts;
778   }
779 
780   bool hasFP8Insts() const {
781     return HasFP8Insts;
782   }
783 
784   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
785 
786   bool hasPkFmacF16Inst() const {
787     return HasPkFmacF16Inst;
788   }
789 
790   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
791 
792   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
793 
794   bool hasAtomicFaddInsts() const {
795     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
796   }
797 
798   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
799 
800   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
801 
802   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
803     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
804   }
805 
806   bool hasAtomicBufferGlobalPkAddF16Insts() const {
807     return HasAtomicBufferGlobalPkAddF16Insts;
808   }
809 
810   bool hasAtomicGlobalPkAddBF16Inst() const {
811     return HasAtomicGlobalPkAddBF16Inst;
812   }
813 
814   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
815 
816   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
817 
818   bool hasDefaultComponentBroadcast() const {
819     return HasDefaultComponentBroadcast;
820   }
821 
822   bool hasNoSdstCMPX() const {
823     return HasNoSdstCMPX;
824   }
825 
826   bool hasVscnt() const {
827     return HasVscnt;
828   }
829 
830   bool hasGetWaveIdInst() const {
831     return HasGetWaveIdInst;
832   }
833 
834   bool hasSMemTimeInst() const {
835     return HasSMemTimeInst;
836   }
837 
838   bool hasShaderCyclesRegister() const {
839     return HasShaderCyclesRegister;
840   }
841 
842   bool hasShaderCyclesHiLoRegisters() const {
843     return HasShaderCyclesHiLoRegisters;
844   }
845 
846   bool hasVOP3Literal() const {
847     return HasVOP3Literal;
848   }
849 
850   bool hasNoDataDepHazard() const {
851     return HasNoDataDepHazard;
852   }
853 
854   bool vmemWriteNeedsExpWaitcnt() const {
855     return getGeneration() < SEA_ISLANDS;
856   }
857 
858   bool hasInstPrefetch() const {
859     return getGeneration() == GFX10 || getGeneration() == GFX11;
860   }
861 
862   bool hasPrefetch() const { return GFX12Insts; }
863 
864   // Has s_cmpk_* instructions.
865   bool hasSCmpK() const { return getGeneration() < GFX12; }
866 
867   // Scratch is allocated in 256 dword per wave blocks for the entire
868   // wavefront. When viewed from the perspective of an arbitrary workitem, this
869   // is 4-byte aligned.
870   //
871   // Only 4-byte alignment is really needed to access anything. Transformations
872   // on the pointer value itself may rely on the alignment / known low bits of
873   // the pointer. Set this to something above the minimum to avoid needing
874   // dynamic realignment in common cases.
875   Align getStackAlignment() const { return Align(16); }
876 
877   bool enableMachineScheduler() const override {
878     return true;
879   }
880 
881   bool useAA() const override;
882 
883   bool enableSubRegLiveness() const override {
884     return true;
885   }
886 
887   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
888   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
889 
890   // static wrappers
891   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
892 
893   // XXX - Why is this here if it isn't in the default pass set?
894   bool enableEarlyIfConversion() const override {
895     return true;
896   }
897 
898   void overrideSchedPolicy(MachineSchedPolicy &Policy,
899                            unsigned NumRegionInstrs) const override;
900 
901   unsigned getMaxNumUserSGPRs() const {
902     return AMDGPU::getMaxNumUserSGPRs(*this);
903   }
904 
905   bool hasSMemRealTime() const {
906     return HasSMemRealTime;
907   }
908 
909   bool hasMovrel() const {
910     return HasMovrel;
911   }
912 
913   bool hasVGPRIndexMode() const {
914     return HasVGPRIndexMode;
915   }
916 
917   bool useVGPRIndexMode() const;
918 
919   bool hasScalarCompareEq64() const {
920     return getGeneration() >= VOLCANIC_ISLANDS;
921   }
922 
923   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
924 
925   bool hasScalarStores() const {
926     return HasScalarStores;
927   }
928 
929   bool hasScalarAtomics() const {
930     return HasScalarAtomics;
931   }
932 
933   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
934 
935   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
936   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
937 
938   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
939   bool hasPermLane64() const { return getGeneration() >= GFX11; }
940 
941   bool hasDPP() const {
942     return HasDPP;
943   }
944 
945   bool hasDPPBroadcasts() const {
946     return HasDPP && getGeneration() < GFX10;
947   }
948 
949   bool hasDPPWavefrontShifts() const {
950     return HasDPP && getGeneration() < GFX10;
951   }
952 
953   bool hasDPP8() const {
954     return HasDPP8;
955   }
956 
957   bool hasDPALU_DPP() const {
958     return HasDPALU_DPP;
959   }
960 
961   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
962 
963   bool hasPackedFP32Ops() const {
964     return HasPackedFP32Ops;
965   }
966 
967   // Has V_PK_MOV_B32 opcode
968   bool hasPkMovB32() const {
969     return GFX90AInsts;
970   }
971 
972   bool hasFmaakFmamkF32Insts() const {
973     return getGeneration() >= GFX10 || hasGFX940Insts();
974   }
975 
976   bool hasImageInsts() const {
977     return HasImageInsts;
978   }
979 
980   bool hasExtendedImageInsts() const {
981     return HasExtendedImageInsts;
982   }
983 
984   bool hasR128A16() const {
985     return HasR128A16;
986   }
987 
988   bool hasA16() const { return HasA16; }
989 
990   bool hasG16() const { return HasG16; }
991 
992   bool hasOffset3fBug() const {
993     return HasOffset3fBug;
994   }
995 
996   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
997 
998   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
999 
1000   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1001 
1002   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1003 
1004   bool hasNSAEncoding() const { return HasNSAEncoding; }
1005 
1006   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1007 
1008   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1009 
1010   unsigned getNSAMaxSize(bool HasSampler = false) const {
1011     return AMDGPU::getNSAMaxSize(*this, HasSampler);
1012   }
1013 
1014   bool hasGFX10_AEncoding() const {
1015     return GFX10_AEncoding;
1016   }
1017 
1018   bool hasGFX10_BEncoding() const {
1019     return GFX10_BEncoding;
1020   }
1021 
1022   bool hasGFX10_3Insts() const {
1023     return GFX10_3Insts;
1024   }
1025 
1026   bool hasMadF16() const;
1027 
1028   bool hasMovB64() const { return GFX940Insts; }
1029 
1030   bool hasLshlAddB64() const { return GFX940Insts; }
1031 
1032   bool enableSIScheduler() const {
1033     return EnableSIScheduler;
1034   }
1035 
1036   bool loadStoreOptEnabled() const {
1037     return EnableLoadStoreOpt;
1038   }
1039 
1040   bool hasSGPRInitBug() const {
1041     return SGPRInitBug;
1042   }
1043 
1044   bool hasUserSGPRInit16Bug() const {
1045     return UserSGPRInit16Bug && isWave32();
1046   }
1047 
1048   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1049 
1050   bool hasNegativeUnalignedScratchOffsetBug() const {
1051     return NegativeUnalignedScratchOffsetBug;
1052   }
1053 
1054   bool hasMFMAInlineLiteralBug() const {
1055     return HasMFMAInlineLiteralBug;
1056   }
1057 
1058   bool has12DWordStoreHazard() const {
1059     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1060   }
1061 
1062   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1063   bool hasDwordx3LoadStores() const {
1064     return CIInsts;
1065   }
1066 
1067   bool hasReadM0MovRelInterpHazard() const {
1068     return getGeneration() == AMDGPUSubtarget::GFX9;
1069   }
1070 
1071   bool hasReadM0SendMsgHazard() const {
1072     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1073            getGeneration() <= AMDGPUSubtarget::GFX9;
1074   }
1075 
1076   bool hasReadM0LdsDmaHazard() const {
1077     return getGeneration() == AMDGPUSubtarget::GFX9;
1078   }
1079 
1080   bool hasReadM0LdsDirectHazard() const {
1081     return getGeneration() == AMDGPUSubtarget::GFX9;
1082   }
1083 
1084   bool hasVcmpxPermlaneHazard() const {
1085     return HasVcmpxPermlaneHazard;
1086   }
1087 
1088   bool hasVMEMtoScalarWriteHazard() const {
1089     return HasVMEMtoScalarWriteHazard;
1090   }
1091 
1092   bool hasSMEMtoVectorWriteHazard() const {
1093     return HasSMEMtoVectorWriteHazard;
1094   }
1095 
1096   bool hasLDSMisalignedBug() const {
1097     return LDSMisalignedBug && !EnableCuMode;
1098   }
1099 
1100   bool hasInstFwdPrefetchBug() const {
1101     return HasInstFwdPrefetchBug;
1102   }
1103 
1104   bool hasVcmpxExecWARHazard() const {
1105     return HasVcmpxExecWARHazard;
1106   }
1107 
1108   bool hasLdsBranchVmemWARHazard() const {
1109     return HasLdsBranchVmemWARHazard;
1110   }
1111 
1112   // Shift amount of a 64 bit shift cannot be a highest allocated register
1113   // if also at the end of the allocation block.
1114   bool hasShift64HighRegBug() const {
1115     return GFX90AInsts && !GFX940Insts;
1116   }
1117 
1118   // Has one cycle hazard on transcendental instruction feeding a
1119   // non transcendental VALU.
1120   bool hasTransForwardingHazard() const { return GFX940Insts; }
1121 
1122   // Has one cycle hazard on a VALU instruction partially writing dst with
1123   // a shift of result bits feeding another VALU instruction.
1124   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1125 
1126   // Cannot use op_sel with v_dot instructions.
1127   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1128 
1129   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1130   bool hasVDecCoExecHazard() const {
1131     return GFX940Insts;
1132   }
1133 
1134   bool hasNSAtoVMEMBug() const {
1135     return HasNSAtoVMEMBug;
1136   }
1137 
1138   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1139 
1140   bool hasHardClauses() const { return getGeneration() >= GFX10; }
1141 
1142   bool hasGFX90AInsts() const { return GFX90AInsts; }
1143 
1144   bool hasFPAtomicToDenormModeHazard() const {
1145     return getGeneration() == GFX10;
1146   }
1147 
1148   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1149 
1150   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1151 
1152   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1153 
1154   bool hasVALUPartialForwardingHazard() const {
1155     return getGeneration() == GFX11;
1156   }
1157 
1158   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1159 
1160   bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1161 
1162   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1163 
1164   /// Return if operations acting on VGPR tuples require even alignment.
1165   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1166 
1167   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1168   bool hasSPackHL() const { return GFX11Insts; }
1169 
1170   /// Return true if the target's EXP instruction has the COMPR flag, which
1171   /// affects the meaning of the EN (enable) bits.
1172   bool hasCompressedExport() const { return !GFX11Insts; }
1173 
1174   /// Return true if the target's EXP instruction supports the NULL export
1175   /// target.
1176   bool hasNullExportTarget() const { return !GFX11Insts; }
1177 
1178   bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1179 
1180   bool hasVOPDInsts() const { return HasVOPDInsts; }
1181 
1182   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1183 
1184   /// Return true if the target has the S_DELAY_ALU instruction.
1185   bool hasDelayAlu() const { return GFX11Insts; }
1186 
1187   bool hasPackedTID() const { return HasPackedTID; }
1188 
1189   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1190   // hasGFX90AInsts is also true.
1191   bool hasGFX940Insts() const { return GFX940Insts; }
1192 
1193   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1194 
1195   bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1196 
1197   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1198 
1199   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1200 
1201   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1202   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1203   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1204 
  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs.
  /// \param SGPRs number of SGPRs used by the kernel.
  /// Declared here; the definition is out of line.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1208 
  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs.
  /// \param VGPRs number of VGPRs used by the kernel.
  /// Declared here; the definition is out of line.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1212 
  /// Return occupancy for the given function \p F. Uses the LDS size
  /// (\p LDSSize) and the register counts (\p NumSGPRs, \p NumVGPRs) if
  /// provided; each of them defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1219 
1220   /// \returns true if the flat_scratch register should be initialized with the
1221   /// pointer to the wave's scratch memory rather than a size and offset.
1222   bool flatScratchIsPointer() const {
1223     return getGeneration() >= AMDGPUSubtarget::GFX9;
1224   }
1225 
1226   /// \returns true if the flat_scratch register is initialized by the HW.
1227   /// In this case it is readonly.
1228   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1229 
1230   /// \returns true if the architected SGPRs are enabled.
1231   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1232 
1233   /// \returns true if Global Data Share is supported.
1234   bool hasGDS() const { return HasGDS; }
1235 
1236   /// \returns true if Global Wave Sync is supported.
1237   bool hasGWS() const { return HasGWS; }
1238 
1239   /// \returns true if the machine has merged shaders in which s0-s7 are
1240   /// reserved by the hardware and user SGPRs start at s8
1241   bool hasMergedShaders() const {
1242     return getGeneration() >= GFX9;
1243   }
1244 
1245   // \returns true if the target supports the pre-NGG legacy geometry path.
1246   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1247 
1248   // \returns true if preloading kernel arguments is supported.
1249   bool hasKernargPreload() const { return KernargPreload; }
1250 
1251   // \returns true if we need to generate backwards compatible code when
1252   // preloading kernel arguments.
1253   bool needsKernargPreloadBackwardsCompatibility() const {
1254     return hasKernargPreload() && !hasGFX940Insts();
1255   }
1256 
1257   // \returns true if the target has split barriers feature
1258   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1259 
1260   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1261   bool hasCvtFP8VOP1Bug() const { return true; }
1262 
1263   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1264   // no-return form.
1265   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1266 
1267   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1268   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1269 
1270   // \returns true if the target has IEEE kernel descriptor mode bit
1271   bool hasIEEEMode() const { return getGeneration() < GFX12; }
1272 
1273   // \returns true if the target has IEEE fminimum/fmaximum instructions
1274   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1275 
1276   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1277   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1278 
1279   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1280   /// values.
1281   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1282 
1283   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1284   // of sign-extending.
1285   bool hasGetPCZeroExtension() const { return GFX12Insts; }
1286 
1287   /// \returns SGPR allocation granularity supported by the subtarget.
1288   unsigned getSGPRAllocGranule() const {
1289     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1290   }
1291 
1292   /// \returns SGPR encoding granularity supported by the subtarget.
1293   unsigned getSGPREncodingGranule() const {
1294     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1295   }
1296 
1297   /// \returns Total number of SGPRs supported by the subtarget.
1298   unsigned getTotalNumSGPRs() const {
1299     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1300   }
1301 
1302   /// \returns Addressable number of SGPRs supported by the subtarget.
1303   unsigned getAddressableNumSGPRs() const {
1304     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1305   }
1306 
1307   /// \returns Minimum number of SGPRs that meets the given number of waves per
1308   /// execution unit requirement supported by the subtarget.
1309   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1310     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1311   }
1312 
1313   /// \returns Maximum number of SGPRs that meets the given number of waves per
1314   /// execution unit requirement supported by the subtarget.
1315   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1316     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1317   }
1318 
  /// \returns Reserved number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  /// \param HasFlatScratch whether flat scratch is in use.
  /// NOTE(review): the exact effect of \p HasFlatScratch on the result is
  /// defined out of line -- confirm there.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns Reserved number of SGPRs for the given machine function \p MF.
  /// Declared here; the definition is out of line.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1325 
  /// \returns Reserved number of SGPRs for the given function \p F.
  /// Declared here; the definition is out of line.
  unsigned getReservedNumSGPRs(const Function &F) const;
1328 
  /// \returns max num SGPRs. This is the common utility function called by
  /// the MachineFunction and Function variants of getMaxNumSGPRs.
  /// \param F function being queried.
  /// \param WavesPerEU pair of waves-per-execution-unit values.
  /// NOTE(review): presumably the (minimum, maximum) bounds -- confirm
  /// against the definition.
  /// \param PreloadedSGPRs number of preloaded SGPRs.
  /// \param ReservedNumSGPRs number of reserved SGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1336 
  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// Falls back to the value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates subtarget's specifications, or does not meet number of
  /// waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1346 
  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// Falls back to the value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates subtarget's specifications, or does not meet number of
  /// waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1356 
1357   /// \returns VGPR allocation granularity supported by the subtarget.
1358   unsigned getVGPRAllocGranule() const {
1359     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1360   }
1361 
1362   /// \returns VGPR encoding granularity supported by the subtarget.
1363   unsigned getVGPREncodingGranule() const {
1364     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1365   }
1366 
1367   /// \returns Total number of VGPRs supported by the subtarget.
1368   unsigned getTotalNumVGPRs() const {
1369     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1370   }
1371 
1372   /// \returns Addressable number of VGPRs supported by the subtarget.
1373   unsigned getAddressableNumVGPRs() const {
1374     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1375   }
1376 
1377   /// \returns the minimum number of VGPRs that will prevent achieving more than
1378   /// the specified number of waves \p WavesPerEU.
1379   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1380     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1381   }
1382 
1383   /// \returns the maximum number of VGPRs that can be used and still achieved
1384   /// at least the specified number of waves \p WavesPerEU.
1385   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1386     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1387   }
1388 
  /// \returns max num VGPRs. This is the common utility function called by
  /// the MachineFunction and Function variants of getMaxNumVGPRs.
  /// \param F function being queried.
  /// \param WavesPerEU pair of waves-per-execution-unit values.
  /// NOTE(review): presumably the (minimum, maximum) bounds -- confirm
  /// against the definition.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// Falls back to the value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates subtarget's specifications, or does not meet number of
  /// waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
1402 
1403   unsigned getMaxNumAGPRs(const Function &F) const {
1404     return getMaxNumVGPRs(F);
1405   }
1406 
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// Falls back to the value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates subtarget's specifications, or does not meet number of
  /// waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1416 
  /// Override of the base-class hook; \p Mutations is taken by non-const
  /// reference. Defined out of line.
  /// NOTE(review): presumably appends this subtarget's post-RA scheduling DAG
  /// mutations to \p Mutations -- confirm in the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1420 
  /// \returns an owning pointer to a ScheduleDAGMutation created from \p TII.
  /// Defined out of line.
  /// NOTE(review): the precise meaning of "fill MFMA shadow" is not visible
  /// here -- confirm in the definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1423 
1424   bool isWave32() const {
1425     return getWavefrontSize() == 32;
1426   }
1427 
1428   bool isWave64() const {
1429     return getWavefrontSize() == 64;
1430   }
1431 
1432   const TargetRegisterClass *getBoolRC() const {
1433     return getRegisterInfo()->getBoolRC();
1434   }
1435 
1436   /// \returns Maximum number of work groups per compute unit supported by the
1437   /// subtarget and limited by given \p FlatWorkGroupSize.
1438   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1439     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1440   }
1441 
1442   /// \returns Minimum flat work group size supported by the subtarget.
1443   unsigned getMinFlatWorkGroupSize() const override {
1444     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1445   }
1446 
1447   /// \returns Maximum flat work group size supported by the subtarget.
1448   unsigned getMaxFlatWorkGroupSize() const override {
1449     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1450   }
1451 
1452   /// \returns Number of waves per execution unit required to support the given
1453   /// \p FlatWorkGroupSize.
1454   unsigned
1455   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1456     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1457   }
1458 
1459   /// \returns Minimum number of waves per execution unit supported by the
1460   /// subtarget.
1461   unsigned getMinWavesPerEU() const override {
1462     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1463   }
1464 
  /// Override of the base-class hook; defined out of line. Receives the
  /// dependency \p Dep between \p Def (operand index \p DefOpIdx) and \p Use
  /// (operand index \p UseOpIdx) by non-const reference.
  /// NOTE(review): presumably adjusts \p Dep for this subtarget -- confirm in
  /// the definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1467 
1468   // \returns true if it's beneficial on this subtarget for the scheduler to
1469   // cluster stores as well as loads.
1470   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1471 
  /// \returns the number of address arguments from which to enable MIMG NSA
  /// on supported architectures.
  /// \param MF machine function being queried. Defined out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
1475 
1476   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1477   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1478   bool requiresNopBeforeDeallocVGPRs() const {
1479     // Currently all targets that support the dealloc VGPRs message also require
1480     // the nop.
1481     return true;
1482   }
1483 };
1484 
1485 class GCNUserSGPRUsageInfo {
1486 public:
1487   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1488 
1489   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1490 
1491   bool hasDispatchPtr() const { return DispatchPtr; }
1492 
1493   bool hasQueuePtr() const { return QueuePtr; }
1494 
1495   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1496 
1497   bool hasDispatchID() const { return DispatchID; }
1498 
1499   bool hasFlatScratchInit() const { return FlatScratchInit; }
1500 
1501   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1502 
1503   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1504 
1505   unsigned getNumFreeUserSGPRs();
1506 
1507   void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1508 
1509   enum UserSGPRID : unsigned {
1510     ImplicitBufferPtrID = 0,
1511     PrivateSegmentBufferID = 1,
1512     DispatchPtrID = 2,
1513     QueuePtrID = 3,
1514     KernargSegmentPtrID = 4,
1515     DispatchIdID = 5,
1516     FlatScratchInitID = 6,
1517     PrivateSegmentSizeID = 7
1518   };
1519 
1520   // Returns the size in number of SGPRs for preload user SGPR field.
1521   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1522     switch (ID) {
1523     case ImplicitBufferPtrID:
1524       return 2;
1525     case PrivateSegmentBufferID:
1526       return 4;
1527     case DispatchPtrID:
1528       return 2;
1529     case QueuePtrID:
1530       return 2;
1531     case KernargSegmentPtrID:
1532       return 2;
1533     case DispatchIdID:
1534       return 2;
1535     case FlatScratchInitID:
1536       return 2;
1537     case PrivateSegmentSizeID:
1538       return 1;
1539     }
1540     llvm_unreachable("Unknown UserSGPRID.");
1541   }
1542 
1543   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1544 
1545 private:
1546   const GCNSubtarget &ST;
1547 
1548   // Private memory buffer
1549   // Compute directly in sgpr[0:1]
1550   // Other shaders indirect 64-bits at sgpr[0:1]
1551   bool ImplicitBufferPtr = false;
1552 
1553   bool PrivateSegmentBuffer = false;
1554 
1555   bool DispatchPtr = false;
1556 
1557   bool QueuePtr = false;
1558 
1559   bool KernargSegmentPtr = false;
1560 
1561   bool DispatchID = false;
1562 
1563   bool FlatScratchInit = false;
1564 
1565   unsigned NumKernargPreloadSGPRs = 0;
1566 
1567   unsigned NumUsedUserSGPRs = 0;
1568 };
1569 
1570 } // end namespace llvm
1571 
1572 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1573