xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 7ef62cebc2f965b0f640263e179276928885e33d)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
26 
27 namespace llvm {
28 
29 class GCNTargetMachine;
30 
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32                            public AMDGPUSubtarget {
33 public:
34   using AMDGPUSubtarget::getMaxWavesPerEU;
35 
36   // The following two enums are documented at:
37   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
38   enum class TrapHandlerAbi {
39     NONE   = 0x00, ///< Reported by getTrapHandlerAbi() when isAmdHsaOS() is false.
40     AMDHSA = 0x01, ///< Reported by getTrapHandlerAbi() when isAmdHsaOS() is true.
41   };
42 
43   enum class TrapID {
       // NOTE(review): the numeric values are presumably the trap IDs defined by
       // the trap-handler ABI document referenced above; confirm against it.
44     LLVMAMDHSATrap      = 0x02,
45     LLVMAMDHSADebugTrap = 0x03,
46   };
47 
48 private:
49   /// GlobalISel related APIs.
50   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
51   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
52   std::unique_ptr<InstructionSelector> InstSelector;
53   std::unique_ptr<LegalizerInfo> Legalizer;
54   std::unique_ptr<RegisterBankInfo> RegBankInfo;
55 
56 protected:
57   // Basic subtarget description.
58   Triple TargetTriple;
59   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
60   unsigned Gen = INVALID;
61   InstrItineraryData InstrItins;
62   int LDSBankCount = 0;
63   unsigned MaxPrivateElementSize = 0;
64 
65   // Possibly statically set by tablegen, but may want to be overridden.
66   bool FastFMAF32 = false;
67   bool FastDenormalF32 = false;
68   bool HalfRate64Ops = false;
69   bool FullRate64Ops = false;
70 
71   // Dynamically set bits that enable features.
72   bool FlatForGlobal = false;
73   bool AutoWaitcntBeforeBarrier = false;
74   bool BackOffBarrier = false;
75   bool UnalignedScratchAccess = false;
76   bool UnalignedAccessMode = false;
77   bool HasApertureRegs = false;
78   bool SupportsXNACK = false;
79 
80   // This should not be used directly. 'TargetID' tracks the dynamic settings
81   // for XNACK.
82   bool EnableXNACK = false;
83 
84   bool EnableTgSplit = false;
85   bool EnableCuMode = false;
86   bool TrapHandler = false;
87 
88   // Used as options.
89   bool EnableLoadStoreOpt = false;
90   bool EnableUnsafeDSOffsetFolding = false;
91   bool EnableSIScheduler = false;
92   bool EnableDS128 = false;
93   bool EnablePRTStrictNull = false;
94   bool DumpCode = false;
95 
96   // Subtarget properties statically set by tablegen
97   bool FP64 = false;
98   bool FMA = false;
99   bool MIMG_R128 = false;
100   bool CIInsts = false;
101   bool GFX8Insts = false;
102   bool GFX9Insts = false;
103   bool GFX90AInsts = false;
104   bool GFX940Insts = false;
105   bool GFX10Insts = false;
106   bool GFX11Insts = false;
107   bool GFX10_3Insts = false;
108   bool GFX7GFX8GFX9Insts = false;
109   bool SGPRInitBug = false;
110   bool UserSGPRInit16Bug = false;
111   bool NegativeScratchOffsetBug = false;
112   bool NegativeUnalignedScratchOffsetBug = false;
113   bool HasSMemRealTime = false;
114   bool HasIntClamp = false;
115   bool HasFmaMixInsts = false;
116   bool HasMovrel = false;
117   bool HasVGPRIndexMode = false;
118   bool HasScalarStores = false;
119   bool HasScalarAtomics = false;
120   bool HasSDWAOmod = false;
121   bool HasSDWAScalar = false;
122   bool HasSDWASdst = false;
123   bool HasSDWAMac = false;
124   bool HasSDWAOutModsVOPC = false;
125   bool HasDPP = false;
126   bool HasDPP8 = false;
127   bool Has64BitDPP = false;
128   bool HasPackedFP32Ops = false;
129   bool HasImageInsts = false;
130   bool HasExtendedImageInsts = false;
131   bool HasR128A16 = false;
132   bool HasA16 = false;
133   bool HasG16 = false;
134   bool HasNSAEncoding = false;
135   unsigned NSAMaxSize = 0;
136   bool GFX10_AEncoding = false;
137   bool GFX10_BEncoding = false;
138   bool HasDLInsts = false;
139   bool HasFmacF64Inst = false;
140   bool HasDot1Insts = false;
141   bool HasDot2Insts = false;
142   bool HasDot3Insts = false;
143   bool HasDot4Insts = false;
144   bool HasDot5Insts = false;
145   bool HasDot6Insts = false;
146   bool HasDot7Insts = false;
147   bool HasDot8Insts = false;
148   bool HasDot9Insts = false;
149   bool HasMAIInsts = false;
150   bool HasFP8Insts = false;
151   bool HasPkFmacF16Inst = false;
152   bool HasAtomicFaddRtnInsts = false;
153   bool HasAtomicFaddNoRtnInsts = false;
154   bool HasAtomicPkFaddNoRtnInsts = false;
155   bool HasFlatAtomicFaddF32Inst = false;
156   bool SupportsSRAMECC = false;
157 
158   // This should not be used directly. 'TargetID' tracks the dynamic settings
159   // for SRAMECC.
160   bool EnableSRAMECC = false;
161 
162   bool HasNoSdstCMPX = false;
163   bool HasVscnt = false;
164   bool HasGetWaveIdInst = false;
165   bool HasSMemTimeInst = false;
166   bool HasShaderCyclesRegister = false;
167   bool HasVOP3Literal = false;
168   bool HasNoDataDepHazard = false;
169   bool FlatAddressSpace = false;
170   bool FlatInstOffsets = false;
171   bool FlatGlobalInsts = false;
172   bool FlatScratchInsts = false;
173   bool ScalarFlatScratchInsts = false;
174   bool HasArchitectedFlatScratch = false;
175   bool EnableFlatScratch = false;
176   bool AddNoCarryInsts = false;
177   bool HasUnpackedD16VMem = false;
178   bool LDSMisalignedBug = false;
179   bool HasMFMAInlineLiteralBug = false;
180   bool UnalignedBufferAccess = false;
181   bool UnalignedDSAccess = false;
182   bool HasPackedTID = false;
183   bool ScalarizeGlobal = false;
184 
185   bool HasVcmpxPermlaneHazard = false;
186   bool HasVMEMtoScalarWriteHazard = false;
187   bool HasSMEMtoVectorWriteHazard = false;
188   bool HasInstFwdPrefetchBug = false;
189   bool HasVcmpxExecWARHazard = false;
190   bool HasLdsBranchVmemWARHazard = false;
191   bool HasNSAtoVMEMBug = false;
192   bool HasNSAClauseBug = false;
193   bool HasOffset3fBug = false;
194   bool HasFlatSegmentOffsetBug = false;
195   bool HasImageStoreD16Bug = false;
196   bool HasImageGather4D16Bug = false;
197   bool HasGFX11FullVGPRs = false;
198   bool HasMADIntraFwdBug = false;
199   bool HasVOPDInsts = false;
200   bool HasVALUTransUseHazard = false;
201 
202   // Dummy feature to use for assembler in tablegen.
203   bool FeatureDisable = false;
204 
205   SelectionDAGTargetInfo TSInfo;
206 private:
207   SIInstrInfo InstrInfo;
208   SITargetLowering TLInfo;
209   SIFrameLowering FrameLowering;
210 
211 public:
212   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
213                const GCNTargetMachine &TM);
214   ~GCNSubtarget() override;
215 
216   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
217                                                    StringRef GPU, StringRef FS);
218 
219   const SIInstrInfo *getInstrInfo() const override {
        // Non-owning pointer to the InstrInfo member embedded in this object.
220     return &InstrInfo;
221   }
222 
223   const SIFrameLowering *getFrameLowering() const override {
        // Non-owning pointer to the embedded FrameLowering member.
224     return &FrameLowering;
225   }
226 
227   const SITargetLowering *getTargetLowering() const override {
        // Non-owning pointer to the embedded TLInfo member.
228     return &TLInfo;
229   }
230 
231   const SIRegisterInfo *getRegisterInfo() const override {
        // Forwarded from InstrInfo.getRegisterInfo(); no separate member is
        // consulted here.
232     return &InstrInfo.getRegisterInfo();
233   }
234 
235   const CallLowering *getCallLowering() const override {
        // Raw, non-owning view of the CallLoweringInfo unique_ptr.
236     return CallLoweringInfo.get();
237   }
238 
239   const InlineAsmLowering *getInlineAsmLowering() const override {
        // Raw, non-owning view of the InlineAsmLoweringInfo unique_ptr.
240     return InlineAsmLoweringInfo.get();
241   }
242 
243   InstructionSelector *getInstructionSelector() const override {
        // Raw, non-owning view of the InstSelector unique_ptr.
244     return InstSelector.get();
245   }
246 
247   const LegalizerInfo *getLegalizerInfo() const override {
        // Raw, non-owning view of the Legalizer unique_ptr.
248     return Legalizer.get();
249   }
250 
251   const RegisterBankInfo *getRegBankInfo() const override {
        // Raw, non-owning view of the RegBankInfo unique_ptr.
252     return RegBankInfo.get();
253   }
254 
255   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
        // TargetID is the object that tracks the dynamic XNACK and SRAMECC
        // settings (the EnableXNACK / EnableSRAMECC members are not to be used
        // directly).
256     return TargetID;
257   }
258 
259   // Nothing target-specific is implemented; the plain SelectionDAGTargetInfo
      // member is returned just to prevent crashes on use.
260   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
261     return &TSInfo;
262   }
263 
264   const InstrItineraryData *getInstrItineraryData() const override {
        // Non-owning pointer to the InstrItins member.
265     return &InstrItins;
266   }
267 
268   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
269 
270   Generation getGeneration() const {
        // Gen is stored as a plain unsigned (default INVALID); convert it back to
        // the Generation enum for callers.
271     return (Generation)Gen;
272   }
273 
274   unsigned getMaxWaveScratchSize() const {
        // Largest value the size field can encode, scaled by the field's
        // granularity. The "* 4" factor converts the dword granularity to bytes.
275     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
276     if (getGeneration() < GFX11) {
277       // 13-bit field in units of 256-dword.
278       return (256 * 4) * ((1 << 13) - 1);
279     }
280     // 15-bit field in units of 64-dword.
281     return (64 * 4) * ((1 << 15) - 1);
282   }
283 
284   /// Return the number of high bits known to be zero for a frame index.
      /// Computed as the leading-zero count of getMaxWaveScratchSize() plus
      /// getWavefrontSizeLog2().
285   unsigned getKnownHighZeroBitsForFrameIndex() const {
286     return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
287   }
288 
289   int getLDSBankCount() const {
        // Mirrors the LDSBankCount member (0 until something sets it).
290     return LDSBankCount;
291   }
292 
293   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
        // MaxPrivateElementSize applies when the caller asks on behalf of a buffer
        // resource or when flat scratch is not enabled; otherwise the answer is
        // the fixed value 16.
294     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
295   }
296 
297   unsigned getConstantBusLimit(unsigned Opcode) const;
298 
299   /// Returns true if an instruction whose 16-bit result is returned in a
300   /// 32-bit register implicitly zeroes the high 16 bits, rather than
301   /// preserving the original value.
302   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
303 
304   bool supportsWGP() const { return getGeneration() >= GFX10; } // Generation is at least GFX10.
305 
306   bool hasIntClamp() const {
        // Mirrors the HasIntClamp feature flag.
307     return HasIntClamp;
308   }
309 
310   bool hasFP64() const {
        // Mirrors the FP64 feature flag.
311     return FP64;
312   }
313 
314   bool hasMIMG_R128() const {
        // Mirrors the MIMG_R128 feature flag.
315     return MIMG_R128;
316   }
317 
318   bool hasHWFP64() const {
        // Reads the same FP64 flag as hasFP64().
319     return FP64;
320   }
321 
322   bool hasFastFMAF32() const {
        // Mirrors the FastFMAF32 flag.
323     return FastFMAF32;
324   }
325 
326   bool hasHalfRate64Ops() const {
        // Mirrors the HalfRate64Ops flag.
327     return HalfRate64Ops;
328   }
329 
330   bool hasFullRate64Ops() const {
        // Mirrors the FullRate64Ops flag.
331     return FullRate64Ops;
332   }
333 
334   bool hasAddr64() const {
        // Only generations ordered before VOLCANIC_ISLANDS qualify.
335     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
336   }
337 
338   bool hasFlat() const {
        // Every generation ordered after SOUTHERN_ISLANDS qualifies.
339     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
340   }
341 
342   // Return true if the target only has the reverse operand versions of VALU
343   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
344   bool hasOnlyRevVALUShifts() const {
345     return getGeneration() >= VOLCANIC_ISLANDS;
346   }
347 
348   bool hasFractBug() const {
        // Limited to exactly the SOUTHERN_ISLANDS generation.
349     return getGeneration() == SOUTHERN_ISLANDS;
350   }
351 
352   bool hasBFE() const {
        // Unconditionally true for this subtarget class.
353     return true;
354   }
355 
356   bool hasBFI() const {
        // Unconditionally true for this subtarget class.
357     return true;
358   }
359 
360   bool hasBFM() const {
        // Tied to hasBFE(), which is currently always true.
361     return hasBFE();
362   }
363 
364   bool hasBCNT(unsigned Size) const {
        // Size is not consulted; the answer is always true.
365     return true;
366   }
367 
368   bool hasFFBL() const {
        // Unconditionally true for this subtarget class.
369     return true;
370   }
371 
372   bool hasFFBH() const {
        // Unconditionally true for this subtarget class.
373     return true;
374   }
375 
376   bool hasMed3_16() const {
        // Generation is at least GFX9.
377     return getGeneration() >= AMDGPUSubtarget::GFX9;
378   }
379 
380   bool hasMin3Max3_16() const {
        // Generation is at least GFX9.
381     return getGeneration() >= AMDGPUSubtarget::GFX9;
382   }
383 
384   bool hasFmaMixInsts() const {
        // Mirrors the HasFmaMixInsts feature flag.
385     return HasFmaMixInsts;
386   }
387 
388   bool hasCARRY() const {
        // Unconditionally true for this subtarget class.
389     return true;
390   }
391 
392   bool hasFMA() const {
        // Mirrors the FMA feature flag.
393     return FMA;
394   }
395 
396   bool hasSwap() const {
        // Keyed off the GFX9Insts feature flag.
397     return GFX9Insts;
398   }
399 
400   bool hasScalarPackInsts() const {
        // Keyed off the GFX9Insts feature flag.
401     return GFX9Insts;
402   }
403 
404   bool hasScalarMulHiInsts() const {
        // Keyed off the GFX9Insts feature flag.
405     return GFX9Insts;
406   }
407 
408   TrapHandlerAbi getTrapHandlerAbi() const {
        // The ABI is chosen purely from the OS check: AMDHSA when isAmdHsaOS()
        // holds, NONE otherwise.
409     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
410   }
411 
412   bool supportsGetDoorbellID() const {
413     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
414     return getGeneration() >= GFX9;
415   }
416 
417   /// True if the offset field of DS instructions works as expected. On SI, the
418   /// offset uses a 16-bit adder and does not always wrap properly.
419   bool hasUsableDSOffset() const {
        // Generation is at least SEA_ISLANDS.
420     return getGeneration() >= SEA_ISLANDS;
421   }
422 
423   bool unsafeDSOffsetFoldingEnabled() const {
        // Mirrors the EnableUnsafeDSOffsetFolding option.
424     return EnableUnsafeDSOffsetFolding;
425   }
426 
427   /// Condition output from div_scale is usable.
428   bool hasUsableDivScaleConditionOutput() const {
        // Every generation except SOUTHERN_ISLANDS.
429     return getGeneration() != SOUTHERN_ISLANDS;
430   }
431 
432   /// Extra wait hazard is needed in some cases before
433   /// s_cbranch_vccnz/s_cbranch_vccz.
434   bool hasReadVCCZBug() const {
        // Generations up to and including SEA_ISLANDS.
435     return getGeneration() <= SEA_ISLANDS;
436   }
437 
438   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
439   bool partialVCCWritesUpdateVCCZ() const {
        // Generation is at least GFX10.
440     return getGeneration() >= GFX10;
441   }
442 
443   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
444   /// was written by a VALU instruction.
445   bool hasSMRDReadVALUDefHazard() const {
        // Limited to exactly the SOUTHERN_ISLANDS generation.
446     return getGeneration() == SOUTHERN_ISLANDS;
447   }
448 
449   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
450   /// SGPR was written by a VALU Instruction.
451   bool hasVMEMReadSGPRVALUDefHazard() const {
        // Generation is at least VOLCANIC_ISLANDS.
452     return getGeneration() >= VOLCANIC_ISLANDS;
453   }
454 
455   bool hasRFEHazards() const {
        // Generation is at least VOLCANIC_ISLANDS.
456     return getGeneration() >= VOLCANIC_ISLANDS;
457   }
458 
459   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
460   unsigned getSetRegWaitStates() const {
        // 1 for generations up to and including SEA_ISLANDS, 2 for all others.
461     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
462   }
463 
464   bool dumpCode() const {
        // Mirrors the DumpCode option.
465     return DumpCode;
466   }
467 
468   /// Return the amount of LDS that can be used that will not restrict the
469   /// occupancy lower than WaveCount.
470   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
471                                            const Function &) const;
472 
473   bool supportsMinMaxDenormModes() const {
        // Generation is at least GFX9.
474     return getGeneration() >= AMDGPUSubtarget::GFX9;
475   }
476 
477   /// \returns If target supports S_DENORM_MODE.
478   bool hasDenormModeInst() const {
        // Generation is at least GFX10.
479     return getGeneration() >= AMDGPUSubtarget::GFX10;
480   }
481 
482   bool useFlatForGlobal() const {
        // Mirrors the dynamically set FlatForGlobal flag.
483     return FlatForGlobal;
484   }
485 
486   /// \returns If target supports ds_read/write_b128 and user enables generation
487   /// of ds_read/write_b128.
488   bool useDS128() const {
        // Needs both the CIInsts feature and the EnableDS128 option.
489     return CIInsts && EnableDS128;
490   }
491 
492   /// \return If target supports ds_read/write_b96/128.
493   bool hasDS96AndDS128() const {
        // Keyed off the CIInsts feature flag.
494     return CIInsts;
495   }
496 
497   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
498   bool haveRoundOpsF64() const {
        // Keyed off the CIInsts feature flag.
499     return CIInsts;
500   }
501 
502   /// \returns If MUBUF instructions always perform range checking, even for
503   /// buffer resources used for private memory access.
504   bool privateMemoryResourceIsRangeChecked() const {
        // Generations ordered before GFX9.
505     return getGeneration() < AMDGPUSubtarget::GFX9;
506   }
507 
508   /// \returns If target requires PRT Strict NULL support (zero result registers
509   /// for sparse texture support).
510   bool usePRTStrictNull() const {
        // Mirrors the EnablePRTStrictNull option.
511     return EnablePRTStrictNull;
512   }
513 
514   bool hasAutoWaitcntBeforeBarrier() const {
        // Mirrors the AutoWaitcntBeforeBarrier flag.
515     return AutoWaitcntBeforeBarrier;
516   }
517 
518   /// \returns true if the target supports backing off of s_barrier instructions
519   /// when an exception is raised.
520   bool supportsBackOffBarrier() const {
        // Mirrors the BackOffBarrier flag.
521     return BackOffBarrier;
522   }
523 
524   bool hasUnalignedBufferAccess() const {
        // Capability only: mirrors UnalignedBufferAccess.
525     return UnalignedBufferAccess;
526   }
527 
528   bool hasUnalignedBufferAccessEnabled() const {
        // Capability (UnalignedBufferAccess) and UnalignedAccessMode must both be
        // set.
529     return UnalignedBufferAccess && UnalignedAccessMode;
530   }
531 
532   bool hasUnalignedDSAccess() const {
        // Capability only: mirrors UnalignedDSAccess.
533     return UnalignedDSAccess;
534   }
535 
536   bool hasUnalignedDSAccessEnabled() const {
        // Capability (UnalignedDSAccess) and UnalignedAccessMode must both be set.
537     return UnalignedDSAccess && UnalignedAccessMode;
538   }
539 
540   bool hasUnalignedScratchAccess() const {
        // Mirrors the UnalignedScratchAccess flag.
541     return UnalignedScratchAccess;
542   }
543 
544   bool hasUnalignedAccessMode() const {
        // Mirrors the UnalignedAccessMode flag.
545     return UnalignedAccessMode;
546   }
547 
548   bool hasApertureRegs() const {
        // Mirrors the HasApertureRegs flag.
549     return HasApertureRegs;
550   }
551 
552   bool isTrapHandlerEnabled() const {
        // Mirrors the TrapHandler flag.
553     return TrapHandler;
554   }
555 
556   bool isXNACKEnabled() const {
        // Asked of TargetID rather than the EnableXNACK member, which is not to be
        // used directly.
557     return TargetID.isXnackOnOrAny();
558   }
559 
560   bool isTgSplitEnabled() const {
        // Mirrors the EnableTgSplit flag.
561     return EnableTgSplit;
562   }
563 
564   bool isCuModeEnabled() const {
        // Mirrors the EnableCuMode flag.
565     return EnableCuMode;
566   }
567 
568   bool hasFlatAddressSpace() const {
        // Mirrors the FlatAddressSpace feature flag.
569     return FlatAddressSpace;
570   }
571 
572   bool hasFlatScrRegister() const {
        // Tied to hasFlatAddressSpace().
573     return hasFlatAddressSpace();
574   }
575 
576   bool hasFlatInstOffsets() const {
        // Mirrors the FlatInstOffsets feature flag.
577     return FlatInstOffsets;
578   }
579 
580   bool hasFlatGlobalInsts() const {
        // Mirrors the FlatGlobalInsts feature flag.
581     return FlatGlobalInsts;
582   }
583 
584   bool hasFlatScratchInsts() const {
        // Mirrors the FlatScratchInsts feature flag.
585     return FlatScratchInsts;
586   }
587 
588   // Check if target supports ST addressing mode with FLAT scratch instructions.
589   // The ST addressing mode means no registers are used, either VGPR or SGPR,
590   // but only immediate offset is swizzled and added to the FLAT scratch base.
591   bool hasFlatScratchSTMode() const {
        // Needs flat scratch instructions plus either GFX10.3 or GFX940
        // instructions.
592     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
593   }
594 
595   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } // Either GFX940Insts or GFX11Insts.
596 
597   bool hasScalarFlatScratchInsts() const {
        // Mirrors the ScalarFlatScratchInsts feature flag.
598     return ScalarFlatScratchInsts;
599   }
600 
601   bool enableFlatScratch() const {
        // On whenever flat scratch is architected; otherwise it needs both the
        // EnableFlatScratch flag and flat scratch instruction support.
602     return flatScratchIsArchitected() ||
603            (EnableFlatScratch && hasFlatScratchInsts());
604   }
605 
606   bool hasGlobalAddTidInsts() const {
        // Keyed off the GFX10_BEncoding flag.
607     return GFX10_BEncoding;
608   }
609 
610   bool hasAtomicCSub() const {
        // Keyed off the GFX10_BEncoding flag.
611     return GFX10_BEncoding;
612   }
613 
614   bool hasMultiDwordFlatScratchAddressing() const {
        // Generation is at least GFX9.
615     return getGeneration() >= GFX9;
616   }
617 
618   bool hasFlatSegmentOffsetBug() const {
        // Mirrors the HasFlatSegmentOffsetBug flag.
619     return HasFlatSegmentOffsetBug;
620   }
621 
622   bool hasFlatLgkmVMemCountInOrder() const {
        // Generations ordered strictly after GFX9.
623     return getGeneration() > GFX9;
624   }
625 
626   bool hasD16LoadStore() const {
        // Generation is at least GFX9.
627     return getGeneration() >= GFX9;
628   }
629 
630   bool d16PreservesUnusedBits() const {
        // Needs D16 load/store support, and TargetID must not report SRAMECC as
        // on-or-any.
631     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
632   }
633 
634   bool hasD16Images() const {
        // Generation is at least VOLCANIC_ISLANDS.
635     return getGeneration() >= VOLCANIC_ISLANDS;
636   }
637 
638   /// Return if most LDS instructions have an m0 use that require m0 to be
639   /// initialized.
640   bool ldsRequiresM0Init() const {
        // Generations ordered before GFX9.
641     return getGeneration() < GFX9;
642   }
643 
644   // True if the hardware rewinds and replays GWS operations if a wave is
645   // preempted.
646   //
647   // If this is false, a GWS operation requires testing if a nack set the
648   // MEM_VIOL bit, and repeating if so.
649   bool hasGWSAutoReplay() const {
        // Generation is at least GFX9.
650     return getGeneration() >= GFX9;
651   }
652 
653   /// \returns if target has ds_gws_sema_release_all instruction.
654   bool hasGWSSemaReleaseAll() const {
        // Keyed off the CIInsts feature flag.
655     return CIInsts;
656   }
657 
658   /// \returns true if the target has integer add/sub instructions that do not
659   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
660   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
661   /// for saturation.
662   bool hasAddNoCarry() const {
        // Mirrors the AddNoCarryInsts feature flag.
663     return AddNoCarryInsts;
664   }
665 
666   bool hasUnpackedD16VMem() const {
        // Mirrors the HasUnpackedD16VMem feature flag.
667     return HasUnpackedD16VMem;
668   }
669 
670   // Covers VS/PS/CS graphics shaders
671   bool isMesaGfxShader(const Function &F) const {
        // True when the OS is Mesa3D and F's calling convention is classified as
        // a shader by AMDGPU::isShader().
672     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
673   }
674 
675   bool hasMad64_32() const {
        // Generation is at least SEA_ISLANDS.
676     return getGeneration() >= SEA_ISLANDS;
677   }
678 
679   bool hasSDWAOmod() const {
        // Mirrors the HasSDWAOmod feature flag.
680     return HasSDWAOmod;
681   }
682 
683   bool hasSDWAScalar() const {
        // Mirrors the HasSDWAScalar feature flag.
684     return HasSDWAScalar;
685   }
686 
687   bool hasSDWASdst() const {
        // Mirrors the HasSDWASdst feature flag.
688     return HasSDWASdst;
689   }
690 
691   bool hasSDWAMac() const {
        // Mirrors the HasSDWAMac feature flag.
692     return HasSDWAMac;
693   }
694 
695   bool hasSDWAOutModsVOPC() const {
        // Mirrors the HasSDWAOutModsVOPC feature flag.
696     return HasSDWAOutModsVOPC;
697   }
698 
699   bool hasDLInsts() const {
        // Mirrors the HasDLInsts feature flag.
700     return HasDLInsts;
701   }
702 
703   bool hasFmacF64Inst() const { return HasFmacF64Inst; } // Mirrors HasFmacF64Inst.
704 
705   bool hasDot1Insts() const {
        // Each hasDotNInsts() getter mirrors its own HasDotNInsts feature flag;
        // the nine flags are independent members.
706     return HasDot1Insts;
707   }
708 
709   bool hasDot2Insts() const {
710     return HasDot2Insts;
711   }
712 
713   bool hasDot3Insts() const {
714     return HasDot3Insts;
715   }
716 
717   bool hasDot4Insts() const {
718     return HasDot4Insts;
719   }
720 
721   bool hasDot5Insts() const {
722     return HasDot5Insts;
723   }
724 
725   bool hasDot6Insts() const {
726     return HasDot6Insts;
727   }
728 
729   bool hasDot7Insts() const {
730     return HasDot7Insts;
731   }
732 
733   bool hasDot8Insts() const {
734     return HasDot8Insts;
735   }
736 
737   bool hasDot9Insts() const {
738     return HasDot9Insts;
739   }
740 
741   bool hasMAIInsts() const {
        // Mirrors the HasMAIInsts feature flag.
742     return HasMAIInsts;
743   }
744 
745   bool hasFP8Insts() const {
        // Mirrors the HasFP8Insts feature flag.
746     return HasFP8Insts;
747   }
748 
749   bool hasPkFmacF16Inst() const {
        // Mirrors the HasPkFmacF16Inst feature flag.
750     return HasPkFmacF16Inst;
751   }
752 
753   bool hasAtomicFaddInsts() const {
        // True if either the returning or the non-returning variant is available.
754     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
755   }
756 
757   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } // Returning variant only.
758 
759   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } // Non-returning variant only.
760 
761   bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } // Mirrors HasAtomicPkFaddNoRtnInsts.
762 
763   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } // Mirrors HasFlatAtomicFaddF32Inst.
764 
765   bool hasNoSdstCMPX() const {
        // Mirrors the HasNoSdstCMPX feature flag.
766     return HasNoSdstCMPX;
767   }
768 
769   bool hasVscnt() const {
        // Mirrors the HasVscnt feature flag.
770     return HasVscnt;
771   }
772 
773   bool hasGetWaveIdInst() const {
        // Mirrors the HasGetWaveIdInst feature flag.
774     return HasGetWaveIdInst;
775   }
776 
777   bool hasSMemTimeInst() const {
        // Mirrors the HasSMemTimeInst feature flag.
778     return HasSMemTimeInst;
779   }
780 
781   bool hasShaderCyclesRegister() const {
        // Mirrors the HasShaderCyclesRegister feature flag.
782     return HasShaderCyclesRegister;
783   }
784 
785   bool hasVOP3Literal() const {
        // Mirrors the HasVOP3Literal feature flag.
786     return HasVOP3Literal;
787   }
788 
789   bool hasNoDataDepHazard() const {
        // Mirrors the HasNoDataDepHazard feature flag.
790     return HasNoDataDepHazard;
791   }
792 
793   bool vmemWriteNeedsExpWaitcnt() const {
        // Generations ordered before SEA_ISLANDS.
794     return getGeneration() < SEA_ISLANDS;
795   }
796 
797   bool hasInstPrefetch() const { return getGeneration() >= GFX10; } // Generation is at least GFX10.
798 
799   // Scratch is allocated in 256 dword per wave blocks for the entire
800   // wavefront. When viewed from the perspective of an arbitrary workitem, this
801   // is 4-byte aligned.
802   //
803   // Only 4-byte alignment is really needed to access anything. Transformations
804   // on the pointer value itself may rely on the alignment / known low bits of
805   // the pointer. Set this to something above the minimum to avoid needing
806   // dynamic realignment in common cases.
807   Align getStackAlignment() const { return Align(16); }
808 
809   bool enableMachineScheduler() const override {
        // Always enabled for this subtarget.
810     return true;
811   }
812 
813   bool useAA() const override;
814 
815   bool enableSubRegLiveness() const override {
        // Always enabled for this subtarget.
816     return true;
817   }
818 
819   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } // Overwrites the ScalarizeGlobal flag.
820   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } // Reads the ScalarizeGlobal flag.
821 
822   // static wrappers
823   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
824 
825   // XXX - Why is this here if it isn't in the default pass set?
826   bool enableEarlyIfConversion() const override {
        // Always enabled for this subtarget.
827     return true;
828   }
829 
830   void overrideSchedPolicy(MachineSchedPolicy &Policy,
831                            unsigned NumRegionInstrs) const override;
832 
833   unsigned getMaxNumUserSGPRs() const {
        // Fixed limit; it does not vary with generation or features.
834     return 16;
835   }
836 
837   bool hasSMemRealTime() const {
        // Mirrors the HasSMemRealTime feature flag.
838     return HasSMemRealTime;
839   }
840 
841   bool hasMovrel() const {
        // Mirrors the HasMovrel feature flag.
842     return HasMovrel;
843   }
844 
845   bool hasVGPRIndexMode() const {
        // Capability only: mirrors HasVGPRIndexMode. Whether the mode is actually
        // used is a separate question answered by useVGPRIndexMode().
846     return HasVGPRIndexMode;
847   }
848 
849   bool useVGPRIndexMode() const;
850 
851   bool hasScalarCompareEq64() const {
        // Generation is at least VOLCANIC_ISLANDS.
852     return getGeneration() >= VOLCANIC_ISLANDS;
853   }
854 
855   bool hasScalarStores() const {
        // Mirrors the HasScalarStores feature flag.
856     return HasScalarStores;
857   }
858 
859   bool hasScalarAtomics() const {
        // Mirrors the HasScalarAtomics feature flag.
860     return HasScalarAtomics;
861   }
862 
863   bool hasLDSFPAtomicAdd() const { return GFX8Insts; } // Keyed off the GFX8Insts feature flag.
864 
865   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
866   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
867 
868   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
869   bool hasPermLane64() const { return getGeneration() >= GFX11; }
870 
871   bool hasDPP() const {
        // Mirrors the HasDPP feature flag.
872     return HasDPP;
873   }
874 
875   bool hasDPPBroadcasts() const {
        // Needs DPP and a generation ordered before GFX10.
876     return HasDPP && getGeneration() < GFX10;
877   }
878 
879   bool hasDPPWavefrontShifts() const {
        // Needs DPP and a generation ordered before GFX10.
880     return HasDPP && getGeneration() < GFX10;
881   }
882 
883   bool hasDPP8() const {
        // Mirrors the HasDPP8 feature flag.
884     return HasDPP8;
885   }
886 
887   bool has64BitDPP() const {
        // Mirrors the Has64BitDPP feature flag.
888     return Has64BitDPP;
889   }
890 
891   bool hasPackedFP32Ops() const {
        // Mirrors the HasPackedFP32Ops feature flag.
892     return HasPackedFP32Ops;
893   }
894 
895   bool hasFmaakFmamkF32Insts() const {
        // Generation is at least GFX10, or the GFX940 instructions are present.
896     return getGeneration() >= GFX10 || hasGFX940Insts();
897   }
898 
899   bool hasImageInsts() const {
        // Mirrors the HasImageInsts feature flag.
900     return HasImageInsts;
901   }
902 
903   bool hasExtendedImageInsts() const {
        // Mirrors the HasExtendedImageInsts feature flag.
904     return HasExtendedImageInsts;
905   }
906 
907   bool hasR128A16() const {
        // Mirrors the HasR128A16 feature flag.
908     return HasR128A16;
909   }
910 
911   bool hasA16() const { return HasA16; } // Mirrors HasA16.
912 
913   bool hasG16() const { return HasG16; } // Mirrors HasG16.
914 
915   bool hasOffset3fBug() const {
        // Mirrors the HasOffset3fBug flag.
916     return HasOffset3fBug;
917   }
918 
919   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } // Mirrors HasImageStoreD16Bug.
920 
921   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } // Mirrors HasImageGather4D16Bug.
922 
923   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } // Mirrors HasMADIntraFwdBug.
924 
925   bool hasNSAEncoding() const { return HasNSAEncoding; } // Mirrors HasNSAEncoding.
926 
927   unsigned getNSAMaxSize() const { return NSAMaxSize; } // Mirrors NSAMaxSize (0 until set).
928 
929   bool hasGFX10_AEncoding() const {
        // Mirrors the GFX10_AEncoding feature flag.
930     return GFX10_AEncoding;
931   }
932 
933   bool hasGFX10_BEncoding() const {
        // Mirrors the GFX10_BEncoding feature flag.
934     return GFX10_BEncoding;
935   }
936 
937   bool hasGFX10_3Insts() const {
        // Mirrors the GFX10_3Insts feature flag.
938     return GFX10_3Insts;
939   }
940 
941   bool hasMadF16() const;
942 
943   bool hasMovB64() const { return GFX940Insts; } // Keyed off the GFX940Insts feature flag.
944 
945   bool hasLshlAddB64() const { return GFX940Insts; } // Keyed off the GFX940Insts feature flag.
946 
947   bool enableSIScheduler() const {
        // Mirrors the EnableSIScheduler option.
948     return EnableSIScheduler;
949   }
950 
951   bool loadStoreOptEnabled() const {
        // Mirrors the EnableLoadStoreOpt option.
952     return EnableLoadStoreOpt;
953   }
954 
955   bool hasSGPRInitBug() const {
        // Mirrors the SGPRInitBug flag.
956     return SGPRInitBug;
957   }
958 
959   bool hasUserSGPRInit16Bug() const {
        // The UserSGPRInit16Bug flag only counts when running in wave32.
960     return UserSGPRInit16Bug && isWave32();
961   }
962 
963   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } // Mirrors NegativeScratchOffsetBug.
964 
965   bool hasNegativeUnalignedScratchOffsetBug() const {
        // Mirrors the NegativeUnalignedScratchOffsetBug flag.
966     return NegativeUnalignedScratchOffsetBug;
967   }
968 
969   bool hasMFMAInlineLiteralBug() const {
        // Mirrors the HasMFMAInlineLiteralBug flag.
970     return HasMFMAInlineLiteralBug;
971   }
972 
973   bool has12DWordStoreHazard() const {
        // Every generation except SOUTHERN_ISLANDS.
974     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
975   }
976 
977   // \returns true if the subtarget supports DWORDX3 load/store instructions.
978   bool hasDwordx3LoadStores() const {
        // Keyed off the CIInsts feature flag.
979     return CIInsts;
980   }
981 
982   bool hasReadM0MovRelInterpHazard() const {
        // Limited to exactly the GFX9 generation.
983     return getGeneration() == AMDGPUSubtarget::GFX9;
984   }
985 
986   bool hasReadM0SendMsgHazard() const {
        // Inclusive generation range VOLCANIC_ISLANDS..GFX9.
987     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
988            getGeneration() <= AMDGPUSubtarget::GFX9;
989   }
990 
991   bool hasReadM0LdsDmaHazard() const {
        // Limited to exactly the GFX9 generation.
992     return getGeneration() == AMDGPUSubtarget::GFX9;
993   }
994 
995   bool hasReadM0LdsDirectHazard() const {
        // Limited to exactly the GFX9 generation.
996     return getGeneration() == AMDGPUSubtarget::GFX9;
997   }
998 
999   bool hasVcmpxPermlaneHazard() const {
         // Mirrors the HasVcmpxPermlaneHazard flag.
1000     return HasVcmpxPermlaneHazard;
1001   }
1002 
1003   bool hasVMEMtoScalarWriteHazard() const {
         // Mirrors the HasVMEMtoScalarWriteHazard flag.
1004     return HasVMEMtoScalarWriteHazard;
1005   }
1006 
1007   bool hasSMEMtoVectorWriteHazard() const {
         // Mirrors the HasSMEMtoVectorWriteHazard flag.
1008     return HasSMEMtoVectorWriteHazard;
1009   }
1010 
1011   bool hasLDSMisalignedBug() const {
         // The LDSMisalignedBug flag is ignored when CU mode is enabled.
1012     return LDSMisalignedBug && !EnableCuMode;
1013   }
1014 
1015   bool hasInstFwdPrefetchBug() const {
         // Mirrors the HasInstFwdPrefetchBug flag.
1016     return HasInstFwdPrefetchBug;
1017   }
1018 
1019   bool hasVcmpxExecWARHazard() const {
         // Mirrors the HasVcmpxExecWARHazard flag.
1020     return HasVcmpxExecWARHazard;
1021   }
1022 
1023   bool hasLdsBranchVmemWARHazard() const {
         // Mirrors the HasLdsBranchVmemWARHazard flag.
1024     return HasLdsBranchVmemWARHazard;
1025   }
1026 
1027   // Shift amount of a 64 bit shift cannot be the highest allocated register
1028   // if also at the end of the allocation block.
1029   bool hasShift64HighRegBug() const {
         // Applies when GFX90AInsts is set but GFX940Insts is not.
1030     return GFX90AInsts && !GFX940Insts;
1031   }
1032 
1033   // Has one cycle hazard on transcendental instruction feeding a
1034   // non transcendental VALU.
1035   bool hasTransForwardingHazard() const { return GFX940Insts; }
1036 
1037   // Has one cycle hazard on a VALU instruction partially writing dst with
1038   // a shift of result bits feeding another VALU instruction.
1039   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1040 
1041   // Cannot use op_sel with v_dot instructions.
1042   bool hasDOTOpSelHazard() const { return GFX940Insts; }
1043 
1044   // Does not have HW interlocks for VALU writing and then reading SGPRs.
1045   bool hasVDecCoExecHazard() const {
         // Keyed off the GFX940Insts feature flag.
1046     return GFX940Insts;
1047   }
1048 
1049   bool hasNSAtoVMEMBug() const {
         // Mirrors the HasNSAtoVMEMBug flag.
1050     return HasNSAtoVMEMBug;
1051   }
1052 
1053   bool hasNSAClauseBug() const { return HasNSAClauseBug; } // Mirrors HasNSAClauseBug.
1054 
1055   bool hasHardClauses() const { return getGeneration() >= GFX10; } // Generation is at least GFX10.
1056 
1057   bool hasGFX90AInsts() const { return GFX90AInsts; } // Mirrors GFX90AInsts.
1058 
1059   bool hasFPAtomicToDenormModeHazard() const {
         // Limited to exactly the GFX10 generation.
1060     return getGeneration() == GFX10;
1061   }
1062 
1063   bool hasVOP3DPP() const { return getGeneration() >= GFX11; } // Generation is at least GFX11.
1064 
1065   bool hasLdsDirect() const { return getGeneration() >= GFX11; } // Generation is at least GFX11.
1066 
1067   bool hasVALUPartialForwardingHazard() const {
         // Generation is at least GFX11.
1068     return getGeneration() >= GFX11;
1069   }
1070 
1071   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } // Mirrors HasVALUTransUseHazard.
1072 
1073   bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } // Generation is at least GFX11.
1074 
1075   /// Return if operations acting on VGPR tuples require even alignment.
1076   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1077 
1078   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1079   bool hasSPackHL() const { return GFX11Insts; }
1080 
1081   /// Return true if the target's EXP instruction has the COMPR flag, which
1082   /// affects the meaning of the EN (enable) bits.
1083   bool hasCompressedExport() const { return !GFX11Insts; }
1084 
1085   /// Return true if the target's EXP instruction supports the NULL export
1086   /// target.
       /// This is reported for subtargets that do not have GFX11Insts.
1087   bool hasNullExportTarget() const { return !GFX11Insts; }
1088 
1089   /// \returns the HasGFX11FullVGPRs subtarget flag.
       bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1090 
1091   /// \returns the HasVOPDInsts subtarget flag.
       bool hasVOPDInsts() const { return HasVOPDInsts; }
1092 
1093   /// \returns true only when the subtarget generation is exactly GFX11.
       bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1094 
1095   /// Return true if the target has the S_DELAY_ALU instruction.
       /// This is reported for subtargets that have GFX11Insts.
1096   bool hasDelayAlu() const { return GFX11Insts; }
1097 
1098   /// \returns the HasPackedTID subtarget flag.
       bool hasPackedTID() const { return HasPackedTID; }
1099 
1100   // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that
1101   // hasGFX90AInsts() is also true.
1102   bool hasGFX940Insts() const { return GFX940Insts; }
1103 
1104   /// \returns the maximum number of waves per SIMD for kernels using \p SGPRs
1105   /// SGPRs.
1106   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1107 
1108   /// \returns the maximum number of waves per SIMD for kernels using \p VGPRs
1109   /// VGPRs.
1110   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1111 
1112   /// Return occupancy for the given function \p F. The used LDS size
1113   /// (\p LDSSize) and the register counts (\p NumSGPRs, \p NumVGPRs) are
       /// taken into account if provided; each of them defaults to 0.
1114   /// Note, occupancy can be affected by the scratch allocation as well, but
1115   /// we do not have enough information to compute it.
1116   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1117                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1118 
1119   /// \returns true if the flat_scratch register is to be set up with the
1120   /// pointer to the wave's scratch memory, as opposed to a size and offset.
1121   /// This holds for generation GFX9 and every later generation. GFX9 is the
1122   /// AMDGPUSubtarget generation enumerator, spelled unqualified here.
1123   bool flatScratchIsPointer() const { return getGeneration() >= GFX9; }
1124 
1125   /// \returns true if the flat_scratch register is initialized by the HW.
1126   /// In this case it is read-only.
       /// Reflects the HasArchitectedFlatScratch subtarget flag.
1127   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1128 
1129   /// \returns true if the machine has merged shaders. With merged shaders,
1130   /// s0-s7 are reserved by the hardware and user SGPRs begin at s8.
1131   ///
1132   /// This is reported for generation GFX9 and every later generation.
1133   bool hasMergedShaders() const { return getGeneration() >= GFX9; }
1134 
1135   /// \returns true if the target supports the pre-NGG legacy geometry path.
       /// This is reported for generations before GFX11.
1136   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1137 
1138   /// \returns SGPR allocation granularity supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getSGPRAllocGranule() for this subtarget.
1139   unsigned getSGPRAllocGranule() const {
1140     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1141   }
1142 
1143   /// \returns SGPR encoding granularity supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getSGPREncodingGranule() for this subtarget.
1144   unsigned getSGPREncodingGranule() const {
1145     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1146   }
1147 
1148   /// \returns Total number of SGPRs supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getTotalNumSGPRs() for this subtarget.
1149   unsigned getTotalNumSGPRs() const {
1150     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1151   }
1152 
1153   /// \returns Addressable number of SGPRs supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getAddressableNumSGPRs() for this subtarget.
1154   unsigned getAddressableNumSGPRs() const {
1155     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1156   }
1157 
1158   /// \returns Minimum number of SGPRs that meets the given number of waves per
1159   /// execution unit requirement supported by the subtarget.
       /// \p WavesPerEU is passed unchanged to AMDGPU::IsaInfo::getMinNumSGPRs().
1160   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1161     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1162   }
1163 
1164   /// \returns Maximum number of SGPRs that meets the given number of waves per
1165   /// execution unit requirement supported by the subtarget.
       /// \p WavesPerEU and \p Addressable are passed unchanged to
       /// AMDGPU::IsaInfo::getMaxNumSGPRs(). NOTE(review): \p Addressable
       /// presumably limits the result to addressable SGPRs -- confirm there.
1166   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1167     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1168   }
1169 
1170   /// \returns Reserved number of SGPRs. This is the common
1171   /// utility function called by the MachineFunction and
1172   /// Function variants of getReservedNumSGPRs().
       /// NOTE(review): \p HasFlatScratch presumably tells whether flat scratch is
       /// in use by the function being queried -- confirm against the definition.
1173   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1174   /// \returns Reserved number of SGPRs for given machine function \p MF.
       /// This is the MachineFunction variant; a Function overload also exists.
1175   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1176 
1177   /// \returns Reserved number of SGPRs for given function \p F.
       /// This is the Function variant; a MachineFunction overload also exists.
1178   unsigned getReservedNumSGPRs(const Function &F) const;
1179 
1180   /// \returns the maximum number of SGPRs. This is the common utility
1181   /// function called by the MachineFunction and Function
1182   /// variants of getMaxNumSGPRs().
       /// NOTE(review): \p WavesPerEU is presumably a (min, max) pair, and
       /// \p PreloadedSGPRs / \p ReservedNumSGPRs presumably the counts the two
       /// callers compute for their function -- confirm against the definition.
1183   unsigned getBaseMaxNumSGPRs(const Function &F,
1184                               std::pair<unsigned, unsigned> WavesPerEU,
1185                               unsigned PreloadedSGPRs,
1186                               unsigned ReservedNumSGPRs) const;
1187 
1188   /// \returns the maximum number of SGPRs that meets the waves-per-execution-
1189   /// unit requirement for function \p MF, or the number of SGPRs explicitly
1190   /// requested via the "amdgpu-num-sgpr" attribute attached to \p MF.
1191   ///
1192   /// The explicitly requested value is ignored, and the value meeting the
1193   /// waves-per-execution-unit requirement is returned instead, if the request
1194   /// cannot be converted to an integer, violates the subtarget's
1195   /// specifications, or does not meet the waves-per-execution-unit requirement.
1196   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1197 
1198   /// \returns the maximum number of SGPRs that meets the waves-per-execution-
1199   /// unit requirement for function \p F, or the number of SGPRs explicitly
1200   /// requested via the "amdgpu-num-sgpr" attribute attached to \p F.
1201   ///
1202   /// The explicitly requested value is ignored, and the value meeting the
1203   /// waves-per-execution-unit requirement is returned instead, if the request
1204   /// cannot be converted to an integer, violates the subtarget's
1205   /// specifications, or does not meet the waves-per-execution-unit requirement.
1206   unsigned getMaxNumSGPRs(const Function &F) const;
1207 
1208   /// \returns VGPR allocation granularity supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getVGPRAllocGranule() for this subtarget.
1209   unsigned getVGPRAllocGranule() const {
1210     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1211   }
1212 
1213   /// \returns VGPR encoding granularity supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getVGPREncodingGranule() for this subtarget.
1214   unsigned getVGPREncodingGranule() const {
1215     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1216   }
1217 
1218   /// \returns Total number of VGPRs supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getTotalNumVGPRs() for this subtarget.
1219   unsigned getTotalNumVGPRs() const {
1220     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1221   }
1222 
1223   /// \returns Addressable number of VGPRs supported by the subtarget.
       /// Forwards to AMDGPU::IsaInfo::getAddressableNumVGPRs() for this subtarget.
1224   unsigned getAddressableNumVGPRs() const {
1225     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1226   }
1227 
1228   /// \returns the minimum number of VGPRs that will prevent achieving more than
1229   /// the specified number of waves \p WavesPerEU.
       /// Forwards to AMDGPU::IsaInfo::getMinNumVGPRs() for this subtarget.
1230   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1231     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1232   }
1233 
1234   /// \returns the maximum number of VGPRs that can be used and still achieve
1235   /// at least the specified number of waves \p WavesPerEU.
       /// Forwards to AMDGPU::IsaInfo::getMaxNumVGPRs() for this subtarget.
1236   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1237     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1238   }
1239 
1240   /// \returns the maximum number of VGPRs. This is the common utility function
1241   /// called by the MachineFunction and Function variants of getMaxNumVGPRs().
       /// NOTE(review): \p WavesPerEU is presumably a (min, max) pair -- confirm
       /// against the definition.
1242   unsigned getBaseMaxNumVGPRs(const Function &F,
1243                               std::pair<unsigned, unsigned> WavesPerEU) const;
1244   /// \returns the maximum number of VGPRs that meets the waves-per-execution-
1245   /// unit requirement for function \p F, or the number of VGPRs explicitly
1246   /// requested via the "amdgpu-num-vgpr" attribute attached to \p F.
1247   ///
1248   /// The explicitly requested value is ignored, and the value meeting the
1249   /// waves-per-execution-unit requirement is returned instead, if the request
1250   /// cannot be converted to an integer, violates the subtarget's
1251   /// specifications, or does not meet the waves-per-execution-unit requirement.
1252   unsigned getMaxNumVGPRs(const Function &F) const;
1253 
1254   /// \returns the maximum number of AGPRs for function \p F. The limit is
1255   /// taken to be the same value as getMaxNumVGPRs(F).
1256   unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
1257 
1258   /// \returns the maximum number of VGPRs that meets the waves-per-execution-
1259   /// unit requirement for function \p MF, or the number of VGPRs explicitly
1260   /// requested via the "amdgpu-num-vgpr" attribute attached to \p MF.
1261   ///
1262   /// The explicitly requested value is ignored, and the value meeting the
1263   /// waves-per-execution-unit requirement is returned instead, if the request
1264   /// cannot be converted to an integer, violates the subtarget's
1265   /// specifications, or does not meet the waves-per-execution-unit requirement.
1266   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1267 
1268   /// Override of a base-class virtual. \p Mutations is taken by non-const
       /// reference. NOTE(review): presumably the subtarget's post-RA scheduling
       /// DAG mutations are appended to it -- confirm against the definition.
       void getPostRAMutations(
1269       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1270       const override;
1271 
1272   /// \returns an owning pointer to a ScheduleDAGMutation created for \p TII.
       /// NOTE(review): going by the name, the mutation fills MFMA shadows --
       /// confirm against the definition.
       std::unique_ptr<ScheduleDAGMutation>
1273   createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1274 
1275   /// \returns true if the wavefront size of this subtarget, as reported by
1276   /// getWavefrontSize(), is 32.
1277   bool isWave32() const { return getWavefrontSize() == 32; }
1278 
1279   /// \returns true if the wavefront size of this subtarget, as reported by
1280   /// getWavefrontSize(), is 64.
1281   bool isWave64() const { return getWavefrontSize() == 64; }
1282 
1283   /// \returns the register class obtained from getRegisterInfo()->getBoolRC().
       const TargetRegisterClass *getBoolRC() const {
1284     return getRegisterInfo()->getBoolRC();
1285   }
1286 
1287   /// \returns Maximum number of work groups per compute unit supported by the
1288   /// subtarget and limited by given \p FlatWorkGroupSize.
       /// Override that forwards to AMDGPU::IsaInfo::getMaxWorkGroupsPerCU().
1289   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1290     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1291   }
1292 
1293   /// \returns Minimum flat work group size supported by the subtarget.
       /// Override that forwards to AMDGPU::IsaInfo::getMinFlatWorkGroupSize().
1294   unsigned getMinFlatWorkGroupSize() const override {
1295     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1296   }
1297 
1298   /// \returns Maximum flat work group size supported by the subtarget.
       /// Override that forwards to AMDGPU::IsaInfo::getMaxFlatWorkGroupSize().
1299   unsigned getMaxFlatWorkGroupSize() const override {
1300     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1301   }
1302 
1303   /// \returns Number of waves per execution unit required to support the given
1304   /// \p FlatWorkGroupSize.
       /// Override that forwards to AMDGPU::IsaInfo::getWavesPerEUForWorkGroup().
1305   unsigned
1306   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1307     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1308   }
1309 
1310   /// \returns Minimum number of waves per execution unit supported by the
1311   /// subtarget.
       /// Override that forwards to AMDGPU::IsaInfo::getMinWavesPerEU().
1312   unsigned getMinWavesPerEU() const override {
1313     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1314   }
1315 
1316   /// Override of a base-class virtual. \p Dep is taken by non-const
       /// reference. NOTE(review): presumably it is adjusted for the dependency
       /// between \p Def (operand \p DefOpIdx) and \p Use (operand \p UseOpIdx)
       /// -- confirm against the definition.
       void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1317                              SDep &Dep) const override;
1318 
1319   /// \returns true if it's beneficial on this subtarget for the scheduler to
1320   /// cluster stores as well as loads. This is reported for GFX11 and later.
1321   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1322 
1323   /// \returns the number of address arguments from which to enable MIMG NSA
1324   /// on supported architectures, for machine function \p MF.
1325   unsigned getNSAThreshold(const MachineFunction &MF) const;
1326 };
1327 
1328 } // end namespace llvm
1329 
1330 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1331