xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
26 
27 namespace llvm {
28 
29 class GCNTargetMachine;
30 
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32                            public AMDGPUSubtarget {
33 
34   using AMDGPUSubtarget::getMaxWavesPerEU;
35 
36 public:
37   // Following 2 enums are documented at:
38   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
39   enum class TrapHandlerAbi {
40     NONE   = 0x00,
41     AMDHSA = 0x01,
42   };
43 
44   enum class TrapID {
45     LLVMAMDHSATrap      = 0x02,
46     LLVMAMDHSADebugTrap = 0x03,
47   };
48 
49 private:
50   /// GlobalISel related APIs.
51   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
52   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
53   std::unique_ptr<InstructionSelector> InstSelector;
54   std::unique_ptr<LegalizerInfo> Legalizer;
55   std::unique_ptr<RegisterBankInfo> RegBankInfo;
56 
57 protected:
58   // Basic subtarget description.
59   Triple TargetTriple;
60   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
61   unsigned Gen = INVALID;
62   InstrItineraryData InstrItins;
63   int LDSBankCount = 0;
64   unsigned MaxPrivateElementSize = 0;
65 
66   // Possibly statically set by tablegen, but may want to be overridden.
67   bool FastFMAF32 = false;
68   bool FastDenormalF32 = false;
69   bool HalfRate64Ops = false;
70   bool FullRate64Ops = false;
71 
72   // Dynamically set bits that enable features.
73   bool FlatForGlobal = false;
74   bool AutoWaitcntBeforeBarrier = false;
75   bool UnalignedScratchAccess = false;
76   bool UnalignedAccessMode = false;
77   bool HasApertureRegs = false;
78   bool SupportsXNACK = false;
79 
80   // This should not be used directly. 'TargetID' tracks the dynamic settings
81   // for XNACK.
82   bool EnableXNACK = false;
83 
84   bool EnableTgSplit = false;
85   bool EnableCuMode = false;
86   bool TrapHandler = false;
87 
88   // Used as options.
89   bool EnableLoadStoreOpt = false;
90   bool EnableUnsafeDSOffsetFolding = false;
91   bool EnableSIScheduler = false;
92   bool EnableDS128 = false;
93   bool EnablePRTStrictNull = false;
94   bool DumpCode = false;
95 
  // Subtarget properties statically set by tablegen.
97   bool FP64 = false;
98   bool FMA = false;
99   bool MIMG_R128 = false;
100   bool CIInsts = false;
101   bool GFX8Insts = false;
102   bool GFX9Insts = false;
103   bool GFX90AInsts = false;
104   bool GFX940Insts = false;
105   bool GFX10Insts = false;
106   bool GFX11Insts = false;
107   bool GFX10_3Insts = false;
108   bool GFX7GFX8GFX9Insts = false;
109   bool SGPRInitBug = false;
110   bool UserSGPRInit16Bug = false;
111   bool NegativeScratchOffsetBug = false;
112   bool NegativeUnalignedScratchOffsetBug = false;
113   bool HasSMemRealTime = false;
114   bool HasIntClamp = false;
115   bool HasFmaMixInsts = false;
116   bool HasMovrel = false;
117   bool HasVGPRIndexMode = false;
118   bool HasScalarStores = false;
119   bool HasScalarAtomics = false;
120   bool HasSDWAOmod = false;
121   bool HasSDWAScalar = false;
122   bool HasSDWASdst = false;
123   bool HasSDWAMac = false;
124   bool HasSDWAOutModsVOPC = false;
125   bool HasDPP = false;
126   bool HasDPP8 = false;
127   bool Has64BitDPP = false;
128   bool HasPackedFP32Ops = false;
129   bool HasImageInsts = false;
130   bool HasExtendedImageInsts = false;
131   bool HasR128A16 = false;
132   bool HasGFX10A16 = false;
133   bool HasG16 = false;
134   bool HasNSAEncoding = false;
135   unsigned NSAMaxSize = 0;
136   bool GFX10_AEncoding = false;
137   bool GFX10_BEncoding = false;
138   bool HasDLInsts = false;
139   bool HasDot1Insts = false;
140   bool HasDot2Insts = false;
141   bool HasDot3Insts = false;
142   bool HasDot4Insts = false;
143   bool HasDot5Insts = false;
144   bool HasDot6Insts = false;
145   bool HasDot7Insts = false;
146   bool HasDot8Insts = false;
147   bool HasMAIInsts = false;
148   bool HasFP8Insts = false;
149   bool HasPkFmacF16Inst = false;
150   bool HasAtomicFaddRtnInsts = false;
151   bool HasAtomicFaddNoRtnInsts = false;
152   bool HasAtomicPkFaddNoRtnInsts = false;
153   bool SupportsSRAMECC = false;
154 
155   // This should not be used directly. 'TargetID' tracks the dynamic settings
156   // for SRAMECC.
157   bool EnableSRAMECC = false;
158 
159   bool HasNoSdstCMPX = false;
160   bool HasVscnt = false;
161   bool HasGetWaveIdInst = false;
162   bool HasSMemTimeInst = false;
163   bool HasShaderCyclesRegister = false;
164   bool HasVOP3Literal = false;
165   bool HasNoDataDepHazard = false;
166   bool FlatAddressSpace = false;
167   bool FlatInstOffsets = false;
168   bool FlatGlobalInsts = false;
169   bool FlatScratchInsts = false;
170   bool ScalarFlatScratchInsts = false;
171   bool HasArchitectedFlatScratch = false;
172   bool EnableFlatScratch = false;
173   bool AddNoCarryInsts = false;
174   bool HasUnpackedD16VMem = false;
175   bool LDSMisalignedBug = false;
176   bool HasMFMAInlineLiteralBug = false;
177   bool UnalignedBufferAccess = false;
178   bool UnalignedDSAccess = false;
179   bool HasPackedTID = false;
180   bool ScalarizeGlobal = false;
181 
182   bool HasVcmpxPermlaneHazard = false;
183   bool HasVMEMtoScalarWriteHazard = false;
184   bool HasSMEMtoVectorWriteHazard = false;
185   bool HasInstFwdPrefetchBug = false;
186   bool HasVcmpxExecWARHazard = false;
187   bool HasLdsBranchVmemWARHazard = false;
188   bool HasNSAtoVMEMBug = false;
189   bool HasNSAClauseBug = false;
190   bool HasOffset3fBug = false;
191   bool HasFlatSegmentOffsetBug = false;
192   bool HasImageStoreD16Bug = false;
193   bool HasImageGather4D16Bug = false;
194   bool HasVOPDInsts = false;
195 
196   // Dummy feature to use for assembler in tablegen.
197   bool FeatureDisable = false;
198 
199   SelectionDAGTargetInfo TSInfo;
200 private:
201   SIInstrInfo InstrInfo;
202   SITargetLowering TLInfo;
203   SIFrameLowering FrameLowering;
204 
205 public:
206   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
207                const GCNTargetMachine &TM);
208   ~GCNSubtarget() override;
209 
210   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
211                                                    StringRef GPU, StringRef FS);
212 
213   const SIInstrInfo *getInstrInfo() const override {
214     return &InstrInfo;
215   }
216 
217   const SIFrameLowering *getFrameLowering() const override {
218     return &FrameLowering;
219   }
220 
221   const SITargetLowering *getTargetLowering() const override {
222     return &TLInfo;
223   }
224 
225   const SIRegisterInfo *getRegisterInfo() const override {
226     return &InstrInfo.getRegisterInfo();
227   }
228 
229   const CallLowering *getCallLowering() const override {
230     return CallLoweringInfo.get();
231   }
232 
233   const InlineAsmLowering *getInlineAsmLowering() const override {
234     return InlineAsmLoweringInfo.get();
235   }
236 
237   InstructionSelector *getInstructionSelector() const override {
238     return InstSelector.get();
239   }
240 
241   const LegalizerInfo *getLegalizerInfo() const override {
242     return Legalizer.get();
243   }
244 
245   const RegisterBankInfo *getRegBankInfo() const override {
246     return RegBankInfo.get();
247   }
248 
249   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
250     return TargetID;
251   }
252 
253   // Nothing implemented, just prevent crashes on use.
254   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
255     return &TSInfo;
256   }
257 
258   const InstrItineraryData *getInstrItineraryData() const override {
259     return &InstrItins;
260   }
261 
262   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
263 
264   Generation getGeneration() const {
265     return (Generation)Gen;
266   }
267 
268   unsigned getMaxWaveScratchSize() const {
269     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
270     if (getGeneration() < GFX11) {
271       // 13-bit field in units of 256-dword.
272       return (256 * 4) * ((1 << 13) - 1);
273     }
274     // 15-bit field in units of 64-dword.
275     return (64 * 4) * ((1 << 15) - 1);
276   }
277 
278   /// Return the number of high bits known to be zero for a frame index.
279   unsigned getKnownHighZeroBitsForFrameIndex() const {
280     return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
281   }
282 
283   int getLDSBankCount() const {
284     return LDSBankCount;
285   }
286 
287   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
288     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
289   }
290 
291   unsigned getConstantBusLimit(unsigned Opcode) const;
292 
  /// \returns true if an instruction with opcode \p Opcode that produces a
  /// 16-bit result in a 32-bit register implicitly zeroes the high 16 bits of
  /// the destination, rather than preserving their original value.
296   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
297 
298   bool hasIntClamp() const {
299     return HasIntClamp;
300   }
301 
302   bool hasFP64() const {
303     return FP64;
304   }
305 
306   bool hasMIMG_R128() const {
307     return MIMG_R128;
308   }
309 
310   bool hasHWFP64() const {
311     return FP64;
312   }
313 
314   bool hasFastFMAF32() const {
315     return FastFMAF32;
316   }
317 
318   bool hasHalfRate64Ops() const {
319     return HalfRate64Ops;
320   }
321 
322   bool hasFullRate64Ops() const {
323     return FullRate64Ops;
324   }
325 
326   bool hasAddr64() const {
327     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
328   }
329 
330   bool hasFlat() const {
331     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
332   }
333 
334   // Return true if the target only has the reverse operand versions of VALU
335   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
336   bool hasOnlyRevVALUShifts() const {
337     return getGeneration() >= VOLCANIC_ISLANDS;
338   }
339 
340   bool hasFractBug() const {
341     return getGeneration() == SOUTHERN_ISLANDS;
342   }
343 
344   bool hasBFE() const {
345     return true;
346   }
347 
348   bool hasBFI() const {
349     return true;
350   }
351 
352   bool hasBFM() const {
353     return hasBFE();
354   }
355 
356   bool hasBCNT(unsigned Size) const {
357     return true;
358   }
359 
360   bool hasFFBL() const {
361     return true;
362   }
363 
364   bool hasFFBH() const {
365     return true;
366   }
367 
368   bool hasMed3_16() const {
369     return getGeneration() >= AMDGPUSubtarget::GFX9;
370   }
371 
372   bool hasMin3Max3_16() const {
373     return getGeneration() >= AMDGPUSubtarget::GFX9;
374   }
375 
376   bool hasFmaMixInsts() const {
377     return HasFmaMixInsts;
378   }
379 
380   bool hasCARRY() const {
381     return true;
382   }
383 
384   bool hasFMA() const {
385     return FMA;
386   }
387 
388   bool hasSwap() const {
389     return GFX9Insts;
390   }
391 
392   bool hasScalarPackInsts() const {
393     return GFX9Insts;
394   }
395 
396   bool hasScalarMulHiInsts() const {
397     return GFX9Insts;
398   }
399 
400   TrapHandlerAbi getTrapHandlerAbi() const {
401     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
402   }
403 
404   bool supportsGetDoorbellID() const {
405     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
406     return getGeneration() >= GFX9;
407   }
408 
409   /// True if the offset field of DS instructions works as expected. On SI, the
410   /// offset uses a 16-bit adder and does not always wrap properly.
411   bool hasUsableDSOffset() const {
412     return getGeneration() >= SEA_ISLANDS;
413   }
414 
415   bool unsafeDSOffsetFoldingEnabled() const {
416     return EnableUnsafeDSOffsetFolding;
417   }
418 
419   /// Condition output from div_scale is usable.
420   bool hasUsableDivScaleConditionOutput() const {
421     return getGeneration() != SOUTHERN_ISLANDS;
422   }
423 
424   /// Extra wait hazard is needed in some cases before
425   /// s_cbranch_vccnz/s_cbranch_vccz.
426   bool hasReadVCCZBug() const {
427     return getGeneration() <= SEA_ISLANDS;
428   }
429 
430   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
431   bool partialVCCWritesUpdateVCCZ() const {
432     return getGeneration() >= GFX10;
433   }
434 
435   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
436   /// was written by a VALU instruction.
437   bool hasSMRDReadVALUDefHazard() const {
438     return getGeneration() == SOUTHERN_ISLANDS;
439   }
440 
441   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
442   /// SGPR was written by a VALU Instruction.
443   bool hasVMEMReadSGPRVALUDefHazard() const {
444     return getGeneration() >= VOLCANIC_ISLANDS;
445   }
446 
447   bool hasRFEHazards() const {
448     return getGeneration() >= VOLCANIC_ISLANDS;
449   }
450 
451   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
452   unsigned getSetRegWaitStates() const {
453     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
454   }
455 
456   bool dumpCode() const {
457     return DumpCode;
458   }
459 
460   /// Return the amount of LDS that can be used that will not restrict the
461   /// occupancy lower than WaveCount.
462   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
463                                            const Function &) const;
464 
465   bool supportsMinMaxDenormModes() const {
466     return getGeneration() >= AMDGPUSubtarget::GFX9;
467   }
468 
469   /// \returns If target supports S_DENORM_MODE.
470   bool hasDenormModeInst() const {
471     return getGeneration() >= AMDGPUSubtarget::GFX10;
472   }
473 
474   bool useFlatForGlobal() const {
475     return FlatForGlobal;
476   }
477 
478   /// \returns If target supports ds_read/write_b128 and user enables generation
479   /// of ds_read/write_b128.
480   bool useDS128() const {
481     return CIInsts && EnableDS128;
482   }
483 
484   /// \return If target supports ds_read/write_b96/128.
485   bool hasDS96AndDS128() const {
486     return CIInsts;
487   }
488 
489   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
490   bool haveRoundOpsF64() const {
491     return CIInsts;
492   }
493 
494   /// \returns If MUBUF instructions always perform range checking, even for
495   /// buffer resources used for private memory access.
496   bool privateMemoryResourceIsRangeChecked() const {
497     return getGeneration() < AMDGPUSubtarget::GFX9;
498   }
499 
500   /// \returns If target requires PRT Struct NULL support (zero result registers
501   /// for sparse texture support).
502   bool usePRTStrictNull() const {
503     return EnablePRTStrictNull;
504   }
505 
506   bool hasAutoWaitcntBeforeBarrier() const {
507     return AutoWaitcntBeforeBarrier;
508   }
509 
510   bool hasUnalignedBufferAccess() const {
511     return UnalignedBufferAccess;
512   }
513 
514   bool hasUnalignedBufferAccessEnabled() const {
515     return UnalignedBufferAccess && UnalignedAccessMode;
516   }
517 
518   bool hasUnalignedDSAccess() const {
519     return UnalignedDSAccess;
520   }
521 
522   bool hasUnalignedDSAccessEnabled() const {
523     return UnalignedDSAccess && UnalignedAccessMode;
524   }
525 
526   bool hasUnalignedScratchAccess() const {
527     return UnalignedScratchAccess;
528   }
529 
530   bool hasUnalignedAccessMode() const {
531     return UnalignedAccessMode;
532   }
533 
534   bool hasApertureRegs() const {
535     return HasApertureRegs;
536   }
537 
538   bool isTrapHandlerEnabled() const {
539     return TrapHandler;
540   }
541 
542   bool isXNACKEnabled() const {
543     return TargetID.isXnackOnOrAny();
544   }
545 
546   bool isTgSplitEnabled() const {
547     return EnableTgSplit;
548   }
549 
550   bool isCuModeEnabled() const {
551     return EnableCuMode;
552   }
553 
554   bool hasFlatAddressSpace() const {
555     return FlatAddressSpace;
556   }
557 
558   bool hasFlatScrRegister() const {
559     return hasFlatAddressSpace();
560   }
561 
562   bool hasFlatInstOffsets() const {
563     return FlatInstOffsets;
564   }
565 
566   bool hasFlatGlobalInsts() const {
567     return FlatGlobalInsts;
568   }
569 
570   bool hasFlatScratchInsts() const {
571     return FlatScratchInsts;
572   }
573 
574   // Check if target supports ST addressing mode with FLAT scratch instructions.
575   // The ST addressing mode means no registers are used, either VGPR or SGPR,
576   // but only immediate offset is swizzled and added to the FLAT scratch base.
577   bool hasFlatScratchSTMode() const {
578     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
579   }
580 
581   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
582 
583   bool hasScalarFlatScratchInsts() const {
584     return ScalarFlatScratchInsts;
585   }
586 
587   bool enableFlatScratch() const {
588     return flatScratchIsArchitected() ||
589            (EnableFlatScratch && hasFlatScratchInsts());
590   }
591 
592   bool hasGlobalAddTidInsts() const {
593     return GFX10_BEncoding;
594   }
595 
596   bool hasAtomicCSub() const {
597     return GFX10_BEncoding;
598   }
599 
600   bool hasMultiDwordFlatScratchAddressing() const {
601     return getGeneration() >= GFX9;
602   }
603 
604   bool hasFlatSegmentOffsetBug() const {
605     return HasFlatSegmentOffsetBug;
606   }
607 
608   bool hasFlatLgkmVMemCountInOrder() const {
609     return getGeneration() > GFX9;
610   }
611 
612   bool hasD16LoadStore() const {
613     return getGeneration() >= GFX9;
614   }
615 
616   bool d16PreservesUnusedBits() const {
617     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
618   }
619 
620   bool hasD16Images() const {
621     return getGeneration() >= VOLCANIC_ISLANDS;
622   }
623 
624   /// Return if most LDS instructions have an m0 use that require m0 to be
625   /// initialized.
626   bool ldsRequiresM0Init() const {
627     return getGeneration() < GFX9;
628   }
629 
630   // True if the hardware rewinds and replays GWS operations if a wave is
631   // preempted.
632   //
633   // If this is false, a GWS operation requires testing if a nack set the
634   // MEM_VIOL bit, and repeating if so.
635   bool hasGWSAutoReplay() const {
636     return getGeneration() >= GFX9;
637   }
638 
639   /// \returns if target has ds_gws_sema_release_all instruction.
640   bool hasGWSSemaReleaseAll() const {
641     return CIInsts;
642   }
643 
644   /// \returns true if the target has integer add/sub instructions that do not
645   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
646   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
647   /// for saturation.
648   bool hasAddNoCarry() const {
649     return AddNoCarryInsts;
650   }
651 
652   bool hasUnpackedD16VMem() const {
653     return HasUnpackedD16VMem;
654   }
655 
656   // Covers VS/PS/CS graphics shaders
657   bool isMesaGfxShader(const Function &F) const {
658     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
659   }
660 
661   bool hasMad64_32() const {
662     return getGeneration() >= SEA_ISLANDS;
663   }
664 
665   bool hasSDWAOmod() const {
666     return HasSDWAOmod;
667   }
668 
669   bool hasSDWAScalar() const {
670     return HasSDWAScalar;
671   }
672 
673   bool hasSDWASdst() const {
674     return HasSDWASdst;
675   }
676 
677   bool hasSDWAMac() const {
678     return HasSDWAMac;
679   }
680 
681   bool hasSDWAOutModsVOPC() const {
682     return HasSDWAOutModsVOPC;
683   }
684 
685   bool hasDLInsts() const {
686     return HasDLInsts;
687   }
688 
689   bool hasDot1Insts() const {
690     return HasDot1Insts;
691   }
692 
693   bool hasDot2Insts() const {
694     return HasDot2Insts;
695   }
696 
697   bool hasDot3Insts() const {
698     return HasDot3Insts;
699   }
700 
701   bool hasDot4Insts() const {
702     return HasDot4Insts;
703   }
704 
705   bool hasDot5Insts() const {
706     return HasDot5Insts;
707   }
708 
709   bool hasDot6Insts() const {
710     return HasDot6Insts;
711   }
712 
713   bool hasDot7Insts() const {
714     return HasDot7Insts;
715   }
716 
717   bool hasDot8Insts() const {
718     return HasDot8Insts;
719   }
720 
721   bool hasMAIInsts() const {
722     return HasMAIInsts;
723   }
724 
725   bool hasFP8Insts() const {
726     return HasFP8Insts;
727   }
728 
729   bool hasPkFmacF16Inst() const {
730     return HasPkFmacF16Inst;
731   }
732 
733   bool hasAtomicFaddInsts() const {
734     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
735   }
736 
737   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
738 
739   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
740 
741   bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
742 
743   bool hasNoSdstCMPX() const {
744     return HasNoSdstCMPX;
745   }
746 
747   bool hasVscnt() const {
748     return HasVscnt;
749   }
750 
751   bool hasGetWaveIdInst() const {
752     return HasGetWaveIdInst;
753   }
754 
755   bool hasSMemTimeInst() const {
756     return HasSMemTimeInst;
757   }
758 
759   bool hasShaderCyclesRegister() const {
760     return HasShaderCyclesRegister;
761   }
762 
763   bool hasVOP3Literal() const {
764     return HasVOP3Literal;
765   }
766 
767   bool hasNoDataDepHazard() const {
768     return HasNoDataDepHazard;
769   }
770 
771   bool vmemWriteNeedsExpWaitcnt() const {
772     return getGeneration() < SEA_ISLANDS;
773   }
774 
775   // Scratch is allocated in 256 dword per wave blocks for the entire
776   // wavefront. When viewed from the perspective of an arbitrary workitem, this
777   // is 4-byte aligned.
778   //
779   // Only 4-byte alignment is really needed to access anything. Transformations
780   // on the pointer value itself may rely on the alignment / known low bits of
781   // the pointer. Set this to something above the minimum to avoid needing
782   // dynamic realignment in common cases.
783   Align getStackAlignment() const { return Align(16); }
784 
785   bool enableMachineScheduler() const override {
786     return true;
787   }
788 
789   bool useAA() const override;
790 
791   bool enableSubRegLiveness() const override {
792     return true;
793   }
794 
795   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
796   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
797 
798   // static wrappers
799   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
800 
801   // XXX - Why is this here if it isn't in the default pass set?
802   bool enableEarlyIfConversion() const override {
803     return true;
804   }
805 
806   void overrideSchedPolicy(MachineSchedPolicy &Policy,
807                            unsigned NumRegionInstrs) const override;
808 
809   unsigned getMaxNumUserSGPRs() const {
810     return 16;
811   }
812 
813   bool hasSMemRealTime() const {
814     return HasSMemRealTime;
815   }
816 
817   bool hasMovrel() const {
818     return HasMovrel;
819   }
820 
821   bool hasVGPRIndexMode() const {
822     return HasVGPRIndexMode;
823   }
824 
825   bool useVGPRIndexMode() const;
826 
827   bool hasScalarCompareEq64() const {
828     return getGeneration() >= VOLCANIC_ISLANDS;
829   }
830 
831   bool hasScalarStores() const {
832     return HasScalarStores;
833   }
834 
835   bool hasScalarAtomics() const {
836     return HasScalarAtomics;
837   }
838 
839   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
840 
841   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
842   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
843 
844   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
845   bool hasPermLane64() const { return getGeneration() >= GFX11; }
846 
847   bool hasDPP() const {
848     return HasDPP;
849   }
850 
851   bool hasDPPBroadcasts() const {
852     return HasDPP && getGeneration() < GFX10;
853   }
854 
855   bool hasDPPWavefrontShifts() const {
856     return HasDPP && getGeneration() < GFX10;
857   }
858 
859   bool hasDPP8() const {
860     return HasDPP8;
861   }
862 
863   bool has64BitDPP() const {
864     return Has64BitDPP;
865   }
866 
867   bool hasPackedFP32Ops() const {
868     return HasPackedFP32Ops;
869   }
870 
871   bool hasFmaakFmamkF32Insts() const {
872     return getGeneration() >= GFX10 || hasGFX940Insts();
873   }
874 
875   bool hasImageInsts() const {
876     return HasImageInsts;
877   }
878 
879   bool hasExtendedImageInsts() const {
880     return HasExtendedImageInsts;
881   }
882 
883   bool hasR128A16() const {
884     return HasR128A16;
885   }
886 
887   bool hasGFX10A16() const {
888     return HasGFX10A16;
889   }
890 
891   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
892 
893   bool hasG16() const { return HasG16; }
894 
895   bool hasOffset3fBug() const {
896     return HasOffset3fBug;
897   }
898 
899   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
900 
901   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
902 
903   bool hasNSAEncoding() const { return HasNSAEncoding; }
904 
905   unsigned getNSAMaxSize() const { return NSAMaxSize; }
906 
907   bool hasGFX10_AEncoding() const {
908     return GFX10_AEncoding;
909   }
910 
911   bool hasGFX10_BEncoding() const {
912     return GFX10_BEncoding;
913   }
914 
915   bool hasGFX10_3Insts() const {
916     return GFX10_3Insts;
917   }
918 
919   bool hasMadF16() const;
920 
921   bool hasMovB64() const { return GFX940Insts; }
922 
923   bool hasLshlAddB64() const { return GFX940Insts; }
924 
925   bool enableSIScheduler() const {
926     return EnableSIScheduler;
927   }
928 
929   bool loadStoreOptEnabled() const {
930     return EnableLoadStoreOpt;
931   }
932 
933   bool hasSGPRInitBug() const {
934     return SGPRInitBug;
935   }
936 
937   bool hasUserSGPRInit16Bug() const {
938     return UserSGPRInit16Bug && isWave32();
939   }
940 
941   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
942 
943   bool hasNegativeUnalignedScratchOffsetBug() const {
944     return NegativeUnalignedScratchOffsetBug;
945   }
946 
947   bool hasMFMAInlineLiteralBug() const {
948     return HasMFMAInlineLiteralBug;
949   }
950 
951   bool has12DWordStoreHazard() const {
952     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
953   }
954 
955   // \returns true if the subtarget supports DWORDX3 load/store instructions.
956   bool hasDwordx3LoadStores() const {
957     return CIInsts;
958   }
959 
960   bool hasReadM0MovRelInterpHazard() const {
961     return getGeneration() == AMDGPUSubtarget::GFX9;
962   }
963 
964   bool hasReadM0SendMsgHazard() const {
965     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
966            getGeneration() <= AMDGPUSubtarget::GFX9;
967   }
968 
969   bool hasReadM0LdsDmaHazard() const {
970     return getGeneration() == AMDGPUSubtarget::GFX9;
971   }
972 
973   bool hasReadM0LdsDirectHazard() const {
974     return getGeneration() == AMDGPUSubtarget::GFX9;
975   }
976 
977   bool hasVcmpxPermlaneHazard() const {
978     return HasVcmpxPermlaneHazard;
979   }
980 
981   bool hasVMEMtoScalarWriteHazard() const {
982     return HasVMEMtoScalarWriteHazard;
983   }
984 
985   bool hasSMEMtoVectorWriteHazard() const {
986     return HasSMEMtoVectorWriteHazard;
987   }
988 
989   bool hasLDSMisalignedBug() const {
990     return LDSMisalignedBug && !EnableCuMode;
991   }
992 
993   bool hasInstFwdPrefetchBug() const {
994     return HasInstFwdPrefetchBug;
995   }
996 
997   bool hasVcmpxExecWARHazard() const {
998     return HasVcmpxExecWARHazard;
999   }
1000 
1001   bool hasLdsBranchVmemWARHazard() const {
1002     return HasLdsBranchVmemWARHazard;
1003   }
1004 
1005   // Has one cycle hazard on transcendental instruction feeding a
1006   // non transcendental VALU.
1007   bool hasTransForwardingHazard() const { return GFX940Insts; }
1008 
1009   // Has one cycle hazard on a VALU instruction partially writing dst with
1010   // a shift of result bits feeding another VALU instruction.
1011   bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1012 
1013   // Cannot use op_sel with v_dot instructions.
1014   bool hasDOTOpSelHazard() const { return GFX940Insts; }
1015 
1016   // Does not have HW interlocs for VALU writing and then reading SGPRs.
1017   bool hasVDecCoExecHazard() const {
1018     return GFX940Insts;
1019   }
1020 
1021   bool hasNSAtoVMEMBug() const {
1022     return HasNSAtoVMEMBug;
1023   }
1024 
1025   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1026 
1027   bool hasHardClauses() const { return getGeneration() >= GFX10; }
1028 
1029   bool hasGFX90AInsts() const { return GFX90AInsts; }
1030 
1031   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1032 
1033   bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1034 
1035   bool hasVALUPartialForwardingHazard() const {
1036     return getGeneration() >= GFX11;
1037   }
1038 
1039   bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
1040 
1041   /// Return if operations acting on VGPR tuples require even alignment.
1042   bool needsAlignedVGPRs() const { return GFX90AInsts; }
1043 
1044   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1045   bool hasSPackHL() const { return GFX11Insts; }
1046 
1047   /// Return true if the target's EXP instruction has the COMPR flag, which
1048   /// affects the meaning of the EN (enable) bits.
1049   bool hasCompressedExport() const { return !GFX11Insts; }
1050 
1051   /// Return true if the target's EXP instruction supports the NULL export
1052   /// target.
1053   bool hasNullExportTarget() const { return !GFX11Insts; }
1054 
1055   bool hasVOPDInsts() const { return HasVOPDInsts; }
1056 
1057   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1058 
1059   /// Return true if the target has the S_DELAY_ALU instruction.
1060   bool hasDelayAlu() const { return GFX11Insts; }
1061 
1062   bool hasPackedTID() const { return HasPackedTID; }
1063 
1064   // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1065   // hasGFX90AInsts is also true.
1066   bool hasGFX940Insts() const { return GFX940Insts; }
1067 
1068   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1069   /// SGPRs
1070   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1071 
1072   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1073   /// VGPRs
1074   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1075 
  /// Return the occupancy for the given function. The LDS size and the
  /// numbers of SGPRs and VGPRs are taken into account if provided.
  /// Note that occupancy can be affected by the scratch allocation as well,
  /// but we do not have enough information to compute it.
1080   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1081                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1082 
1083   /// \returns true if the flat_scratch register should be initialized with the
1084   /// pointer to the wave's scratch memory rather than a size and offset.
1085   bool flatScratchIsPointer() const {
1086     return getGeneration() >= AMDGPUSubtarget::GFX9;
1087   }
1088 
1089   /// \returns true if the flat_scratch register is initialized by the HW.
1090   /// In this case it is readonly.
1091   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1092 
1093   /// \returns true if the machine has merged shaders in which s0-s7 are
1094   /// reserved by the hardware and user SGPRs start at s8
1095   bool hasMergedShaders() const {
1096     return getGeneration() >= GFX9;
1097   }
1098 
1099   // \returns true if the target supports the pre-NGG legacy geometry path.
1100   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1101 
1102   /// \returns SGPR allocation granularity supported by the subtarget.
1103   unsigned getSGPRAllocGranule() const {
1104     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1105   }
1106 
1107   /// \returns SGPR encoding granularity supported by the subtarget.
1108   unsigned getSGPREncodingGranule() const {
1109     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1110   }
1111 
1112   /// \returns Total number of SGPRs supported by the subtarget.
1113   unsigned getTotalNumSGPRs() const {
1114     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1115   }
1116 
1117   /// \returns Addressable number of SGPRs supported by the subtarget.
1118   unsigned getAddressableNumSGPRs() const {
1119     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1120   }
1121 
1122   /// \returns Minimum number of SGPRs that meets the given number of waves per
1123   /// execution unit requirement supported by the subtarget.
1124   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1125     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1126   }
1127 
1128   /// \returns Maximum number of SGPRs that meets the given number of waves per
1129   /// execution unit requirement supported by the subtarget.
1130   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1131     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1132   }
1133 
  /// \returns the reserved number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs. \p HasFlatScratch is supplied by those callers.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns the reserved number of SGPRs for the given machine function
  /// \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1140 
  /// \returns the reserved number of SGPRs for the given IR function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;
1143 
  /// \returns the maximum number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumSGPRs,
  /// which supply the waves-per-EU pair \p WavesPerEU, the preloaded SGPR
  /// count \p PreloadedSGPRs and the reserved SGPR count \p ReservedNumSGPRs
  /// for function \p F.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1151 
  /// \returns the maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// If the explicitly requested value cannot be converted to an integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement, the value that does meet the waves
  /// per execution unit requirement is returned instead.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1161 
  /// \returns the maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p F.
  ///
  /// If the explicitly requested value cannot be converted to an integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement, the value that does meet the waves
  /// per execution unit requirement is returned instead.
  unsigned getMaxNumSGPRs(const Function &F) const;
1171 
1172   /// \returns VGPR allocation granularity supported by the subtarget.
1173   unsigned getVGPRAllocGranule() const {
1174     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1175   }
1176 
1177   /// \returns VGPR encoding granularity supported by the subtarget.
1178   unsigned getVGPREncodingGranule() const {
1179     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1180   }
1181 
1182   /// \returns Total number of VGPRs supported by the subtarget.
1183   unsigned getTotalNumVGPRs() const {
1184     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1185   }
1186 
1187   /// \returns Addressable number of VGPRs supported by the subtarget.
1188   unsigned getAddressableNumVGPRs() const {
1189     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1190   }
1191 
1192   /// \returns Minimum number of VGPRs that meets given number of waves per
1193   /// execution unit requirement supported by the subtarget.
1194   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1195     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1196   }
1197 
1198   /// \returns Maximum number of VGPRs that meets given number of waves per
1199   /// execution unit requirement supported by the subtarget.
1200   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1201     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1202   }
1203 
  /// \returns the maximum number of VGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumVGPRs,
  /// which supply the waves-per-EU pair \p WavesPerEU for function \p F.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns the maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// If the explicitly requested value cannot be converted to an integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement, the value that does meet the waves
  /// per execution unit requirement is returned instead.
  unsigned getMaxNumVGPRs(const Function &F) const;
1217 
1218   unsigned getMaxNumAGPRs(const Function &F) const {
1219     return getMaxNumVGPRs(F);
1220   }
1221 
  /// \returns the maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// If the explicitly requested value cannot be converted to an integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement, the value that does meet the waves
  /// per execution unit requirement is returned instead.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1231 
  /// Override of the base-class hook for post-RA scheduling DAG mutations.
  /// \p Mutations is taken by non-const reference, so the out-of-line
  /// definition can modify the list.
  /// NOTE(review): presumably it adds the target-specific mutations; confirm
  /// against the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1235 
  /// \returns a ScheduleDAGMutation whose ownership passes to the caller,
  /// created from the instruction info \p TII.
  /// NOTE(review): the name suggests the mutation fills the MFMA shadow;
  /// confirm the exact behavior against the out-of-line definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1238 
1239   bool isWave32() const {
1240     return getWavefrontSize() == 32;
1241   }
1242 
1243   bool isWave64() const {
1244     return getWavefrontSize() == 64;
1245   }
1246 
1247   const TargetRegisterClass *getBoolRC() const {
1248     return getRegisterInfo()->getBoolRC();
1249   }
1250 
1251   /// \returns Maximum number of work groups per compute unit supported by the
1252   /// subtarget and limited by given \p FlatWorkGroupSize.
1253   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1254     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1255   }
1256 
1257   /// \returns Minimum flat work group size supported by the subtarget.
1258   unsigned getMinFlatWorkGroupSize() const override {
1259     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1260   }
1261 
1262   /// \returns Maximum flat work group size supported by the subtarget.
1263   unsigned getMaxFlatWorkGroupSize() const override {
1264     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1265   }
1266 
1267   /// \returns Number of waves per execution unit required to support the given
1268   /// \p FlatWorkGroupSize.
1269   unsigned
1270   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1271     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1272   }
1273 
1274   /// \returns Minimum number of waves per execution unit supported by the
1275   /// subtarget.
1276   unsigned getMinWavesPerEU() const override {
1277     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1278   }
1279 
  /// Override of the scheduler hook for adjusting a dependency edge.
  /// \p Dep describes the dependency between \p Def (operand index
  /// \p DefOpIdx) and \p Use (operand index \p UseOpIdx); it is taken by
  /// non-const reference, so the out-of-line definition can modify it.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1282 
1283   // \returns true if it's beneficial on this subtarget for the scheduler to
1284   // cluster stores as well as loads.
1285   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1286 };
1287 
1288 } // end namespace llvm
1289 
1290 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1291