xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
26 
27 namespace llvm {
28 
29 class GCNTargetMachine;
30 
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32                            public AMDGPUSubtarget {
33 
34   using AMDGPUSubtarget::getMaxWavesPerEU;
35 
36 public:
37   // Following 2 enums are documented at:
38   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
39   enum class TrapHandlerAbi {
40     NONE   = 0x00,
41     AMDHSA = 0x01,
42   };
43 
44   enum class TrapID {
45     LLVMAMDHSATrap      = 0x02,
46     LLVMAMDHSADebugTrap = 0x03,
47   };
48 
49 private:
50   /// GlobalISel related APIs.
51   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
52   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
53   std::unique_ptr<InstructionSelector> InstSelector;
54   std::unique_ptr<LegalizerInfo> Legalizer;
55   std::unique_ptr<RegisterBankInfo> RegBankInfo;
56 
57 protected:
58   // Basic subtarget description.
59   Triple TargetTriple;
60   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
61   unsigned Gen;
62   InstrItineraryData InstrItins;
63   int LDSBankCount;
64   unsigned MaxPrivateElementSize;
65 
66   // Possibly statically set by tablegen, but may want to be overridden.
67   bool FastFMAF32;
68   bool FastDenormalF32;
69   bool HalfRate64Ops;
70   bool FullRate64Ops;
71 
72   // Dynamically set bits that enable features.
73   bool FlatForGlobal;
74   bool AutoWaitcntBeforeBarrier;
75   bool UnalignedScratchAccess;
76   bool UnalignedAccessMode;
77   bool HasApertureRegs;
78   bool SupportsXNACK;
79 
80   // This should not be used directly. 'TargetID' tracks the dynamic settings
81   // for XNACK.
82   bool EnableXNACK;
83 
84   bool EnableTgSplit;
85   bool EnableCuMode;
86   bool TrapHandler;
87 
88   // Used as options.
89   bool EnableLoadStoreOpt;
90   bool EnableUnsafeDSOffsetFolding;
91   bool EnableSIScheduler;
92   bool EnableDS128;
93   bool EnablePRTStrictNull;
94   bool DumpCode;
95 
  // Subtarget properties statically set by tablegen.
97   bool FP64;
98   bool FMA;
99   bool MIMG_R128;
100   bool CIInsts;
101   bool GFX8Insts;
102   bool GFX9Insts;
103   bool GFX90AInsts;
104   bool GFX10Insts;
105   bool GFX10_3Insts;
106   bool GFX7GFX8GFX9Insts;
107   bool SGPRInitBug;
108   bool NegativeScratchOffsetBug;
109   bool NegativeUnalignedScratchOffsetBug;
110   bool HasSMemRealTime;
111   bool HasIntClamp;
112   bool HasFmaMixInsts;
113   bool HasMovrel;
114   bool HasVGPRIndexMode;
115   bool HasScalarStores;
116   bool HasScalarAtomics;
117   bool HasSDWAOmod;
118   bool HasSDWAScalar;
119   bool HasSDWASdst;
120   bool HasSDWAMac;
121   bool HasSDWAOutModsVOPC;
122   bool HasDPP;
123   bool HasDPP8;
124   bool Has64BitDPP;
125   bool HasPackedFP32Ops;
126   bool HasExtendedImageInsts;
127   bool HasR128A16;
128   bool HasGFX10A16;
129   bool HasG16;
130   bool HasNSAEncoding;
131   unsigned NSAMaxSize;
132   bool GFX10_AEncoding;
133   bool GFX10_BEncoding;
134   bool HasDLInsts;
135   bool HasDot1Insts;
136   bool HasDot2Insts;
137   bool HasDot3Insts;
138   bool HasDot4Insts;
139   bool HasDot5Insts;
140   bool HasDot6Insts;
141   bool HasDot7Insts;
142   bool HasMAIInsts;
143   bool HasPkFmacF16Inst;
144   bool HasAtomicFaddInsts;
145   bool SupportsSRAMECC;
146 
147   // This should not be used directly. 'TargetID' tracks the dynamic settings
148   // for SRAMECC.
149   bool EnableSRAMECC;
150 
151   bool HasNoSdstCMPX;
152   bool HasVscnt;
153   bool HasGetWaveIdInst;
154   bool HasSMemTimeInst;
155   bool HasShaderCyclesRegister;
156   bool HasRegisterBanking;
157   bool HasVOP3Literal;
158   bool HasNoDataDepHazard;
159   bool FlatAddressSpace;
160   bool FlatInstOffsets;
161   bool FlatGlobalInsts;
162   bool FlatScratchInsts;
163   bool ScalarFlatScratchInsts;
164   bool HasArchitectedFlatScratch;
165   bool AddNoCarryInsts;
166   bool HasUnpackedD16VMem;
167   bool LDSMisalignedBug;
168   bool HasMFMAInlineLiteralBug;
169   bool UnalignedBufferAccess;
170   bool UnalignedDSAccess;
171   bool HasPackedTID;
172   bool ScalarizeGlobal;
173 
174   bool HasVcmpxPermlaneHazard;
175   bool HasVMEMtoScalarWriteHazard;
176   bool HasSMEMtoVectorWriteHazard;
177   bool HasInstFwdPrefetchBug;
178   bool HasVcmpxExecWARHazard;
179   bool HasLdsBranchVmemWARHazard;
180   bool HasNSAtoVMEMBug;
181   bool HasNSAClauseBug;
182   bool HasOffset3fBug;
183   bool HasFlatSegmentOffsetBug;
184   bool HasImageStoreD16Bug;
185   bool HasImageGather4D16Bug;
186 
187   // Dummy feature to use for assembler in tablegen.
188   bool FeatureDisable;
189 
190   SelectionDAGTargetInfo TSInfo;
191 private:
192   SIInstrInfo InstrInfo;
193   SITargetLowering TLInfo;
194   SIFrameLowering FrameLowering;
195 
196 public:
197   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
198   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
199 
200   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
201                const GCNTargetMachine &TM);
202   ~GCNSubtarget() override;
203 
204   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
205                                                    StringRef GPU, StringRef FS);
206 
207   const SIInstrInfo *getInstrInfo() const override {
208     return &InstrInfo;
209   }
210 
211   const SIFrameLowering *getFrameLowering() const override {
212     return &FrameLowering;
213   }
214 
215   const SITargetLowering *getTargetLowering() const override {
216     return &TLInfo;
217   }
218 
219   const SIRegisterInfo *getRegisterInfo() const override {
220     return &InstrInfo.getRegisterInfo();
221   }
222 
223   const CallLowering *getCallLowering() const override {
224     return CallLoweringInfo.get();
225   }
226 
227   const InlineAsmLowering *getInlineAsmLowering() const override {
228     return InlineAsmLoweringInfo.get();
229   }
230 
231   InstructionSelector *getInstructionSelector() const override {
232     return InstSelector.get();
233   }
234 
235   const LegalizerInfo *getLegalizerInfo() const override {
236     return Legalizer.get();
237   }
238 
239   const RegisterBankInfo *getRegBankInfo() const override {
240     return RegBankInfo.get();
241   }
242 
243   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
244     return TargetID;
245   }
246 
247   // Nothing implemented, just prevent crashes on use.
248   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
249     return &TSInfo;
250   }
251 
252   const InstrItineraryData *getInstrItineraryData() const override {
253     return &InstrItins;
254   }
255 
256   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
257 
258   Generation getGeneration() const {
259     return (Generation)Gen;
260   }
261 
262   /// Return the number of high bits known to be zero for a frame index.
263   unsigned getKnownHighZeroBitsForFrameIndex() const {
264     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
265   }
266 
267   int getLDSBankCount() const {
268     return LDSBankCount;
269   }
270 
271   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
272     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
273   }
274 
275   unsigned getConstantBusLimit(unsigned Opcode) const;
276 
277   /// Returns if the result of this instruction with a 16-bit result returned in
278   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
279   /// the original value.
280   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
281 
282   bool hasIntClamp() const {
283     return HasIntClamp;
284   }
285 
286   bool hasFP64() const {
287     return FP64;
288   }
289 
290   bool hasMIMG_R128() const {
291     return MIMG_R128;
292   }
293 
294   bool hasHWFP64() const {
295     return FP64;
296   }
297 
298   bool hasFastFMAF32() const {
299     return FastFMAF32;
300   }
301 
302   bool hasHalfRate64Ops() const {
303     return HalfRate64Ops;
304   }
305 
306   bool hasFullRate64Ops() const {
307     return FullRate64Ops;
308   }
309 
310   bool hasAddr64() const {
311     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
312   }
313 
314   bool hasFlat() const {
315     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
316   }
317 
318   // Return true if the target only has the reverse operand versions of VALU
319   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
320   bool hasOnlyRevVALUShifts() const {
321     return getGeneration() >= VOLCANIC_ISLANDS;
322   }
323 
324   bool hasFractBug() const {
325     return getGeneration() == SOUTHERN_ISLANDS;
326   }
327 
328   bool hasBFE() const {
329     return true;
330   }
331 
332   bool hasBFI() const {
333     return true;
334   }
335 
336   bool hasBFM() const {
337     return hasBFE();
338   }
339 
340   bool hasBCNT(unsigned Size) const {
341     return true;
342   }
343 
344   bool hasFFBL() const {
345     return true;
346   }
347 
348   bool hasFFBH() const {
349     return true;
350   }
351 
352   bool hasMed3_16() const {
353     return getGeneration() >= AMDGPUSubtarget::GFX9;
354   }
355 
356   bool hasMin3Max3_16() const {
357     return getGeneration() >= AMDGPUSubtarget::GFX9;
358   }
359 
360   bool hasFmaMixInsts() const {
361     return HasFmaMixInsts;
362   }
363 
364   bool hasCARRY() const {
365     return true;
366   }
367 
368   bool hasFMA() const {
369     return FMA;
370   }
371 
372   bool hasSwap() const {
373     return GFX9Insts;
374   }
375 
376   bool hasScalarPackInsts() const {
377     return GFX9Insts;
378   }
379 
380   bool hasScalarMulHiInsts() const {
381     return GFX9Insts;
382   }
383 
384   TrapHandlerAbi getTrapHandlerAbi() const {
385     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
386   }
387 
388   bool supportsGetDoorbellID() const {
389     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
390     return getGeneration() >= GFX9;
391   }
392 
393   /// True if the offset field of DS instructions works as expected. On SI, the
394   /// offset uses a 16-bit adder and does not always wrap properly.
395   bool hasUsableDSOffset() const {
396     return getGeneration() >= SEA_ISLANDS;
397   }
398 
399   bool unsafeDSOffsetFoldingEnabled() const {
400     return EnableUnsafeDSOffsetFolding;
401   }
402 
403   /// Condition output from div_scale is usable.
404   bool hasUsableDivScaleConditionOutput() const {
405     return getGeneration() != SOUTHERN_ISLANDS;
406   }
407 
408   /// Extra wait hazard is needed in some cases before
409   /// s_cbranch_vccnz/s_cbranch_vccz.
410   bool hasReadVCCZBug() const {
411     return getGeneration() <= SEA_ISLANDS;
412   }
413 
414   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
415   bool partialVCCWritesUpdateVCCZ() const {
416     return getGeneration() >= GFX10;
417   }
418 
419   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
420   /// was written by a VALU instruction.
421   bool hasSMRDReadVALUDefHazard() const {
422     return getGeneration() == SOUTHERN_ISLANDS;
423   }
424 
425   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
426   /// SGPR was written by a VALU Instruction.
427   bool hasVMEMReadSGPRVALUDefHazard() const {
428     return getGeneration() >= VOLCANIC_ISLANDS;
429   }
430 
431   bool hasRFEHazards() const {
432     return getGeneration() >= VOLCANIC_ISLANDS;
433   }
434 
435   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
436   unsigned getSetRegWaitStates() const {
437     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
438   }
439 
440   bool dumpCode() const {
441     return DumpCode;
442   }
443 
444   /// Return the amount of LDS that can be used that will not restrict the
445   /// occupancy lower than WaveCount.
446   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
447                                            const Function &) const;
448 
449   bool supportsMinMaxDenormModes() const {
450     return getGeneration() >= AMDGPUSubtarget::GFX9;
451   }
452 
453   /// \returns If target supports S_DENORM_MODE.
454   bool hasDenormModeInst() const {
455     return getGeneration() >= AMDGPUSubtarget::GFX10;
456   }
457 
458   bool useFlatForGlobal() const {
459     return FlatForGlobal;
460   }
461 
462   /// \returns If target supports ds_read/write_b128 and user enables generation
463   /// of ds_read/write_b128.
464   bool useDS128() const {
465     return CIInsts && EnableDS128;
466   }
467 
468   /// \return If target supports ds_read/write_b96/128.
469   bool hasDS96AndDS128() const {
470     return CIInsts;
471   }
472 
473   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
474   bool haveRoundOpsF64() const {
475     return CIInsts;
476   }
477 
478   /// \returns If MUBUF instructions always perform range checking, even for
479   /// buffer resources used for private memory access.
480   bool privateMemoryResourceIsRangeChecked() const {
481     return getGeneration() < AMDGPUSubtarget::GFX9;
482   }
483 
  /// \returns If target requires PRT Strict NULL support (zero result registers
  /// for sparse texture support).
486   bool usePRTStrictNull() const {
487     return EnablePRTStrictNull;
488   }
489 
490   bool hasAutoWaitcntBeforeBarrier() const {
491     return AutoWaitcntBeforeBarrier;
492   }
493 
494   bool hasUnalignedBufferAccess() const {
495     return UnalignedBufferAccess;
496   }
497 
498   bool hasUnalignedBufferAccessEnabled() const {
499     return UnalignedBufferAccess && UnalignedAccessMode;
500   }
501 
502   bool hasUnalignedDSAccess() const {
503     return UnalignedDSAccess;
504   }
505 
506   bool hasUnalignedDSAccessEnabled() const {
507     return UnalignedDSAccess && UnalignedAccessMode;
508   }
509 
510   bool hasUnalignedScratchAccess() const {
511     return UnalignedScratchAccess;
512   }
513 
514   bool hasUnalignedAccessMode() const {
515     return UnalignedAccessMode;
516   }
517 
518   bool hasApertureRegs() const {
519     return HasApertureRegs;
520   }
521 
522   bool isTrapHandlerEnabled() const {
523     return TrapHandler;
524   }
525 
526   bool isXNACKEnabled() const {
527     return TargetID.isXnackOnOrAny();
528   }
529 
530   bool isTgSplitEnabled() const {
531     return EnableTgSplit;
532   }
533 
534   bool isCuModeEnabled() const {
535     return EnableCuMode;
536   }
537 
538   bool hasFlatAddressSpace() const {
539     return FlatAddressSpace;
540   }
541 
542   bool hasFlatScrRegister() const {
543     return hasFlatAddressSpace();
544   }
545 
546   bool hasFlatInstOffsets() const {
547     return FlatInstOffsets;
548   }
549 
550   bool hasFlatGlobalInsts() const {
551     return FlatGlobalInsts;
552   }
553 
554   bool hasFlatScratchInsts() const {
555     return FlatScratchInsts;
556   }
557 
558   // Check if target supports ST addressing mode with FLAT scratch instructions.
559   // The ST addressing mode means no registers are used, either VGPR or SGPR,
560   // but only immediate offset is swizzled and added to the FLAT scratch base.
561   bool hasFlatScratchSTMode() const {
562     return hasFlatScratchInsts() && hasGFX10_3Insts();
563   }
564 
565   bool hasScalarFlatScratchInsts() const {
566     return ScalarFlatScratchInsts;
567   }
568 
569   bool hasGlobalAddTidInsts() const {
570     return GFX10_BEncoding;
571   }
572 
573   bool hasAtomicCSub() const {
574     return GFX10_BEncoding;
575   }
576 
577   bool hasMultiDwordFlatScratchAddressing() const {
578     return getGeneration() >= GFX9;
579   }
580 
581   bool hasFlatSegmentOffsetBug() const {
582     return HasFlatSegmentOffsetBug;
583   }
584 
585   bool hasFlatLgkmVMemCountInOrder() const {
586     return getGeneration() > GFX9;
587   }
588 
589   bool hasD16LoadStore() const {
590     return getGeneration() >= GFX9;
591   }
592 
593   bool d16PreservesUnusedBits() const {
594     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
595   }
596 
597   bool hasD16Images() const {
598     return getGeneration() >= VOLCANIC_ISLANDS;
599   }
600 
601   /// Return if most LDS instructions have an m0 use that require m0 to be
602   /// initialized.
603   bool ldsRequiresM0Init() const {
604     return getGeneration() < GFX9;
605   }
606 
607   // True if the hardware rewinds and replays GWS operations if a wave is
608   // preempted.
609   //
610   // If this is false, a GWS operation requires testing if a nack set the
611   // MEM_VIOL bit, and repeating if so.
612   bool hasGWSAutoReplay() const {
613     return getGeneration() >= GFX9;
614   }
615 
616   /// \returns if target has ds_gws_sema_release_all instruction.
617   bool hasGWSSemaReleaseAll() const {
618     return CIInsts;
619   }
620 
621   /// \returns true if the target has integer add/sub instructions that do not
622   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
623   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
624   /// for saturation.
625   bool hasAddNoCarry() const {
626     return AddNoCarryInsts;
627   }
628 
629   bool hasUnpackedD16VMem() const {
630     return HasUnpackedD16VMem;
631   }
632 
633   // Covers VS/PS/CS graphics shaders
634   bool isMesaGfxShader(const Function &F) const {
635     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
636   }
637 
638   bool hasMad64_32() const {
639     return getGeneration() >= SEA_ISLANDS;
640   }
641 
642   bool hasSDWAOmod() const {
643     return HasSDWAOmod;
644   }
645 
646   bool hasSDWAScalar() const {
647     return HasSDWAScalar;
648   }
649 
650   bool hasSDWASdst() const {
651     return HasSDWASdst;
652   }
653 
654   bool hasSDWAMac() const {
655     return HasSDWAMac;
656   }
657 
658   bool hasSDWAOutModsVOPC() const {
659     return HasSDWAOutModsVOPC;
660   }
661 
662   bool hasDLInsts() const {
663     return HasDLInsts;
664   }
665 
666   bool hasDot1Insts() const {
667     return HasDot1Insts;
668   }
669 
670   bool hasDot2Insts() const {
671     return HasDot2Insts;
672   }
673 
674   bool hasDot3Insts() const {
675     return HasDot3Insts;
676   }
677 
678   bool hasDot4Insts() const {
679     return HasDot4Insts;
680   }
681 
682   bool hasDot5Insts() const {
683     return HasDot5Insts;
684   }
685 
686   bool hasDot6Insts() const {
687     return HasDot6Insts;
688   }
689 
690   bool hasDot7Insts() const {
691     return HasDot7Insts;
692   }
693 
694   bool hasMAIInsts() const {
695     return HasMAIInsts;
696   }
697 
698   bool hasPkFmacF16Inst() const {
699     return HasPkFmacF16Inst;
700   }
701 
702   bool hasAtomicFaddInsts() const {
703     return HasAtomicFaddInsts;
704   }
705 
706   bool hasNoSdstCMPX() const {
707     return HasNoSdstCMPX;
708   }
709 
710   bool hasVscnt() const {
711     return HasVscnt;
712   }
713 
714   bool hasGetWaveIdInst() const {
715     return HasGetWaveIdInst;
716   }
717 
718   bool hasSMemTimeInst() const {
719     return HasSMemTimeInst;
720   }
721 
722   bool hasShaderCyclesRegister() const {
723     return HasShaderCyclesRegister;
724   }
725 
726   bool hasRegisterBanking() const {
727     return HasRegisterBanking;
728   }
729 
730   bool hasVOP3Literal() const {
731     return HasVOP3Literal;
732   }
733 
734   bool hasNoDataDepHazard() const {
735     return HasNoDataDepHazard;
736   }
737 
738   bool vmemWriteNeedsExpWaitcnt() const {
739     return getGeneration() < SEA_ISLANDS;
740   }
741 
742   // Scratch is allocated in 256 dword per wave blocks for the entire
743   // wavefront. When viewed from the perspective of an arbitrary workitem, this
744   // is 4-byte aligned.
745   //
746   // Only 4-byte alignment is really needed to access anything. Transformations
747   // on the pointer value itself may rely on the alignment / known low bits of
748   // the pointer. Set this to something above the minimum to avoid needing
749   // dynamic realignment in common cases.
750   Align getStackAlignment() const { return Align(16); }
751 
752   bool enableMachineScheduler() const override {
753     return true;
754   }
755 
756   bool useAA() const override;
757 
758   bool enableSubRegLiveness() const override {
759     return true;
760   }
761 
762   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
763   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
764 
765   // static wrappers
766   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
767 
768   // XXX - Why is this here if it isn't in the default pass set?
769   bool enableEarlyIfConversion() const override {
770     return true;
771   }
772 
773   bool enableFlatScratch() const;
774 
775   void overrideSchedPolicy(MachineSchedPolicy &Policy,
776                            unsigned NumRegionInstrs) const override;
777 
778   unsigned getMaxNumUserSGPRs() const {
779     return 16;
780   }
781 
782   bool hasSMemRealTime() const {
783     return HasSMemRealTime;
784   }
785 
786   bool hasMovrel() const {
787     return HasMovrel;
788   }
789 
790   bool hasVGPRIndexMode() const {
791     return HasVGPRIndexMode;
792   }
793 
794   bool useVGPRIndexMode() const;
795 
796   bool hasScalarCompareEq64() const {
797     return getGeneration() >= VOLCANIC_ISLANDS;
798   }
799 
800   bool hasScalarStores() const {
801     return HasScalarStores;
802   }
803 
804   bool hasScalarAtomics() const {
805     return HasScalarAtomics;
806   }
807 
808   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
809 
810   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
811   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
812 
813   bool hasDPP() const {
814     return HasDPP;
815   }
816 
817   bool hasDPPBroadcasts() const {
818     return HasDPP && getGeneration() < GFX10;
819   }
820 
821   bool hasDPPWavefrontShifts() const {
822     return HasDPP && getGeneration() < GFX10;
823   }
824 
825   bool hasDPP8() const {
826     return HasDPP8;
827   }
828 
829   bool has64BitDPP() const {
830     return Has64BitDPP;
831   }
832 
833   bool hasPackedFP32Ops() const {
834     return HasPackedFP32Ops;
835   }
836 
837   bool hasFmaakFmamkF32Insts() const {
838     return getGeneration() >= GFX10;
839   }
840 
841   bool hasExtendedImageInsts() const {
842     return HasExtendedImageInsts;
843   }
844 
845   bool hasR128A16() const {
846     return HasR128A16;
847   }
848 
849   bool hasGFX10A16() const {
850     return HasGFX10A16;
851   }
852 
853   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
854 
855   bool hasG16() const { return HasG16; }
856 
857   bool hasOffset3fBug() const {
858     return HasOffset3fBug;
859   }
860 
861   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
862 
863   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
864 
865   bool hasNSAEncoding() const { return HasNSAEncoding; }
866 
867   unsigned getNSAMaxSize() const { return NSAMaxSize; }
868 
869   bool hasGFX10_AEncoding() const {
870     return GFX10_AEncoding;
871   }
872 
873   bool hasGFX10_BEncoding() const {
874     return GFX10_BEncoding;
875   }
876 
877   bool hasGFX10_3Insts() const {
878     return GFX10_3Insts;
879   }
880 
881   bool hasMadF16() const;
882 
883   bool enableSIScheduler() const {
884     return EnableSIScheduler;
885   }
886 
887   bool loadStoreOptEnabled() const {
888     return EnableLoadStoreOpt;
889   }
890 
891   bool hasSGPRInitBug() const {
892     return SGPRInitBug;
893   }
894 
895   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
896 
897   bool hasNegativeUnalignedScratchOffsetBug() const {
898     return NegativeUnalignedScratchOffsetBug;
899   }
900 
901   bool hasMFMAInlineLiteralBug() const {
902     return HasMFMAInlineLiteralBug;
903   }
904 
905   bool has12DWordStoreHazard() const {
906     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
907   }
908 
  /// \returns true if the subtarget supports DWORDX3 load/store instructions.
910   bool hasDwordx3LoadStores() const {
911     return CIInsts;
912   }
913 
914   bool hasReadM0MovRelInterpHazard() const {
915     return getGeneration() == AMDGPUSubtarget::GFX9;
916   }
917 
918   bool hasReadM0SendMsgHazard() const {
919     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
920            getGeneration() <= AMDGPUSubtarget::GFX9;
921   }
922 
923   bool hasVcmpxPermlaneHazard() const {
924     return HasVcmpxPermlaneHazard;
925   }
926 
927   bool hasVMEMtoScalarWriteHazard() const {
928     return HasVMEMtoScalarWriteHazard;
929   }
930 
931   bool hasSMEMtoVectorWriteHazard() const {
932     return HasSMEMtoVectorWriteHazard;
933   }
934 
935   bool hasLDSMisalignedBug() const {
936     return LDSMisalignedBug && !EnableCuMode;
937   }
938 
939   bool hasInstFwdPrefetchBug() const {
940     return HasInstFwdPrefetchBug;
941   }
942 
943   bool hasVcmpxExecWARHazard() const {
944     return HasVcmpxExecWARHazard;
945   }
946 
947   bool hasLdsBranchVmemWARHazard() const {
948     return HasLdsBranchVmemWARHazard;
949   }
950 
951   bool hasNSAtoVMEMBug() const {
952     return HasNSAtoVMEMBug;
953   }
954 
955   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
956 
957   bool hasHardClauses() const { return getGeneration() >= GFX10; }
958 
959   bool hasGFX90AInsts() const { return GFX90AInsts; }
960 
961   /// Return if operations acting on VGPR tuples require even alignment.
962   bool needsAlignedVGPRs() const { return GFX90AInsts; }
963 
964   bool hasPackedTID() const { return HasPackedTID; }
965 
966   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
967   /// SGPRs
968   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
969 
970   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
971   /// VGPRs
972   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
973 
974   /// Return occupancy for the given function. Used LDS and a number of
975   /// registers if provided.
976   /// Note, occupancy can be affected by the scratch allocation as well, but
977   /// we do not have enough information to compute it.
978   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
979                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
980 
981   /// \returns true if the flat_scratch register should be initialized with the
982   /// pointer to the wave's scratch memory rather than a size and offset.
983   bool flatScratchIsPointer() const {
984     return getGeneration() >= AMDGPUSubtarget::GFX9;
985   }
986 
987   /// \returns true if the flat_scratch register is initialized by the HW.
988   /// In this case it is readonly.
989   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
990 
991   /// \returns true if the machine has merged shaders in which s0-s7 are
992   /// reserved by the hardware and user SGPRs start at s8
993   bool hasMergedShaders() const {
994     return getGeneration() >= GFX9;
995   }
996 
997   /// \returns SGPR allocation granularity supported by the subtarget.
998   unsigned getSGPRAllocGranule() const {
999     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1000   }
1001 
1002   /// \returns SGPR encoding granularity supported by the subtarget.
1003   unsigned getSGPREncodingGranule() const {
1004     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1005   }
1006 
1007   /// \returns Total number of SGPRs supported by the subtarget.
1008   unsigned getTotalNumSGPRs() const {
1009     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1010   }
1011 
1012   /// \returns Addressable number of SGPRs supported by the subtarget.
1013   unsigned getAddressableNumSGPRs() const {
1014     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1015   }
1016 
1017   /// \returns Minimum number of SGPRs that meets the given number of waves per
1018   /// execution unit requirement supported by the subtarget.
1019   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1020     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1021   }
1022 
1023   /// \returns Maximum number of SGPRs that meets the given number of waves per
1024   /// execution unit requirement supported by the subtarget.
1025   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1026     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1027   }
1028 
1029   /// \returns Reserved number of SGPRs. This is common
1030   /// utility function called by MachineFunction and
1031   /// Function variants of getReservedNumSGPRs.
1032   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
1033   /// \returns Reserved number of SGPRs for given machine function \p MF.
1034   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1035 
1036   /// \returns Reserved number of SGPRs for given function \p F.
1037   unsigned getReservedNumSGPRs(const Function &F) const;
1038 
1039   /// \returns max num SGPRs. This is the common utility
1040   /// function called by MachineFunction and Function
1041   /// variants of getMaxNumSGPRs.
1042   unsigned getBaseMaxNumSGPRs(const Function &F,
1043                               std::pair<unsigned, unsigned> WavesPerEU,
1044                               unsigned PreloadedSGPRs,
1045                               unsigned ReservedNumSGPRs) const;
1046 
1047   /// \returns Maximum number of SGPRs that meets number of waves per execution
1048   /// unit requirement for function \p MF, or number of SGPRs explicitly
1049   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1050   ///
1051   /// \returns Value that meets number of waves per execution unit requirement
1052   /// if explicitly requested value cannot be converted to integer, violates
1053   /// subtarget's specifications, or does not meet number of waves per execution
1054   /// unit requirement.
1055   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1056 
1057   /// \returns Maximum number of SGPRs that meets number of waves per execution
1058   /// unit requirement for function \p F, or number of SGPRs explicitly
1059   /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1060   ///
1061   /// \returns Value that meets number of waves per execution unit requirement
1062   /// if explicitly requested value cannot be converted to integer, violates
1063   /// subtarget's specifications, or does not meet number of waves per execution
1064   /// unit requirement.
1065   unsigned getMaxNumSGPRs(const Function &F) const;
1066 
1067   /// \returns VGPR allocation granularity supported by the subtarget.
1068   unsigned getVGPRAllocGranule() const {
1069     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1070   }
1071 
1072   /// \returns VGPR encoding granularity supported by this subtarget.
1073   unsigned getVGPREncodingGranule() const {
1074     return AMDGPU::IsaInfo::getVGPREncodingGranule(this); // forwarded to IsaInfo
1075   }
1076 
1077   /// \returns Total number of VGPRs supported by this subtarget.
1078   unsigned getTotalNumVGPRs() const {
1079     return AMDGPU::IsaInfo::getTotalNumVGPRs(this); // forwarded to IsaInfo
1080   }
1081 
1082   /// \returns Number of addressable VGPRs supported by this subtarget.
1083   unsigned getAddressableNumVGPRs() const {
1084     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); // forwarded to IsaInfo
1085   }
1086 
1087   /// \returns Minimum number of VGPRs that meets the given \p WavesPerEU
1088   /// (waves per execution unit) requirement supported by this subtarget.
1089   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1090     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); // forwarded to IsaInfo
1091   }
1092 
1093   /// \returns Maximum number of VGPRs that meets the given \p WavesPerEU
1094   /// (waves per execution unit) requirement supported by this subtarget.
1095   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1096     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); // forwarded to IsaInfo
1097   }
1098 
1099   /// \returns Maximum number of VGPRs for function \p F. Shared helper called
1100   /// by the MachineFunction and Function overloads of getMaxNumVGPRs.
1101   unsigned getBaseMaxNumVGPRs(const Function &F,
1102                               std::pair<unsigned, unsigned> WavesPerEU) const; // pair: presumably {min, max} - TODO confirm
1103   /// \returns Maximum number of VGPRs that meets the waves-per-execution-unit
1104   /// requirement for function \p F, or the number of VGPRs explicitly
1105   /// requested via the "amdgpu-num-vgpr" attribute attached to \p F.
1106   ///
1107   /// Falls back to the value meeting the waves-per-execution-unit requirement
1108   /// if the explicitly requested value cannot be converted to an integer,
1109   /// violates the subtarget's specifications, or does not itself meet that
1110   /// requirement.
1111   unsigned getMaxNumVGPRs(const Function &F) const;
1112 
1113   /// \returns Maximum number of VGPRs that meets the waves-per-execution-unit
1114   /// requirement for machine function \p MF, or the number of VGPRs explicitly
1115   /// requested via the "amdgpu-num-vgpr" attribute attached to \p MF.
1116   ///
1117   /// Falls back to the value meeting the waves-per-execution-unit requirement
1118   /// if the explicitly requested value cannot be converted to an integer,
1119   /// violates the subtarget's specifications, or does not itself meet that
1120   /// requirement.
1121   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1122 
1123   void getPostRAMutations( // overrides a base-class virtual; body is out of line
1124       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) // out-param; presumably appended to - confirm
1125       const override;
1126 
1127   std::unique_ptr<ScheduleDAGMutation> // ownership of the result passes to the caller
1128   createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; // body is out of line
1129 
1130   bool isWave32() const { // true exactly when the wavefront size is 32
1131     return getWavefrontSize() == 32;
1132   }
1133 
1134   bool isWave64() const { // true exactly when the wavefront size is 64
1135     return getWavefrontSize() == 64;
1136   }
1137 
1138   const TargetRegisterClass *getBoolRC() const { // convenience forwarder
1139     return getRegisterInfo()->getBoolRC(); // answer comes from the register info
1140   }
1141 
1142   /// \returns Maximum number of work groups per compute unit supported by this
1143   /// subtarget, as limited by the given \p FlatWorkGroupSize.
1144   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1145     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1146   }
1147 
1148   /// \returns Minimum flat work group size supported by this subtarget.
1149   unsigned getMinFlatWorkGroupSize() const override {
1150     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); // forwarded to IsaInfo
1151   }
1152 
1153   /// \returns Maximum flat work group size supported by this subtarget.
1154   unsigned getMaxFlatWorkGroupSize() const override {
1155     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); // forwarded to IsaInfo
1156   }
1157 
1158   /// \returns Number of waves per execution unit required to support the
1159   /// given \p FlatWorkGroupSize; the value is obtained from AMDGPU::IsaInfo.
1160   unsigned
1161   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1162     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1163   }
1164 
1165   /// \returns Minimum number of waves per execution unit supported by this
1166   /// subtarget; the value is obtained from AMDGPU::IsaInfo.
1167   unsigned getMinWavesPerEU() const override {
1168     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1169   }
1170 
1171   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1172                              SDep &Dep) const override; // overrides a base-class virtual; body is out of line
1173 };
1174 
1175 } // end namespace llvm
1176 
1177 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1178