xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h (revision f126d349810fdb512c0b01e101342d430b947488)
1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 #define GET_SUBTARGETINFO_HEADER
25 #include "AMDGPUGenSubtargetInfo.inc"
26 
27 namespace llvm {
28 
29 class GCNTargetMachine;
30 
31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
32                            public AMDGPUSubtarget {
33 
34   using AMDGPUSubtarget::getMaxWavesPerEU;
35 
36 public:
  // The following two enums are documented at:
38   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
39   enum class TrapHandlerAbi {
40     NONE   = 0x00,
41     AMDHSA = 0x01,
42   };
43 
44   enum class TrapID {
45     LLVMAMDHSATrap      = 0x02,
46     LLVMAMDHSADebugTrap = 0x03,
47   };
48 
49 private:
50   /// GlobalISel related APIs.
51   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
52   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
53   std::unique_ptr<InstructionSelector> InstSelector;
54   std::unique_ptr<LegalizerInfo> Legalizer;
55   std::unique_ptr<RegisterBankInfo> RegBankInfo;
56 
57 protected:
58   // Basic subtarget description.
59   Triple TargetTriple;
60   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
61   unsigned Gen;
62   InstrItineraryData InstrItins;
63   int LDSBankCount;
64   unsigned MaxPrivateElementSize;
65 
66   // Possibly statically set by tablegen, but may want to be overridden.
67   bool FastFMAF32;
68   bool FastDenormalF32;
69   bool HalfRate64Ops;
70   bool FullRate64Ops;
71 
72   // Dynamically set bits that enable features.
73   bool FlatForGlobal;
74   bool AutoWaitcntBeforeBarrier;
75   bool UnalignedScratchAccess;
76   bool UnalignedAccessMode;
77   bool HasApertureRegs;
78   bool SupportsXNACK;
79 
80   // This should not be used directly. 'TargetID' tracks the dynamic settings
81   // for XNACK.
82   bool EnableXNACK;
83 
84   bool EnableTgSplit;
85   bool EnableCuMode;
86   bool TrapHandler;
87 
88   // Used as options.
89   bool EnableLoadStoreOpt;
90   bool EnableUnsafeDSOffsetFolding;
91   bool EnableSIScheduler;
92   bool EnableDS128;
93   bool EnablePRTStrictNull;
94   bool DumpCode;
95 
  // Subtarget properties statically set by tablegen.
97   bool FP64;
98   bool FMA;
99   bool MIMG_R128;
100   bool CIInsts;
101   bool GFX8Insts;
102   bool GFX9Insts;
103   bool GFX90AInsts;
104   bool GFX10Insts;
105   bool GFX10_3Insts;
106   bool GFX7GFX8GFX9Insts;
107   bool SGPRInitBug;
108   bool NegativeScratchOffsetBug;
109   bool NegativeUnalignedScratchOffsetBug;
110   bool HasSMemRealTime;
111   bool HasIntClamp;
112   bool HasFmaMixInsts;
113   bool HasMovrel;
114   bool HasVGPRIndexMode;
115   bool HasScalarStores;
116   bool HasScalarAtomics;
117   bool HasSDWAOmod;
118   bool HasSDWAScalar;
119   bool HasSDWASdst;
120   bool HasSDWAMac;
121   bool HasSDWAOutModsVOPC;
122   bool HasDPP;
123   bool HasDPP8;
124   bool Has64BitDPP;
125   bool HasPackedFP32Ops;
126   bool HasExtendedImageInsts;
127   bool HasR128A16;
128   bool HasGFX10A16;
129   bool HasG16;
130   bool HasNSAEncoding;
131   unsigned NSAMaxSize;
132   bool GFX10_AEncoding;
133   bool GFX10_BEncoding;
134   bool HasDLInsts;
135   bool HasDot1Insts;
136   bool HasDot2Insts;
137   bool HasDot3Insts;
138   bool HasDot4Insts;
139   bool HasDot5Insts;
140   bool HasDot6Insts;
141   bool HasDot7Insts;
142   bool HasMAIInsts;
143   bool HasPkFmacF16Inst;
144   bool HasAtomicFaddInsts;
145   bool SupportsSRAMECC;
146 
147   // This should not be used directly. 'TargetID' tracks the dynamic settings
148   // for SRAMECC.
149   bool EnableSRAMECC;
150 
151   bool HasNoSdstCMPX;
152   bool HasVscnt;
153   bool HasGetWaveIdInst;
154   bool HasSMemTimeInst;
155   bool HasShaderCyclesRegister;
156   bool HasVOP3Literal;
157   bool HasNoDataDepHazard;
158   bool FlatAddressSpace;
159   bool FlatInstOffsets;
160   bool FlatGlobalInsts;
161   bool FlatScratchInsts;
162   bool ScalarFlatScratchInsts;
163   bool HasArchitectedFlatScratch;
164   bool AddNoCarryInsts;
165   bool HasUnpackedD16VMem;
166   bool LDSMisalignedBug;
167   bool HasMFMAInlineLiteralBug;
168   bool UnalignedBufferAccess;
169   bool UnalignedDSAccess;
170   bool HasPackedTID;
171   bool ScalarizeGlobal;
172 
173   bool HasVcmpxPermlaneHazard;
174   bool HasVMEMtoScalarWriteHazard;
175   bool HasSMEMtoVectorWriteHazard;
176   bool HasInstFwdPrefetchBug;
177   bool HasVcmpxExecWARHazard;
178   bool HasLdsBranchVmemWARHazard;
179   bool HasNSAtoVMEMBug;
180   bool HasNSAClauseBug;
181   bool HasOffset3fBug;
182   bool HasFlatSegmentOffsetBug;
183   bool HasImageStoreD16Bug;
184   bool HasImageGather4D16Bug;
185 
186   // Dummy feature to use for assembler in tablegen.
187   bool FeatureDisable;
188 
189   SelectionDAGTargetInfo TSInfo;
190 private:
191   SIInstrInfo InstrInfo;
192   SITargetLowering TLInfo;
193   SIFrameLowering FrameLowering;
194 
195 public:
196   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
197   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
198 
199   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
200                const GCNTargetMachine &TM);
201   ~GCNSubtarget() override;
202 
203   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
204                                                    StringRef GPU, StringRef FS);
205 
206   const SIInstrInfo *getInstrInfo() const override {
207     return &InstrInfo;
208   }
209 
210   const SIFrameLowering *getFrameLowering() const override {
211     return &FrameLowering;
212   }
213 
214   const SITargetLowering *getTargetLowering() const override {
215     return &TLInfo;
216   }
217 
218   const SIRegisterInfo *getRegisterInfo() const override {
219     return &InstrInfo.getRegisterInfo();
220   }
221 
222   const CallLowering *getCallLowering() const override {
223     return CallLoweringInfo.get();
224   }
225 
226   const InlineAsmLowering *getInlineAsmLowering() const override {
227     return InlineAsmLoweringInfo.get();
228   }
229 
230   InstructionSelector *getInstructionSelector() const override {
231     return InstSelector.get();
232   }
233 
234   const LegalizerInfo *getLegalizerInfo() const override {
235     return Legalizer.get();
236   }
237 
238   const RegisterBankInfo *getRegBankInfo() const override {
239     return RegBankInfo.get();
240   }
241 
242   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
243     return TargetID;
244   }
245 
246   // Nothing implemented, just prevent crashes on use.
247   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
248     return &TSInfo;
249   }
250 
251   const InstrItineraryData *getInstrItineraryData() const override {
252     return &InstrItins;
253   }
254 
255   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
256 
257   Generation getGeneration() const {
258     return (Generation)Gen;
259   }
260 
261   /// Return the number of high bits known to be zero for a frame index.
262   unsigned getKnownHighZeroBitsForFrameIndex() const {
263     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
264   }
265 
266   int getLDSBankCount() const {
267     return LDSBankCount;
268   }
269 
270   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
271     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
272   }
273 
274   unsigned getConstantBusLimit(unsigned Opcode) const;
275 
  /// \returns true if an instruction that produces a 16-bit result in a
  /// 32-bit register implicitly zeroes the high 16 bits of that register,
  /// rather than preserving their original value.
279   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
280 
281   bool hasIntClamp() const {
282     return HasIntClamp;
283   }
284 
285   bool hasFP64() const {
286     return FP64;
287   }
288 
289   bool hasMIMG_R128() const {
290     return MIMG_R128;
291   }
292 
293   bool hasHWFP64() const {
294     return FP64;
295   }
296 
297   bool hasFastFMAF32() const {
298     return FastFMAF32;
299   }
300 
301   bool hasHalfRate64Ops() const {
302     return HalfRate64Ops;
303   }
304 
305   bool hasFullRate64Ops() const {
306     return FullRate64Ops;
307   }
308 
309   bool hasAddr64() const {
310     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
311   }
312 
313   bool hasFlat() const {
314     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
315   }
316 
317   // Return true if the target only has the reverse operand versions of VALU
318   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
319   bool hasOnlyRevVALUShifts() const {
320     return getGeneration() >= VOLCANIC_ISLANDS;
321   }
322 
323   bool hasFractBug() const {
324     return getGeneration() == SOUTHERN_ISLANDS;
325   }
326 
327   bool hasBFE() const {
328     return true;
329   }
330 
331   bool hasBFI() const {
332     return true;
333   }
334 
335   bool hasBFM() const {
336     return hasBFE();
337   }
338 
339   bool hasBCNT(unsigned Size) const {
340     return true;
341   }
342 
343   bool hasFFBL() const {
344     return true;
345   }
346 
347   bool hasFFBH() const {
348     return true;
349   }
350 
351   bool hasMed3_16() const {
352     return getGeneration() >= AMDGPUSubtarget::GFX9;
353   }
354 
355   bool hasMin3Max3_16() const {
356     return getGeneration() >= AMDGPUSubtarget::GFX9;
357   }
358 
359   bool hasFmaMixInsts() const {
360     return HasFmaMixInsts;
361   }
362 
363   bool hasCARRY() const {
364     return true;
365   }
366 
367   bool hasFMA() const {
368     return FMA;
369   }
370 
371   bool hasSwap() const {
372     return GFX9Insts;
373   }
374 
375   bool hasScalarPackInsts() const {
376     return GFX9Insts;
377   }
378 
379   bool hasScalarMulHiInsts() const {
380     return GFX9Insts;
381   }
382 
383   TrapHandlerAbi getTrapHandlerAbi() const {
384     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
385   }
386 
387   bool supportsGetDoorbellID() const {
388     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
389     return getGeneration() >= GFX9;
390   }
391 
392   /// True if the offset field of DS instructions works as expected. On SI, the
393   /// offset uses a 16-bit adder and does not always wrap properly.
394   bool hasUsableDSOffset() const {
395     return getGeneration() >= SEA_ISLANDS;
396   }
397 
398   bool unsafeDSOffsetFoldingEnabled() const {
399     return EnableUnsafeDSOffsetFolding;
400   }
401 
402   /// Condition output from div_scale is usable.
403   bool hasUsableDivScaleConditionOutput() const {
404     return getGeneration() != SOUTHERN_ISLANDS;
405   }
406 
407   /// Extra wait hazard is needed in some cases before
408   /// s_cbranch_vccnz/s_cbranch_vccz.
409   bool hasReadVCCZBug() const {
410     return getGeneration() <= SEA_ISLANDS;
411   }
412 
413   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
414   bool partialVCCWritesUpdateVCCZ() const {
415     return getGeneration() >= GFX10;
416   }
417 
418   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
419   /// was written by a VALU instruction.
420   bool hasSMRDReadVALUDefHazard() const {
421     return getGeneration() == SOUTHERN_ISLANDS;
422   }
423 
424   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
425   /// SGPR was written by a VALU Instruction.
426   bool hasVMEMReadSGPRVALUDefHazard() const {
427     return getGeneration() >= VOLCANIC_ISLANDS;
428   }
429 
430   bool hasRFEHazards() const {
431     return getGeneration() >= VOLCANIC_ISLANDS;
432   }
433 
434   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
435   unsigned getSetRegWaitStates() const {
436     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
437   }
438 
439   bool dumpCode() const {
440     return DumpCode;
441   }
442 
443   /// Return the amount of LDS that can be used that will not restrict the
444   /// occupancy lower than WaveCount.
445   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
446                                            const Function &) const;
447 
448   bool supportsMinMaxDenormModes() const {
449     return getGeneration() >= AMDGPUSubtarget::GFX9;
450   }
451 
452   /// \returns If target supports S_DENORM_MODE.
453   bool hasDenormModeInst() const {
454     return getGeneration() >= AMDGPUSubtarget::GFX10;
455   }
456 
457   bool useFlatForGlobal() const {
458     return FlatForGlobal;
459   }
460 
461   /// \returns If target supports ds_read/write_b128 and user enables generation
462   /// of ds_read/write_b128.
463   bool useDS128() const {
464     return CIInsts && EnableDS128;
465   }
466 
467   /// \return If target supports ds_read/write_b96/128.
468   bool hasDS96AndDS128() const {
469     return CIInsts;
470   }
471 
472   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
473   bool haveRoundOpsF64() const {
474     return CIInsts;
475   }
476 
477   /// \returns If MUBUF instructions always perform range checking, even for
478   /// buffer resources used for private memory access.
479   bool privateMemoryResourceIsRangeChecked() const {
480     return getGeneration() < AMDGPUSubtarget::GFX9;
481   }
482 
483   /// \returns If target requires PRT Struct NULL support (zero result registers
484   /// for sparse texture support).
485   bool usePRTStrictNull() const {
486     return EnablePRTStrictNull;
487   }
488 
489   bool hasAutoWaitcntBeforeBarrier() const {
490     return AutoWaitcntBeforeBarrier;
491   }
492 
493   bool hasUnalignedBufferAccess() const {
494     return UnalignedBufferAccess;
495   }
496 
497   bool hasUnalignedBufferAccessEnabled() const {
498     return UnalignedBufferAccess && UnalignedAccessMode;
499   }
500 
501   bool hasUnalignedDSAccess() const {
502     return UnalignedDSAccess;
503   }
504 
505   bool hasUnalignedDSAccessEnabled() const {
506     return UnalignedDSAccess && UnalignedAccessMode;
507   }
508 
509   bool hasUnalignedScratchAccess() const {
510     return UnalignedScratchAccess;
511   }
512 
513   bool hasUnalignedAccessMode() const {
514     return UnalignedAccessMode;
515   }
516 
517   bool hasApertureRegs() const {
518     return HasApertureRegs;
519   }
520 
521   bool isTrapHandlerEnabled() const {
522     return TrapHandler;
523   }
524 
525   bool isXNACKEnabled() const {
526     return TargetID.isXnackOnOrAny();
527   }
528 
529   bool isTgSplitEnabled() const {
530     return EnableTgSplit;
531   }
532 
533   bool isCuModeEnabled() const {
534     return EnableCuMode;
535   }
536 
537   bool hasFlatAddressSpace() const {
538     return FlatAddressSpace;
539   }
540 
541   bool hasFlatScrRegister() const {
542     return hasFlatAddressSpace();
543   }
544 
545   bool hasFlatInstOffsets() const {
546     return FlatInstOffsets;
547   }
548 
549   bool hasFlatGlobalInsts() const {
550     return FlatGlobalInsts;
551   }
552 
553   bool hasFlatScratchInsts() const {
554     return FlatScratchInsts;
555   }
556 
557   // Check if target supports ST addressing mode with FLAT scratch instructions.
558   // The ST addressing mode means no registers are used, either VGPR or SGPR,
559   // but only immediate offset is swizzled and added to the FLAT scratch base.
560   bool hasFlatScratchSTMode() const {
561     return hasFlatScratchInsts() && hasGFX10_3Insts();
562   }
563 
564   bool hasScalarFlatScratchInsts() const {
565     return ScalarFlatScratchInsts;
566   }
567 
568   bool hasGlobalAddTidInsts() const {
569     return GFX10_BEncoding;
570   }
571 
572   bool hasAtomicCSub() const {
573     return GFX10_BEncoding;
574   }
575 
576   bool hasMultiDwordFlatScratchAddressing() const {
577     return getGeneration() >= GFX9;
578   }
579 
580   bool hasFlatSegmentOffsetBug() const {
581     return HasFlatSegmentOffsetBug;
582   }
583 
584   bool hasFlatLgkmVMemCountInOrder() const {
585     return getGeneration() > GFX9;
586   }
587 
588   bool hasD16LoadStore() const {
589     return getGeneration() >= GFX9;
590   }
591 
592   bool d16PreservesUnusedBits() const {
593     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
594   }
595 
596   bool hasD16Images() const {
597     return getGeneration() >= VOLCANIC_ISLANDS;
598   }
599 
600   /// Return if most LDS instructions have an m0 use that require m0 to be
601   /// initialized.
602   bool ldsRequiresM0Init() const {
603     return getGeneration() < GFX9;
604   }
605 
606   // True if the hardware rewinds and replays GWS operations if a wave is
607   // preempted.
608   //
609   // If this is false, a GWS operation requires testing if a nack set the
610   // MEM_VIOL bit, and repeating if so.
611   bool hasGWSAutoReplay() const {
612     return getGeneration() >= GFX9;
613   }
614 
615   /// \returns if target has ds_gws_sema_release_all instruction.
616   bool hasGWSSemaReleaseAll() const {
617     return CIInsts;
618   }
619 
620   /// \returns true if the target has integer add/sub instructions that do not
621   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
622   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
623   /// for saturation.
624   bool hasAddNoCarry() const {
625     return AddNoCarryInsts;
626   }
627 
628   bool hasUnpackedD16VMem() const {
629     return HasUnpackedD16VMem;
630   }
631 
632   // Covers VS/PS/CS graphics shaders
633   bool isMesaGfxShader(const Function &F) const {
634     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
635   }
636 
637   bool hasMad64_32() const {
638     return getGeneration() >= SEA_ISLANDS;
639   }
640 
641   bool hasSDWAOmod() const {
642     return HasSDWAOmod;
643   }
644 
645   bool hasSDWAScalar() const {
646     return HasSDWAScalar;
647   }
648 
649   bool hasSDWASdst() const {
650     return HasSDWASdst;
651   }
652 
653   bool hasSDWAMac() const {
654     return HasSDWAMac;
655   }
656 
657   bool hasSDWAOutModsVOPC() const {
658     return HasSDWAOutModsVOPC;
659   }
660 
661   bool hasDLInsts() const {
662     return HasDLInsts;
663   }
664 
665   bool hasDot1Insts() const {
666     return HasDot1Insts;
667   }
668 
669   bool hasDot2Insts() const {
670     return HasDot2Insts;
671   }
672 
673   bool hasDot3Insts() const {
674     return HasDot3Insts;
675   }
676 
677   bool hasDot4Insts() const {
678     return HasDot4Insts;
679   }
680 
681   bool hasDot5Insts() const {
682     return HasDot5Insts;
683   }
684 
685   bool hasDot6Insts() const {
686     return HasDot6Insts;
687   }
688 
689   bool hasDot7Insts() const {
690     return HasDot7Insts;
691   }
692 
693   bool hasMAIInsts() const {
694     return HasMAIInsts;
695   }
696 
697   bool hasPkFmacF16Inst() const {
698     return HasPkFmacF16Inst;
699   }
700 
701   bool hasAtomicFaddInsts() const {
702     return HasAtomicFaddInsts;
703   }
704 
705   bool hasNoSdstCMPX() const {
706     return HasNoSdstCMPX;
707   }
708 
709   bool hasVscnt() const {
710     return HasVscnt;
711   }
712 
713   bool hasGetWaveIdInst() const {
714     return HasGetWaveIdInst;
715   }
716 
717   bool hasSMemTimeInst() const {
718     return HasSMemTimeInst;
719   }
720 
721   bool hasShaderCyclesRegister() const {
722     return HasShaderCyclesRegister;
723   }
724 
725   bool hasVOP3Literal() const {
726     return HasVOP3Literal;
727   }
728 
729   bool hasNoDataDepHazard() const {
730     return HasNoDataDepHazard;
731   }
732 
733   bool vmemWriteNeedsExpWaitcnt() const {
734     return getGeneration() < SEA_ISLANDS;
735   }
736 
737   // Scratch is allocated in 256 dword per wave blocks for the entire
738   // wavefront. When viewed from the perspective of an arbitrary workitem, this
739   // is 4-byte aligned.
740   //
741   // Only 4-byte alignment is really needed to access anything. Transformations
742   // on the pointer value itself may rely on the alignment / known low bits of
743   // the pointer. Set this to something above the minimum to avoid needing
744   // dynamic realignment in common cases.
745   Align getStackAlignment() const { return Align(16); }
746 
747   bool enableMachineScheduler() const override {
748     return true;
749   }
750 
751   bool useAA() const override;
752 
753   bool enableSubRegLiveness() const override {
754     return true;
755   }
756 
757   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
758   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
759 
760   // static wrappers
761   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
762 
763   // XXX - Why is this here if it isn't in the default pass set?
764   bool enableEarlyIfConversion() const override {
765     return true;
766   }
767 
768   bool enableFlatScratch() const;
769 
770   void overrideSchedPolicy(MachineSchedPolicy &Policy,
771                            unsigned NumRegionInstrs) const override;
772 
773   unsigned getMaxNumUserSGPRs() const {
774     return 16;
775   }
776 
777   bool hasSMemRealTime() const {
778     return HasSMemRealTime;
779   }
780 
781   bool hasMovrel() const {
782     return HasMovrel;
783   }
784 
785   bool hasVGPRIndexMode() const {
786     return HasVGPRIndexMode;
787   }
788 
789   bool useVGPRIndexMode() const;
790 
791   bool hasScalarCompareEq64() const {
792     return getGeneration() >= VOLCANIC_ISLANDS;
793   }
794 
795   bool hasScalarStores() const {
796     return HasScalarStores;
797   }
798 
799   bool hasScalarAtomics() const {
800     return HasScalarAtomics;
801   }
802 
803   bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
804 
805   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
806   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
807 
808   bool hasDPP() const {
809     return HasDPP;
810   }
811 
812   bool hasDPPBroadcasts() const {
813     return HasDPP && getGeneration() < GFX10;
814   }
815 
816   bool hasDPPWavefrontShifts() const {
817     return HasDPP && getGeneration() < GFX10;
818   }
819 
820   bool hasDPP8() const {
821     return HasDPP8;
822   }
823 
824   bool has64BitDPP() const {
825     return Has64BitDPP;
826   }
827 
828   bool hasPackedFP32Ops() const {
829     return HasPackedFP32Ops;
830   }
831 
832   bool hasFmaakFmamkF32Insts() const {
833     return getGeneration() >= GFX10;
834   }
835 
836   bool hasExtendedImageInsts() const {
837     return HasExtendedImageInsts;
838   }
839 
840   bool hasR128A16() const {
841     return HasR128A16;
842   }
843 
844   bool hasGFX10A16() const {
845     return HasGFX10A16;
846   }
847 
848   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
849 
850   bool hasG16() const { return HasG16; }
851 
852   bool hasOffset3fBug() const {
853     return HasOffset3fBug;
854   }
855 
856   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
857 
858   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
859 
860   bool hasNSAEncoding() const { return HasNSAEncoding; }
861 
862   unsigned getNSAMaxSize() const { return NSAMaxSize; }
863 
864   bool hasGFX10_AEncoding() const {
865     return GFX10_AEncoding;
866   }
867 
868   bool hasGFX10_BEncoding() const {
869     return GFX10_BEncoding;
870   }
871 
872   bool hasGFX10_3Insts() const {
873     return GFX10_3Insts;
874   }
875 
876   bool hasMadF16() const;
877 
878   bool enableSIScheduler() const {
879     return EnableSIScheduler;
880   }
881 
882   bool loadStoreOptEnabled() const {
883     return EnableLoadStoreOpt;
884   }
885 
886   bool hasSGPRInitBug() const {
887     return SGPRInitBug;
888   }
889 
890   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
891 
892   bool hasNegativeUnalignedScratchOffsetBug() const {
893     return NegativeUnalignedScratchOffsetBug;
894   }
895 
896   bool hasMFMAInlineLiteralBug() const {
897     return HasMFMAInlineLiteralBug;
898   }
899 
900   bool has12DWordStoreHazard() const {
901     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
902   }
903 
904   // \returns true if the subtarget supports DWORDX3 load/store instructions.
905   bool hasDwordx3LoadStores() const {
906     return CIInsts;
907   }
908 
909   bool hasReadM0MovRelInterpHazard() const {
910     return getGeneration() == AMDGPUSubtarget::GFX9;
911   }
912 
913   bool hasReadM0SendMsgHazard() const {
914     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
915            getGeneration() <= AMDGPUSubtarget::GFX9;
916   }
917 
918   bool hasVcmpxPermlaneHazard() const {
919     return HasVcmpxPermlaneHazard;
920   }
921 
922   bool hasVMEMtoScalarWriteHazard() const {
923     return HasVMEMtoScalarWriteHazard;
924   }
925 
926   bool hasSMEMtoVectorWriteHazard() const {
927     return HasSMEMtoVectorWriteHazard;
928   }
929 
930   bool hasLDSMisalignedBug() const {
931     return LDSMisalignedBug && !EnableCuMode;
932   }
933 
934   bool hasInstFwdPrefetchBug() const {
935     return HasInstFwdPrefetchBug;
936   }
937 
938   bool hasVcmpxExecWARHazard() const {
939     return HasVcmpxExecWARHazard;
940   }
941 
942   bool hasLdsBranchVmemWARHazard() const {
943     return HasLdsBranchVmemWARHazard;
944   }
945 
946   bool hasNSAtoVMEMBug() const {
947     return HasNSAtoVMEMBug;
948   }
949 
950   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
951 
952   bool hasHardClauses() const { return getGeneration() >= GFX10; }
953 
954   bool hasGFX90AInsts() const { return GFX90AInsts; }
955 
956   /// Return if operations acting on VGPR tuples require even alignment.
957   bool needsAlignedVGPRs() const { return GFX90AInsts; }
958 
959   bool hasPackedTID() const { return HasPackedTID; }
960 
961   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
962   /// SGPRs
963   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
964 
965   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
966   /// VGPRs
967   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
968 
  /// Return occupancy for the given function. The LDS size and the number of
  /// registers are taken into account if provided.
971   /// Note, occupancy can be affected by the scratch allocation as well, but
972   /// we do not have enough information to compute it.
973   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
974                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
975 
976   /// \returns true if the flat_scratch register should be initialized with the
977   /// pointer to the wave's scratch memory rather than a size and offset.
978   bool flatScratchIsPointer() const {
979     return getGeneration() >= AMDGPUSubtarget::GFX9;
980   }
981 
982   /// \returns true if the flat_scratch register is initialized by the HW.
983   /// In this case it is readonly.
984   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
985 
986   /// \returns true if the machine has merged shaders in which s0-s7 are
987   /// reserved by the hardware and user SGPRs start at s8
988   bool hasMergedShaders() const {
989     return getGeneration() >= GFX9;
990   }
991 
992   /// \returns SGPR allocation granularity supported by the subtarget.
993   unsigned getSGPRAllocGranule() const {
994     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
995   }
996 
997   /// \returns SGPR encoding granularity supported by the subtarget.
998   unsigned getSGPREncodingGranule() const {
999     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1000   }
1001 
1002   /// \returns Total number of SGPRs supported by the subtarget.
1003   unsigned getTotalNumSGPRs() const {
1004     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1005   }
1006 
1007   /// \returns Addressable number of SGPRs supported by the subtarget.
1008   unsigned getAddressableNumSGPRs() const {
1009     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1010   }
1011 
1012   /// \returns Minimum number of SGPRs that meets the given number of waves per
1013   /// execution unit requirement supported by the subtarget.
1014   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1015     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1016   }
1017 
1018   /// \returns Maximum number of SGPRs that meets the given number of waves per
1019   /// execution unit requirement supported by the subtarget.
1020   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1021     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1022   }
1023 
1024   /// \returns Reserved number of SGPRs. This is common
1025   /// utility function called by MachineFunction and
1026   /// Function variants of getReservedNumSGPRs.
1027   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1028   /// \returns Reserved number of SGPRs for given machine function \p MF.
1029   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1030 
1031   /// \returns Reserved number of SGPRs for given function \p F.
1032   unsigned getReservedNumSGPRs(const Function &F) const;
1033 
1034   /// \returns max num SGPRs. This is the common utility
1035   /// function called by MachineFunction and Function
1036   /// variants of getMaxNumSGPRs.
1037   unsigned getBaseMaxNumSGPRs(const Function &F,
1038                               std::pair<unsigned, unsigned> WavesPerEU,
1039                               unsigned PreloadedSGPRs,
1040                               unsigned ReservedNumSGPRs) const;
1041 
1042   /// \returns Maximum number of SGPRs that meets number of waves per execution
1043   /// unit requirement for function \p MF, or number of SGPRs explicitly
1044   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1045   ///
1046   /// \returns Value that meets number of waves per execution unit requirement
1047   /// if explicitly requested value cannot be converted to integer, violates
1048   /// subtarget's specifications, or does not meet number of waves per execution
1049   /// unit requirement.
1050   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1051 
1052   /// \returns Maximum number of SGPRs that meets number of waves per execution
1053   /// unit requirement for function \p F, or number of SGPRs explicitly
1054   /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1055   ///
1056   /// \returns Value that meets number of waves per execution unit requirement
1057   /// if explicitly requested value cannot be converted to integer, violates
1058   /// subtarget's specifications, or does not meet number of waves per execution
1059   /// unit requirement.
1060   unsigned getMaxNumSGPRs(const Function &F) const;
1061 
1062   /// \returns VGPR allocation granularity supported by the subtarget.
1063   unsigned getVGPRAllocGranule() const {
1064     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1065   }
1066 
1067   /// \returns VGPR encoding granularity supported by the subtarget.
1068   unsigned getVGPREncodingGranule() const {
1069     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1070   }
1071 
1072   /// \returns Total number of VGPRs, as reported by AMDGPU::IsaInfo.
1073   unsigned getTotalNumVGPRs() const {
1074     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1075   }
1076 
1077   /// \returns Addressable number of VGPRs, as reported by AMDGPU::IsaInfo.
1078   unsigned getAddressableNumVGPRs() const {
1079     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1080   }
1081 
1082   /// \returns Minimum number of VGPRs that meets the \p WavesPerEU waves per
1083   /// execution unit requirement, as reported by AMDGPU::IsaInfo.
1084   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1085     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1086   }
1087 
1088   /// \returns Maximum number of VGPRs that meets the \p WavesPerEU waves per
1089   /// execution unit requirement, as reported by AMDGPU::IsaInfo.
1090   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1091     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1092   }
1093 
1094   /// \returns Maximum number of VGPRs for \p F. Shared helper behind both the
1095   /// MachineFunction and the Function overloads of getMaxNumVGPRs.
1096   unsigned getBaseMaxNumVGPRs(const Function &F,
1097                               std::pair<unsigned, unsigned> WavesPerEU) const;
1098   /// \returns Number of VGPRs explicitly requested through the
1099   /// "amdgpu-num-vgpr" attribute attached to function \p F, or otherwise the
1100   /// maximum number of VGPRs that meets the number of waves per execution unit
1101   /// requirement for \p F.
1102   ///
1103   /// The waves-per-execution-unit based maximum is also returned when the
1104   /// explicitly requested value cannot be converted to integer, violates
1105   /// subtarget's specifications, or does not meet that requirement.
1106   unsigned getMaxNumVGPRs(const Function &F) const;
1107 
1108   /// \returns Number of VGPRs explicitly requested through the
1109   /// "amdgpu-num-vgpr" attribute attached to function \p MF, or otherwise the
1110   /// maximum number of VGPRs that meets the number of waves per execution unit
1111   /// requirement for \p MF.
1112   ///
1113   /// The waves-per-execution-unit based maximum is also returned when the
1114   /// explicitly requested value cannot be converted to integer, violates
1115   /// subtarget's specifications, or does not meet that requirement.
1116   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1117   /// Presumably appends post-RA DAG mutations to \p Mutations (see definition).
1118   void getPostRAMutations(
1119       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1120       const override;
1121   /// \returns Caller-owned ScheduleDAGMutation; defined out of line.
1122   std::unique_ptr<ScheduleDAGMutation>
1123   createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1124   /// \returns true if getWavefrontSize() reports a wavefront size of 32.
1125   bool isWave32() const {
1126     return getWavefrontSize() == 32;
1127   }
1128   /// \returns true if getWavefrontSize() reports a wavefront size of 64.
1129   bool isWave64() const {
1130     return getWavefrontSize() == 64;
1131   }
1132   /// \returns The result of getBoolRC() on this subtarget's register info.
1133   const TargetRegisterClass *getBoolRC() const {
1134     return getRegisterInfo()->getBoolRC();
1135   }
1136 
1137   /// \returns Maximum number of work groups per compute unit for the given
1138   /// \p FlatWorkGroupSize, as reported by AMDGPU::IsaInfo for this subtarget.
1139   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1140     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1141   }
1142 
1143   /// \returns Minimum flat work group size, as reported by AMDGPU::IsaInfo.
1144   unsigned getMinFlatWorkGroupSize() const override {
1145     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1146   }
1147 
1148   /// \returns Maximum flat work group size, as reported by AMDGPU::IsaInfo.
1149   unsigned getMaxFlatWorkGroupSize() const override {
1150     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1151   }
1152 
1153   /// \returns Number of waves per execution unit that AMDGPU::IsaInfo reports
1154   /// as required to support the given \p FlatWorkGroupSize.
1155   unsigned
1156   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1157     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1158   }
1159 
1160   /// \returns Minimum number of waves per execution unit, as reported by
1161   /// AMDGPU::IsaInfo for this subtarget.
1162   unsigned getMinWavesPerEU() const override {
1163     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1164   }
1165   /// Override, defined out of line; may modify \p Dep (taken by reference).
1166   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1167                              SDep &Dep) const override;
1168 };
1169 
1170 } // end namespace llvm
1171 
1172 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1173