xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Base class for AMDGPU specific classes of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/IR/CallingConv.h"
19 #include "llvm/Support/Alignment.h"
20 #include "llvm/TargetParser/Triple.h"
21 
22 namespace llvm {
23 
24 enum AMDGPUDwarfFlavour : unsigned;
25 class Function;
26 class Instruction;
27 class MachineFunction;
28 class TargetMachine;
29 
30 class AMDGPUSubtarget {
31 public:
32   enum Generation {
33     INVALID = 0,
34     R600 = 1,
35     R700 = 2,
36     EVERGREEN = 3,
37     NORTHERN_ISLANDS = 4,
38     SOUTHERN_ISLANDS = 5,
39     SEA_ISLANDS = 6,
40     VOLCANIC_ISLANDS = 7,
41     GFX9 = 8,
42     GFX10 = 9,
43     GFX11 = 10,
44     GFX12 = 11,
45   };
46 
47 private:
48   Triple TargetTriple;
49 
50 protected:
51   bool GCN3Encoding = false;
52   bool Has16BitInsts = false;
53   bool HasTrue16BitInsts = false;
54   bool HasFP8ConversionScaleInsts = false;
55   bool HasBF8ConversionScaleInsts = false;
56   bool HasFP4ConversionScaleInsts = false;
57   bool HasFP6BF6ConversionScaleInsts = false;
58   bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
59   bool HasCvtPkF16F32Inst = false;
60   bool HasF32ToF16BF16ConversionSRInsts = false;
61   bool EnableRealTrue16Insts = false;
62   bool HasBF16TransInsts = false;
63   bool HasBF16ConversionInsts = false;
64   bool HasMadMixInsts = false;
65   bool HasMadMacF32Insts = false;
66   bool HasDsSrc2Insts = false;
67   bool HasSDWA = false;
68   bool HasVOP3PInsts = false;
69   bool HasMulI24 = true;
70   bool HasMulU24 = true;
71   bool HasSMulHi = false;
72   bool HasInv2PiInlineImm = false;
73   bool HasFminFmaxLegacy = true;
74   bool EnablePromoteAlloca = false;
75   bool HasTrigReducedRange = false;
76   bool FastFMAF32 = false;
77   unsigned EUsPerCU = 4;
78   unsigned MaxWavesPerEU = 10;
79   unsigned LocalMemorySize = 0;
80   unsigned AddressableLocalMemorySize = 0;
81   char WavefrontSizeLog2 = 0;
82 
83 public:
84   AMDGPUSubtarget(Triple TT);
85 
86   static const AMDGPUSubtarget &get(const MachineFunction &MF);
87   static const AMDGPUSubtarget &get(const TargetMachine &TM,
88                                     const Function &F);
89 
90   /// \returns Default range flat work group size for a calling convention.
91   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
92 
93   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
94   /// for function \p F, or minimum/maximum flat work group sizes explicitly
95   /// requested using "amdgpu-flat-work-group-size" attribute attached to
96   /// function \p F.
97   ///
98   /// \returns Subtarget's default values if explicitly requested values cannot
99   /// be converted to integer, or violate subtarget's specifications.
100   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
101 
102   /// \returns Subtarget's default pair of minimum/maximum number of waves per
103   /// execution unit for function \p F, or minimum/maximum number of waves per
104   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
105   /// attached to function \p F.
106   ///
107   /// \returns Subtarget's default values if explicitly requested values cannot
108   /// be converted to integer, violate subtarget's specifications, or are not
109   /// compatible with minimum/maximum number of waves limited by flat work group
110   /// size, register usage, and/or lds usage.
111   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
112 
113   /// Overload which uses the specified values for the flat work group sizes,
114   /// rather than querying the function itself. \p FlatWorkGroupSizes Should
115   /// correspond to the function's value for getFlatWorkGroupSizes.
116   std::pair<unsigned, unsigned>
117   getWavesPerEU(const Function &F,
118                 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
119 
120   /// Overload which uses the specified values for the flat workgroup sizes and
121   /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
122   /// should correspond to the function's value for getFlatWorkGroupSizes and \p
123   /// LDSBytes to the per-workgroup LDS allocation.
124   std::pair<unsigned, unsigned>
125   getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
126                 unsigned LDSBytes, const Function &F) const;
127 
128   /// Returns the target minimum/maximum number of waves per EU. This is based
129   /// on the minimum/maximum number of \p RequestedWavesPerEU and further
130   /// limited by the maximum achievable occupancy derived from the range of \p
131   /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
132   std::pair<unsigned, unsigned>
133   getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
134                          std::pair<unsigned, unsigned> FlatWorkGroupSizes,
135                          unsigned LDSBytes) const;
136 
137   /// Return the amount of LDS that can be used that will not restrict the
138   /// occupancy lower than WaveCount.
139   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
140                                            const Function &) const;
141 
142   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
143   /// be achieved when the only function running on a CU is \p F and each
144   /// workgroup running the function requires \p LDSBytes bytes of LDS space.
145   /// This notably depends on the range of allowed flat group sizes for the
146   /// function and hardware characteristics.
147   std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,const Function & F)148   getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
149     return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
150   }
151 
152   /// Overload which uses the specified values for the flat work group sizes,
153   /// rather than querying the function itself. \p FlatWorkGroupSizes should
154   /// correspond to the function's value for getFlatWorkGroupSizes.
155   std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
156       uint32_t LDSBytes,
157       std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
158 
159   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
160   /// be achieved when the only function running on a CU is \p MF. This notably
161   /// depends on the range of allowed flat group sizes for the function, the
162   /// amount of per-workgroup LDS space required by the function, and hardware
163   /// characteristics.
164   std::pair<unsigned, unsigned>
165   getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
166 
isAmdHsaOS()167   bool isAmdHsaOS() const {
168     return TargetTriple.getOS() == Triple::AMDHSA;
169   }
170 
isAmdPalOS()171   bool isAmdPalOS() const {
172     return TargetTriple.getOS() == Triple::AMDPAL;
173   }
174 
isMesa3DOS()175   bool isMesa3DOS() const {
176     return TargetTriple.getOS() == Triple::Mesa3D;
177   }
178 
179   bool isMesaKernel(const Function &F) const;
180 
isAmdHsaOrMesa(const Function & F)181   bool isAmdHsaOrMesa(const Function &F) const {
182     return isAmdHsaOS() || isMesaKernel(F);
183   }
184 
isGCN()185   bool isGCN() const { return TargetTriple.isAMDGCN(); }
186 
isGCN3Encoding()187   bool isGCN3Encoding() const {
188     return GCN3Encoding;
189   }
190 
has16BitInsts()191   bool has16BitInsts() const {
192     return Has16BitInsts;
193   }
194 
195   /// Return true if the subtarget supports True16 instructions.
hasTrue16BitInsts()196   bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
197 
198   /// Return true if real (non-fake) variants of True16 instructions using
199   /// 16-bit registers should be code-generated. Fake True16 instructions are
200   /// identical to non-fake ones except that they take 32-bit registers as
201   /// operands and always use their low halves.
202   // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
203   // supported and the support for fake True16 instructions is removed.
204   bool useRealTrue16Insts() const;
205 
hasBF16TransInsts()206   bool hasBF16TransInsts() const { return HasBF16TransInsts; }
207 
hasBF16ConversionInsts()208   bool hasBF16ConversionInsts() const {
209     return HasBF16ConversionInsts;
210   }
211 
hasMadMixInsts()212   bool hasMadMixInsts() const {
213     return HasMadMixInsts;
214   }
215 
hasFP8ConversionScaleInsts()216   bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
217 
hasBF8ConversionScaleInsts()218   bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
219 
hasFP4ConversionScaleInsts()220   bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
221 
hasFP6BF6ConversionScaleInsts()222   bool hasFP6BF6ConversionScaleInsts() const {
223     return HasFP6BF6ConversionScaleInsts;
224   }
225 
hasF16BF16ToFP6BF6ConversionScaleInsts()226   bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
227     return HasF16BF16ToFP6BF6ConversionScaleInsts;
228   }
229 
hasCvtPkF16F32Inst()230   bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
231 
hasF32ToF16BF16ConversionSRInsts()232   bool hasF32ToF16BF16ConversionSRInsts() const {
233     return HasF32ToF16BF16ConversionSRInsts;
234   }
235 
hasMadMacF32Insts()236   bool hasMadMacF32Insts() const {
237     return HasMadMacF32Insts || !isGCN();
238   }
239 
hasDsSrc2Insts()240   bool hasDsSrc2Insts() const {
241     return HasDsSrc2Insts;
242   }
243 
hasSDWA()244   bool hasSDWA() const {
245     return HasSDWA;
246   }
247 
hasVOP3PInsts()248   bool hasVOP3PInsts() const {
249     return HasVOP3PInsts;
250   }
251 
hasMulI24()252   bool hasMulI24() const {
253     return HasMulI24;
254   }
255 
hasMulU24()256   bool hasMulU24() const {
257     return HasMulU24;
258   }
259 
hasSMulHi()260   bool hasSMulHi() const {
261     return HasSMulHi;
262   }
263 
hasInv2PiInlineImm()264   bool hasInv2PiInlineImm() const {
265     return HasInv2PiInlineImm;
266   }
267 
hasFminFmaxLegacy()268   bool hasFminFmaxLegacy() const {
269     return HasFminFmaxLegacy;
270   }
271 
hasTrigReducedRange()272   bool hasTrigReducedRange() const {
273     return HasTrigReducedRange;
274   }
275 
hasFastFMAF32()276   bool hasFastFMAF32() const {
277     return FastFMAF32;
278   }
279 
isPromoteAllocaEnabled()280   bool isPromoteAllocaEnabled() const {
281     return EnablePromoteAlloca;
282   }
283 
getWavefrontSize()284   unsigned getWavefrontSize() const {
285     return 1 << WavefrontSizeLog2;
286   }
287 
getWavefrontSizeLog2()288   unsigned getWavefrontSizeLog2() const {
289     return WavefrontSizeLog2;
290   }
291 
292   /// Return the maximum number of bytes of LDS available for all workgroups
293   /// running on the same WGP or CU.
294   /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
295   /// limited to 64k.
getLocalMemorySize()296   unsigned getLocalMemorySize() const {
297     return LocalMemorySize;
298   }
299 
300   /// Return the maximum number of bytes of LDS that can be allocated to a
301   /// single workgroup.
302   /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
303   /// 128k in total.
getAddressableLocalMemorySize()304   unsigned getAddressableLocalMemorySize() const {
305     return AddressableLocalMemorySize;
306   }
307 
308   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
309   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
310   /// CU mode into account.
getEUsPerCU()311   unsigned getEUsPerCU() const { return EUsPerCU; }
312 
getAlignmentForImplicitArgPtr()313   Align getAlignmentForImplicitArgPtr() const {
314     return isAmdHsaOS() ? Align(8) : Align(4);
315   }
316 
317   /// Returns the offset in bytes from the start of the input buffer
318   ///        of the first explicit kernel argument.
getExplicitKernelArgOffset()319   unsigned getExplicitKernelArgOffset() const {
320     switch (TargetTriple.getOS()) {
321     case Triple::AMDHSA:
322     case Triple::AMDPAL:
323     case Triple::Mesa3D:
324       return 0;
325     case Triple::UnknownOS:
326     default:
327       // For legacy reasons unknown/other is treated as a different version of
328       // mesa.
329       return 36;
330     }
331 
332     llvm_unreachable("invalid triple OS");
333   }
334 
335   /// \returns Maximum number of work groups per compute unit supported by the
336   /// subtarget and limited by given \p FlatWorkGroupSize.
337   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
338 
339   /// \returns Minimum flat work group size supported by the subtarget.
340   virtual unsigned getMinFlatWorkGroupSize() const = 0;
341 
342   /// \returns Maximum flat work group size supported by the subtarget.
343   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
344 
345   /// \returns Number of waves per execution unit required to support the given
346   /// \p FlatWorkGroupSize.
347   virtual unsigned
348   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
349 
350   /// \returns Minimum number of waves per execution unit supported by the
351   /// subtarget.
352   virtual unsigned getMinWavesPerEU() const = 0;
353 
354   /// \returns Maximum number of waves per execution unit supported by the
355   /// subtarget without any kind of limitation.
getMaxWavesPerEU()356   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
357 
358   /// Return the maximum workitem ID value in the function, for the given (0, 1,
359   /// 2) dimension.
360   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
361 
362   /// Return the number of work groups for the function.
363   SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
364 
365   /// Return true if only a single workitem can be active in a wave.
366   bool isSingleLaneExecution(const Function &Kernel) const;
367 
368   /// Creates value range metadata on an workitemid.* intrinsic call or load.
369   bool makeLIDRangeMetadata(Instruction *I) const;
370 
371   /// \returns Number of bytes of arguments that are passed to a shader or
372   /// kernel in addition to the explicit ones declared for the function.
373   unsigned getImplicitArgNumBytes(const Function &F) const;
374   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
375   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
376 
377   /// \returns Corresponding DWARF register number mapping flavour for the
378   /// \p WavefrontSize.
379   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
380 
381   virtual ~AMDGPUSubtarget() = default;
382 };
383 
384 } // end namespace llvm
385 
386 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
387