1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Base class for AMDGPU specific classes of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "llvm/IR/CallingConv.h" 18 #include "llvm/Support/Alignment.h" 19 #include "llvm/TargetParser/Triple.h" 20 21 namespace llvm { 22 23 enum AMDGPUDwarfFlavour : unsigned; 24 class Function; 25 class Instruction; 26 class MachineFunction; 27 class TargetMachine; 28 29 class AMDGPUSubtarget { 30 public: 31 enum Generation { 32 INVALID = 0, 33 R600 = 1, 34 R700 = 2, 35 EVERGREEN = 3, 36 NORTHERN_ISLANDS = 4, 37 SOUTHERN_ISLANDS = 5, 38 SEA_ISLANDS = 6, 39 VOLCANIC_ISLANDS = 7, 40 GFX9 = 8, 41 GFX10 = 9, 42 GFX11 = 10, 43 GFX12 = 11, 44 }; 45 46 private: 47 Triple TargetTriple; 48 49 protected: 50 bool GCN3Encoding = false; 51 bool Has16BitInsts = false; 52 bool HasTrue16BitInsts = false; 53 bool EnableRealTrue16Insts = false; 54 bool HasMadMixInsts = false; 55 bool HasMadMacF32Insts = false; 56 bool HasDsSrc2Insts = false; 57 bool HasSDWA = false; 58 bool HasVOP3PInsts = false; 59 bool HasMulI24 = true; 60 bool HasMulU24 = true; 61 bool HasSMulHi = false; 62 bool HasInv2PiInlineImm = false; 63 bool HasFminFmaxLegacy = true; 64 bool EnablePromoteAlloca = false; 65 bool HasTrigReducedRange = false; 66 bool FastFMAF32 = false; 67 unsigned EUsPerCU = 4; 68 unsigned MaxWavesPerEU = 10; 69 unsigned LocalMemorySize = 0; 70 unsigned AddressableLocalMemorySize = 0; 71 char WavefrontSizeLog2 = 0; 72 73 public: 74 AMDGPUSubtarget(Triple TT); 75 76 static const AMDGPUSubtarget &get(const MachineFunction &MF); 77 static const AMDGPUSubtarget &get(const TargetMachine &TM, 78 const Function &F); 79 80 /// \returns Default range flat work group size for a calling convention. 81 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 82 83 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 84 /// for function \p F, or minimum/maximum flat work group sizes explicitly 85 /// requested using "amdgpu-flat-work-group-size" attribute attached to 86 /// function \p F. 87 /// 88 /// \returns Subtarget's default values if explicitly requested values cannot 89 /// be converted to integer, or violate subtarget's specifications. 90 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 91 92 /// \returns Subtarget's default pair of minimum/maximum number of waves per 93 /// execution unit for function \p F, or minimum/maximum number of waves per 94 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 95 /// attached to function \p F. 96 /// 97 /// \returns Subtarget's default values if explicitly requested values cannot 98 /// be converted to integer, violate subtarget's specifications, or are not 99 /// compatible with minimum/maximum number of waves limited by flat work group 100 /// size, register usage, and/or lds usage. getWavesPerEU(const Function & F)101 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const { 102 // Default/requested minimum/maximum flat work group sizes. 103 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 104 return getWavesPerEU(F, FlatWorkGroupSizes); 105 } 106 107 /// Overload which uses the specified values for the flat work group sizes, 108 /// rather than querying the function itself. \p FlatWorkGroupSizes Should 109 /// correspond to the function's value for getFlatWorkGroupSizes. 110 std::pair<unsigned, unsigned> 111 getWavesPerEU(const Function &F, 112 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 113 std::pair<unsigned, unsigned> getEffectiveWavesPerEU( 114 std::pair<unsigned, unsigned> WavesPerEU, 115 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 116 117 /// Return the amount of LDS that can be used that will not restrict the 118 /// occupancy lower than WaveCount. 119 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 120 const Function &) const; 121 122 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 123 /// the given LDS memory size is the only constraint. 124 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 125 126 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 127 isAmdHsaOS()128 bool isAmdHsaOS() const { 129 return TargetTriple.getOS() == Triple::AMDHSA; 130 } 131 isAmdPalOS()132 bool isAmdPalOS() const { 133 return TargetTriple.getOS() == Triple::AMDPAL; 134 } 135 isMesa3DOS()136 bool isMesa3DOS() const { 137 return TargetTriple.getOS() == Triple::Mesa3D; 138 } 139 140 bool isMesaKernel(const Function &F) const; 141 isAmdHsaOrMesa(const Function & F)142 bool isAmdHsaOrMesa(const Function &F) const { 143 return isAmdHsaOS() || isMesaKernel(F); 144 } 145 isGCN()146 bool isGCN() const { 147 return TargetTriple.getArch() == Triple::amdgcn; 148 } 149 isGCN3Encoding()150 bool isGCN3Encoding() const { 151 return GCN3Encoding; 152 } 153 has16BitInsts()154 bool has16BitInsts() const { 155 return Has16BitInsts; 156 } 157 158 /// Return true if the subtarget supports True16 instructions. hasTrue16BitInsts()159 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } 160 161 /// Return true if real (non-fake) variants of True16 instructions using 162 /// 16-bit registers should be code-generated. Fake True16 instructions are 163 /// identical to non-fake ones except that they take 32-bit registers as 164 /// operands and always use their low halves. 165 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully 166 // supported and the support for fake True16 instructions is removed. 167 bool useRealTrue16Insts() const; 168 hasMadMixInsts()169 bool hasMadMixInsts() const { 170 return HasMadMixInsts; 171 } 172 hasMadMacF32Insts()173 bool hasMadMacF32Insts() const { 174 return HasMadMacF32Insts || !isGCN(); 175 } 176 hasDsSrc2Insts()177 bool hasDsSrc2Insts() const { 178 return HasDsSrc2Insts; 179 } 180 hasSDWA()181 bool hasSDWA() const { 182 return HasSDWA; 183 } 184 hasVOP3PInsts()185 bool hasVOP3PInsts() const { 186 return HasVOP3PInsts; 187 } 188 hasMulI24()189 bool hasMulI24() const { 190 return HasMulI24; 191 } 192 hasMulU24()193 bool hasMulU24() const { 194 return HasMulU24; 195 } 196 hasSMulHi()197 bool hasSMulHi() const { 198 return HasSMulHi; 199 } 200 hasInv2PiInlineImm()201 bool hasInv2PiInlineImm() const { 202 return HasInv2PiInlineImm; 203 } 204 hasFminFmaxLegacy()205 bool hasFminFmaxLegacy() const { 206 return HasFminFmaxLegacy; 207 } 208 hasTrigReducedRange()209 bool hasTrigReducedRange() const { 210 return HasTrigReducedRange; 211 } 212 hasFastFMAF32()213 bool hasFastFMAF32() const { 214 return FastFMAF32; 215 } 216 isPromoteAllocaEnabled()217 bool isPromoteAllocaEnabled() const { 218 return EnablePromoteAlloca; 219 } 220 getWavefrontSize()221 unsigned getWavefrontSize() const { 222 return 1 << WavefrontSizeLog2; 223 } 224 getWavefrontSizeLog2()225 unsigned getWavefrontSizeLog2() const { 226 return WavefrontSizeLog2; 227 } 228 getLocalMemorySize()229 unsigned getLocalMemorySize() const { 230 return LocalMemorySize; 231 } 232 getAddressableLocalMemorySize()233 unsigned getAddressableLocalMemorySize() const { 234 return AddressableLocalMemorySize; 235 } 236 237 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the 238 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs. 239 /// CU mode into account. getEUsPerCU()240 unsigned getEUsPerCU() const { return EUsPerCU; } 241 getAlignmentForImplicitArgPtr()242 Align getAlignmentForImplicitArgPtr() const { 243 return isAmdHsaOS() ? Align(8) : Align(4); 244 } 245 246 /// Returns the offset in bytes from the start of the input buffer 247 /// of the first explicit kernel argument. getExplicitKernelArgOffset()248 unsigned getExplicitKernelArgOffset() const { 249 switch (TargetTriple.getOS()) { 250 case Triple::AMDHSA: 251 case Triple::AMDPAL: 252 case Triple::Mesa3D: 253 return 0; 254 case Triple::UnknownOS: 255 default: 256 // For legacy reasons unknown/other is treated as a different version of 257 // mesa. 258 return 36; 259 } 260 261 llvm_unreachable("invalid triple OS"); 262 } 263 264 /// \returns Maximum number of work groups per compute unit supported by the 265 /// subtarget and limited by given \p FlatWorkGroupSize. 266 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 267 268 /// \returns Minimum flat work group size supported by the subtarget. 269 virtual unsigned getMinFlatWorkGroupSize() const = 0; 270 271 /// \returns Maximum flat work group size supported by the subtarget. 272 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 273 274 /// \returns Number of waves per execution unit required to support the given 275 /// \p FlatWorkGroupSize. 276 virtual unsigned 277 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; 278 279 /// \returns Minimum number of waves per execution unit supported by the 280 /// subtarget. 281 virtual unsigned getMinWavesPerEU() const = 0; 282 283 /// \returns Maximum number of waves per execution unit supported by the 284 /// subtarget without any kind of limitation. getMaxWavesPerEU()285 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 286 287 /// Return the maximum workitem ID value in the function, for the given (0, 1, 288 /// 2) dimension. 289 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; 290 291 /// Return the number of work groups for the function. 292 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const; 293 294 /// Return true if only a single workitem can be active in a wave. 295 bool isSingleLaneExecution(const Function &Kernel) const; 296 297 /// Creates value range metadata on an workitemid.* intrinsic call or load. 298 bool makeLIDRangeMetadata(Instruction *I) const; 299 300 /// \returns Number of bytes of arguments that are passed to a shader or 301 /// kernel in addition to the explicit ones declared for the function. 302 unsigned getImplicitArgNumBytes(const Function &F) const; 303 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 304 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 305 306 /// \returns Corresponding DWARF register number mapping flavour for the 307 /// \p WavefrontSize. 308 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; 309 310 virtual ~AMDGPUSubtarget() = default; 311 }; 312 313 } // end namespace llvm 314 315 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 316