1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Base class for AMDGPU specific classes of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/IR/CallingConv.h" 19 #include "llvm/Support/Alignment.h" 20 #include "llvm/TargetParser/Triple.h" 21 22 namespace llvm { 23 24 enum AMDGPUDwarfFlavour : unsigned; 25 class Function; 26 class Instruction; 27 class MachineFunction; 28 class TargetMachine; 29 30 class AMDGPUSubtarget { 31 public: 32 enum Generation { 33 INVALID = 0, 34 R600 = 1, 35 R700 = 2, 36 EVERGREEN = 3, 37 NORTHERN_ISLANDS = 4, 38 SOUTHERN_ISLANDS = 5, 39 SEA_ISLANDS = 6, 40 VOLCANIC_ISLANDS = 7, 41 GFX9 = 8, 42 GFX10 = 9, 43 GFX11 = 10, 44 GFX12 = 11, 45 }; 46 47 private: 48 Triple TargetTriple; 49 50 protected: 51 bool GCN3Encoding = false; 52 bool Has16BitInsts = false; 53 bool HasTrue16BitInsts = false; 54 bool HasFP8ConversionScaleInsts = false; 55 bool HasBF8ConversionScaleInsts = false; 56 bool HasFP4ConversionScaleInsts = false; 57 bool HasFP6BF6ConversionScaleInsts = false; 58 bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; 59 bool HasCvtPkF16F32Inst = false; 60 bool HasF32ToF16BF16ConversionSRInsts = false; 61 bool EnableRealTrue16Insts = false; 62 bool HasBF16TransInsts = false; 63 bool HasBF16ConversionInsts = false; 64 bool HasMadMixInsts = false; 65 bool HasMadMacF32Insts = false; 66 bool HasDsSrc2Insts = false; 67 bool HasSDWA = false; 68 bool HasVOP3PInsts = false; 69 bool HasMulI24 = true; 70 bool HasMulU24 = true; 71 bool HasSMulHi = false; 72 bool HasInv2PiInlineImm = false; 73 bool HasFminFmaxLegacy = true; 74 bool EnablePromoteAlloca = false; 75 bool HasTrigReducedRange = false; 76 bool FastFMAF32 = false; 77 unsigned EUsPerCU = 4; 78 unsigned MaxWavesPerEU = 10; 79 unsigned LocalMemorySize = 0; 80 unsigned AddressableLocalMemorySize = 0; 81 char WavefrontSizeLog2 = 0; 82 83 public: 84 AMDGPUSubtarget(Triple TT); 85 86 static const AMDGPUSubtarget &get(const MachineFunction &MF); 87 static const AMDGPUSubtarget &get(const TargetMachine &TM, 88 const Function &F); 89 90 /// \returns Default range flat work group size for a calling convention. 91 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 92 93 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 94 /// for function \p F, or minimum/maximum flat work group sizes explicitly 95 /// requested using "amdgpu-flat-work-group-size" attribute attached to 96 /// function \p F. 97 /// 98 /// \returns Subtarget's default values if explicitly requested values cannot 99 /// be converted to integer, or violate subtarget's specifications. 100 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 101 102 /// \returns Subtarget's default pair of minimum/maximum number of waves per 103 /// execution unit for function \p F, or minimum/maximum number of waves per 104 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 105 /// attached to function \p F. 106 /// 107 /// \returns Subtarget's default values if explicitly requested values cannot 108 /// be converted to integer, violate subtarget's specifications, or are not 109 /// compatible with minimum/maximum number of waves limited by flat work group 110 /// size, register usage, and/or lds usage. 111 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 112 113 /// Overload which uses the specified values for the flat work group sizes, 114 /// rather than querying the function itself. \p FlatWorkGroupSizes Should 115 /// correspond to the function's value for getFlatWorkGroupSizes. 116 std::pair<unsigned, unsigned> 117 getWavesPerEU(const Function &F, 118 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 119 120 /// Overload which uses the specified values for the flat workgroup sizes and 121 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes 122 /// should correspond to the function's value for getFlatWorkGroupSizes and \p 123 /// LDSBytes to the per-workgroup LDS allocation. 124 std::pair<unsigned, unsigned> 125 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes, 126 unsigned LDSBytes, const Function &F) const; 127 128 /// Returns the target minimum/maximum number of waves per EU. This is based 129 /// on the minimum/maximum number of \p RequestedWavesPerEU and further 130 /// limited by the maximum achievable occupancy derived from the range of \p 131 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. 132 std::pair<unsigned, unsigned> 133 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU, 134 std::pair<unsigned, unsigned> FlatWorkGroupSizes, 135 unsigned LDSBytes) const; 136 137 /// Return the amount of LDS that can be used that will not restrict the 138 /// occupancy lower than WaveCount. 139 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 140 const Function &) const; 141 142 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 143 /// be achieved when the only function running on a CU is \p F and each 144 /// workgroup running the function requires \p LDSBytes bytes of LDS space. 145 /// This notably depends on the range of allowed flat group sizes for the 146 /// function and hardware characteristics. 147 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,const Function & F)148 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const { 149 return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F)); 150 } 151 152 /// Overload which uses the specified values for the flat work group sizes, 153 /// rather than querying the function itself. \p FlatWorkGroupSizes should 154 /// correspond to the function's value for getFlatWorkGroupSizes. 155 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes( 156 uint32_t LDSBytes, 157 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 158 159 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 160 /// be achieved when the only function running on a CU is \p MF. This notably 161 /// depends on the range of allowed flat group sizes for the function, the 162 /// amount of per-workgroup LDS space required by the function, and hardware 163 /// characteristics. 164 std::pair<unsigned, unsigned> 165 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const; 166 isAmdHsaOS()167 bool isAmdHsaOS() const { 168 return TargetTriple.getOS() == Triple::AMDHSA; 169 } 170 isAmdPalOS()171 bool isAmdPalOS() const { 172 return TargetTriple.getOS() == Triple::AMDPAL; 173 } 174 isMesa3DOS()175 bool isMesa3DOS() const { 176 return TargetTriple.getOS() == Triple::Mesa3D; 177 } 178 179 bool isMesaKernel(const Function &F) const; 180 isAmdHsaOrMesa(const Function & F)181 bool isAmdHsaOrMesa(const Function &F) const { 182 return isAmdHsaOS() || isMesaKernel(F); 183 } 184 isGCN()185 bool isGCN() const { return TargetTriple.isAMDGCN(); } 186 isGCN3Encoding()187 bool isGCN3Encoding() const { 188 return GCN3Encoding; 189 } 190 has16BitInsts()191 bool has16BitInsts() const { 192 return Has16BitInsts; 193 } 194 195 /// Return true if the subtarget supports True16 instructions. hasTrue16BitInsts()196 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } 197 198 /// Return true if real (non-fake) variants of True16 instructions using 199 /// 16-bit registers should be code-generated. Fake True16 instructions are 200 /// identical to non-fake ones except that they take 32-bit registers as 201 /// operands and always use their low halves. 202 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully 203 // supported and the support for fake True16 instructions is removed. 204 bool useRealTrue16Insts() const; 205 hasBF16TransInsts()206 bool hasBF16TransInsts() const { return HasBF16TransInsts; } 207 hasBF16ConversionInsts()208 bool hasBF16ConversionInsts() const { 209 return HasBF16ConversionInsts; 210 } 211 hasMadMixInsts()212 bool hasMadMixInsts() const { 213 return HasMadMixInsts; 214 } 215 hasFP8ConversionScaleInsts()216 bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; } 217 hasBF8ConversionScaleInsts()218 bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; } 219 hasFP4ConversionScaleInsts()220 bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } 221 hasFP6BF6ConversionScaleInsts()222 bool hasFP6BF6ConversionScaleInsts() const { 223 return HasFP6BF6ConversionScaleInsts; 224 } 225 hasF16BF16ToFP6BF6ConversionScaleInsts()226 bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { 227 return HasF16BF16ToFP6BF6ConversionScaleInsts; 228 } 229 hasCvtPkF16F32Inst()230 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } 231 hasF32ToF16BF16ConversionSRInsts()232 bool hasF32ToF16BF16ConversionSRInsts() const { 233 return HasF32ToF16BF16ConversionSRInsts; 234 } 235 hasMadMacF32Insts()236 bool hasMadMacF32Insts() const { 237 return HasMadMacF32Insts || !isGCN(); 238 } 239 hasDsSrc2Insts()240 bool hasDsSrc2Insts() const { 241 return HasDsSrc2Insts; 242 } 243 hasSDWA()244 bool hasSDWA() const { 245 return HasSDWA; 246 } 247 hasVOP3PInsts()248 bool hasVOP3PInsts() const { 249 return HasVOP3PInsts; 250 } 251 hasMulI24()252 bool hasMulI24() const { 253 return HasMulI24; 254 } 255 hasMulU24()256 bool hasMulU24() const { 257 return HasMulU24; 258 } 259 hasSMulHi()260 bool hasSMulHi() const { 261 return HasSMulHi; 262 } 263 hasInv2PiInlineImm()264 bool hasInv2PiInlineImm() const { 265 return HasInv2PiInlineImm; 266 } 267 hasFminFmaxLegacy()268 bool hasFminFmaxLegacy() const { 269 return HasFminFmaxLegacy; 270 } 271 hasTrigReducedRange()272 bool hasTrigReducedRange() const { 273 return HasTrigReducedRange; 274 } 275 hasFastFMAF32()276 bool hasFastFMAF32() const { 277 return FastFMAF32; 278 } 279 isPromoteAllocaEnabled()280 bool isPromoteAllocaEnabled() const { 281 return EnablePromoteAlloca; 282 } 283 getWavefrontSize()284 unsigned getWavefrontSize() const { 285 return 1 << WavefrontSizeLog2; 286 } 287 getWavefrontSizeLog2()288 unsigned getWavefrontSizeLog2() const { 289 return WavefrontSizeLog2; 290 } 291 292 /// Return the maximum number of bytes of LDS available for all workgroups 293 /// running on the same WGP or CU. 294 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is 295 /// limited to 64k. getLocalMemorySize()296 unsigned getLocalMemorySize() const { 297 return LocalMemorySize; 298 } 299 300 /// Return the maximum number of bytes of LDS that can be allocated to a 301 /// single workgroup. 302 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has 303 /// 128k in total. getAddressableLocalMemorySize()304 unsigned getAddressableLocalMemorySize() const { 305 return AddressableLocalMemorySize; 306 } 307 308 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the 309 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs. 310 /// CU mode into account. getEUsPerCU()311 unsigned getEUsPerCU() const { return EUsPerCU; } 312 getAlignmentForImplicitArgPtr()313 Align getAlignmentForImplicitArgPtr() const { 314 return isAmdHsaOS() ? Align(8) : Align(4); 315 } 316 317 /// Returns the offset in bytes from the start of the input buffer 318 /// of the first explicit kernel argument. getExplicitKernelArgOffset()319 unsigned getExplicitKernelArgOffset() const { 320 switch (TargetTriple.getOS()) { 321 case Triple::AMDHSA: 322 case Triple::AMDPAL: 323 case Triple::Mesa3D: 324 return 0; 325 case Triple::UnknownOS: 326 default: 327 // For legacy reasons unknown/other is treated as a different version of 328 // mesa. 329 return 36; 330 } 331 332 llvm_unreachable("invalid triple OS"); 333 } 334 335 /// \returns Maximum number of work groups per compute unit supported by the 336 /// subtarget and limited by given \p FlatWorkGroupSize. 337 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 338 339 /// \returns Minimum flat work group size supported by the subtarget. 340 virtual unsigned getMinFlatWorkGroupSize() const = 0; 341 342 /// \returns Maximum flat work group size supported by the subtarget. 343 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 344 345 /// \returns Number of waves per execution unit required to support the given 346 /// \p FlatWorkGroupSize. 347 virtual unsigned 348 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; 349 350 /// \returns Minimum number of waves per execution unit supported by the 351 /// subtarget. 352 virtual unsigned getMinWavesPerEU() const = 0; 353 354 /// \returns Maximum number of waves per execution unit supported by the 355 /// subtarget without any kind of limitation. getMaxWavesPerEU()356 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 357 358 /// Return the maximum workitem ID value in the function, for the given (0, 1, 359 /// 2) dimension. 360 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; 361 362 /// Return the number of work groups for the function. 363 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const; 364 365 /// Return true if only a single workitem can be active in a wave. 366 bool isSingleLaneExecution(const Function &Kernel) const; 367 368 /// Creates value range metadata on an workitemid.* intrinsic call or load. 369 bool makeLIDRangeMetadata(Instruction *I) const; 370 371 /// \returns Number of bytes of arguments that are passed to a shader or 372 /// kernel in addition to the explicit ones declared for the function. 373 unsigned getImplicitArgNumBytes(const Function &F) const; 374 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 375 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 376 377 /// \returns Corresponding DWARF register number mapping flavour for the 378 /// \p WavefrontSize. 379 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; 380 381 virtual ~AMDGPUSubtarget() = default; 382 }; 383 384 } // end namespace llvm 385 386 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 387