//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

/// Real True16 instructions are used only when the subtarget both has 16-bit
/// instruction support and has the real-true16 feature enabled.
bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  // Waves needed by one workgroup of maximum size, rounded up; clamp to at
  // least 1 so the divisions below are well-defined.
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  // Number of workgroups that must run concurrently on a CU to sustain NWaves
  // waves on each of its EUs.
  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  // LDS is shared by all workgroups resident on a CU, so split it evenly.
  return getLocalMemorySize() / WorkGroupsPerCU;
}

/// Computes the minimum and maximum achievable occupancy (waves per EU) for a
/// function using \p LDSBytes bytes of LDS with a flat workgroup size range of
/// \p FlatWorkGroupSizes.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // Queried LDS size may be larger than available on a CU, in which case we
  // consider the only achievable occupancy to be 1, in line with what we
  // consider the occupancy to be when the number of requested registers in a
  // particular bank is higher than the number of available ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  // For a given workgroup size, derive the triple (waves per workgroup,
  // workgroups per CU, waves per CU), with the workgroup count additionally
  // limited by LDS availability.
  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
  // generally true for the minimum group size. LDS or barrier resource
  // limitations can flip those minimums/maximums.
  const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the CU
  // is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) + 1 -
                                                  MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
}

/// Convenience overload that queries the LDS usage recorded in the machine
/// function info of \p MF.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

/// Returns the default flat workgroup size range for calling convention
/// \p CC: graphics shader stages default to at most one wavefront, all other
/// conventions default to the subtarget's full supported range.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

/// Returns the flat workgroup size range for \p F, honoring the
/// "amdgpu-flat-work-group-size" attribute when present and valid; any
/// invalid request falls back to the calling-convention default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Clamps \p RequestedWavesPerEU against the range achievable given
/// \p FlatWorkGroupSizes and \p LDSBytes of LDS usage; an out-of-range
/// request yields the computed default range instead.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> RequestedWavesPerEU,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
  // Default minimum/maximum number of waves per EU. The range of flat workgroup
  // sizes limits the achievable maximum, and we aim to support enough waves per
  // EU so that we can concurrently execute all waves of a single workgroup of
  // maximum size on a CU.
  std::pair<unsigned, unsigned> Default = {
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
      getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
  Default.first = std::min(Default.first, Default.second);

  // Make sure requested minimum is within the default range and lower than the
  // requested maximum. The latter must not violate target specification.
  if (RequestedWavesPerEU.first < Default.first ||
      RequestedWavesPerEU.first > Default.second ||
      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
      RequestedWavesPerEU.second > getMaxWavesPerEU())
    return Default;

  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
  RequestedWavesPerEU.second =
      std::min(RequestedWavesPerEU.second, Default.second);
  return RequestedWavesPerEU;
}

/// Returns the waves-per-EU range for \p F, deriving workgroup sizes and LDS
/// usage from the function's attributes.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
  // Minimum number of bytes allocated in the LDS (first element of the
  // "amdgpu-lds-size" attribute pair; defaults to 0 when absent).
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

/// Overload taking a precomputed flat workgroup size range; LDS usage is
/// still read from the "amdgpu-lds-size" attribute of \p F.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Minimum number of bytes allocated in the LDS.
  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                                      {0, UINT32_MAX}, true)
                          .first;
  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}

/// Core overload: reads the "amdgpu-waves-per-eu" attribute of \p F and
/// clamps it via getEffectiveWavesPerEU.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                               unsigned LDSBytes, const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}

/// Reads dimension \p Dim of the "reqd_work_group_size" metadata on
/// \p Kernel, or UINT_MAX when the metadata is absent or malformed.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

/// A Mesa kernel is any non-shader function when targeting the Mesa3D OS.
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

/// Returns the largest possible workitem ID in \p Dimension for \p Kernel,
/// preferring the required workgroup size metadata over the flat workgroup
/// size upper bound.
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

/// Returns true if every workitem ID in all three dimensions is provably 0,
/// i.e. the function runs as a single lane.
bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

/// Attaches range information to \p I bounding local ID / local size queries
/// by the enclosing kernel's workgroup size. Returns true if a range was
/// attached (as a return-value range attribute on calls, !range metadata
/// otherwise).
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to the dimension it queries; ID queries (workitem
      // id) get a [0, size) range, size queries get [MinSize, MaxSize].
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is 0-2 when one of the cases above matched; the UINT_MAX
      // sentinel fails this check.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

/// Returns the size in bytes of the implicit kernel argument segment for
/// \p F, honoring the "amdgpu-implicitarg-num-bytes" attribute override.
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

/// Computes the total size in bytes of \p F's explicit kernel arguments,
/// skipping hidden arguments, and reports the largest argument alignment
/// through \p MaxAlign.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // Hidden arguments live in the implicit segment, not the explicit one.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    // byref arguments are counted by their pointee type, not the pointer.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

/// Returns the total kernarg segment size for \p F: explicit arguments plus
/// any implicit argument area, rounded up to 4 bytes. Non-kernel calling
/// conventions get 0.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): when implicit args exist, the total is recomputed from
    // ExplicitArgBytes without ExplicitOffset — presumably intentional since
    // the implicit area subsumes the offset; confirm against the kernarg ABI.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

/// Selects the DWARF flavour matching the subtarget's wavefront size.
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

/// Returns the AMDGPU subtarget for \p MF: GCN for amdgcn triples, R600
/// otherwise.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

/// Overload resolving the subtarget through a target machine and function.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

// FIXME: This has no reason to be in subtarget
/// Reads the per-dimension "amdgpu-max-num-workgroups" attribute of \p F,
/// defaulting each of the 3 dimensions to UINT32_MAX.
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}