1 //===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file This pass preloads kernel arguments into user_data SGPRs before kernel 10 /// execution begins. The number of registers available for preloading depends 11 /// on the number of free user SGPRs, up to the hardware's maximum limit. 12 /// Implicit arguments enabled in the kernel descriptor are allocated first, 13 /// followed by SGPRs used for preloaded kernel arguments. (Reference: 14 /// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state) 15 /// Additionally, hidden kernel arguments may be preloaded, in which case they 16 /// are appended to the kernel signature after explicit arguments. Preloaded 17 /// arguments will be marked with `inreg`. 18 // 19 //===----------------------------------------------------------------------===// 20 21 #include "AMDGPU.h" 22 #include "AMDGPUTargetMachine.h" 23 #include "llvm/Analysis/ValueTracking.h" 24 #include "llvm/IR/Function.h" 25 #include "llvm/IR/Instructions.h" 26 #include "llvm/IR/IntrinsicsAMDGPU.h" 27 #include "llvm/IR/Module.h" 28 #include "llvm/IR/PassManager.h" 29 #include "llvm/IR/Verifier.h" 30 #include "llvm/Pass.h" 31 32 #define DEBUG_TYPE "amdgpu-preload-kernel-arguments" 33 34 using namespace llvm; 35 36 static cl::opt<unsigned> KernargPreloadCount( 37 "amdgpu-kernarg-preload-count", 38 cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); 39 40 namespace { 41 42 class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { 43 const GCNTargetMachine *TM; 44 45 public: 46 static char ID; 47 explicit AMDGPUPreloadKernelArgumentsLegacy( 48 const GCNTargetMachine *TM = nullptr); 49 50 StringRef getPassName() const override { 51 return "AMDGPU Preload Kernel Arguments"; 52 } 53 54 bool runOnModule(Module &M) override; 55 }; 56 57 class PreloadKernelArgInfo { 58 private: 59 Function &F; 60 const GCNSubtarget &ST; 61 unsigned NumFreeUserSGPRs; 62 63 enum HiddenArg : unsigned { 64 HIDDEN_BLOCK_COUNT_X, 65 HIDDEN_BLOCK_COUNT_Y, 66 HIDDEN_BLOCK_COUNT_Z, 67 HIDDEN_GROUP_SIZE_X, 68 HIDDEN_GROUP_SIZE_Y, 69 HIDDEN_GROUP_SIZE_Z, 70 HIDDEN_REMAINDER_X, 71 HIDDEN_REMAINDER_Y, 72 HIDDEN_REMAINDER_Z, 73 END_HIDDEN_ARGS 74 }; 75 76 // Stores information about a specific hidden argument. 77 struct HiddenArgInfo { 78 // Offset in bytes from the location in the kernearg segment pointed to by 79 // the implicitarg pointer. 80 uint8_t Offset; 81 // The size of the hidden argument in bytes. 82 uint8_t Size; 83 // The name of the hidden argument in the kernel signature. 84 const char *Name; 85 }; 86 87 static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { 88 {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, 89 {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, 90 {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, 91 {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, 92 {22, 2, "_hidden_remainder_z"}}; 93 94 static HiddenArg getHiddenArgFromOffset(unsigned Offset) { 95 for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) 96 if (HiddenArgs[I].Offset == Offset) 97 return static_cast<HiddenArg>(I); 98 99 return END_HIDDEN_ARGS; 100 } 101 102 static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { 103 if (HA < END_HIDDEN_ARGS) 104 return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); 105 106 llvm_unreachable("Unexpected hidden argument."); 107 } 108 109 static const char *getHiddenArgName(HiddenArg HA) { 110 if (HA < END_HIDDEN_ARGS) 111 return HiddenArgs[HA].Name; 112 113 llvm_unreachable("Unexpected hidden argument."); 114 } 115 116 // Clones the function after adding implicit arguments to the argument list 117 // and returns the new updated function. Preloaded implicit arguments are 118 // added up to and including the last one that will be preloaded, indicated by 119 // LastPreloadIndex. Currently preloading is only performed on the totality of 120 // sequential data from the kernarg segment including implicit (hidden) 121 // arguments. This means that all arguments up to the last preloaded argument 122 // will also be preloaded even if that data is unused. 123 Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { 124 FunctionType *FT = F.getFunctionType(); 125 LLVMContext &Ctx = F.getParent()->getContext(); 126 SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end()); 127 for (unsigned I = 0; I <= LastPreloadIndex; ++I) 128 FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); 129 130 FunctionType *NFT = 131 FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); 132 Function *NF = 133 Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); 134 135 NF->copyAttributesFrom(&F); 136 NF->copyMetadata(&F, 0); 137 138 F.getParent()->getFunctionList().insert(F.getIterator(), NF); 139 NF->takeName(&F); 140 NF->splice(NF->begin(), &F); 141 142 Function::arg_iterator NFArg = NF->arg_begin(); 143 for (Argument &Arg : F.args()) { 144 Arg.replaceAllUsesWith(&*NFArg); 145 NFArg->takeName(&Arg); 146 ++NFArg; 147 } 148 149 AttrBuilder AB(Ctx); 150 AB.addAttribute(Attribute::InReg); 151 AB.addAttribute("amdgpu-hidden-argument"); 152 AttributeList AL = NF->getAttributes(); 153 for (unsigned I = 0; I <= LastPreloadIndex; ++I) { 154 AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); 155 NFArg++->setName(getHiddenArgName(HiddenArg(I))); 156 } 157 158 NF->setAttributes(AL); 159 F.replaceAllUsesWith(NF); 160 161 return NF; 162 } 163 164 public: 165 PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { 166 setInitialFreeUserSGPRsCount(); 167 } 168 169 // Returns the maximum number of user SGPRs that we have available to preload 170 // arguments. 171 void setInitialFreeUserSGPRsCount() { 172 GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); 173 NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs(); 174 } 175 176 bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) { 177 return ExplicitArgOffset <= NumFreeUserSGPRs * 4; 178 } 179 180 // Try to allocate SGPRs to preload hidden kernel arguments. 181 void 182 tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, 183 SmallVectorImpl<Function *> &FunctionsToErase) { 184 Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( 185 F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); 186 if (!ImplicitArgPtr) 187 return; 188 189 const DataLayout &DL = F.getParent()->getDataLayout(); 190 // Pair is the load and the load offset. 191 SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; 192 for (auto *U : ImplicitArgPtr->users()) { 193 Instruction *CI = dyn_cast<Instruction>(U); 194 if (!CI || CI->getParent()->getParent() != &F) 195 continue; 196 197 for (auto *U : CI->users()) { 198 int64_t Offset = 0; 199 auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr? 200 if (!Load) { 201 if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) 202 continue; 203 204 Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? 205 } 206 207 if (!Load || !Load->isSimple()) 208 continue; 209 210 // FIXME: Expand handle merged loads. 211 LLVMContext &Ctx = F.getParent()->getContext(); 212 Type *LoadTy = Load->getType(); 213 HiddenArg HA = getHiddenArgFromOffset(Offset); 214 if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) 215 continue; 216 217 ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); 218 } 219 } 220 221 if (ImplicitArgLoads.empty()) 222 return; 223 224 // Allocate loads in order of offset. We need to be sure that the implicit 225 // argument can actually be preloaded. 226 std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); 227 228 // If we fail to preload any implicit argument we know we don't have SGPRs 229 // to preload any subsequent ones with larger offsets. Find the first 230 // argument that we cannot preload. 231 auto *PreloadEnd = llvm::find_if( 232 ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) { 233 unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType()); 234 unsigned LoadOffset = Load.second; 235 if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize + 236 ImplicitArgsBaseOffset)) 237 return true; 238 239 return false; 240 }); 241 242 if (PreloadEnd == ImplicitArgLoads.begin()) 243 return; 244 245 unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); 246 Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); 247 assert(NF); 248 FunctionsToErase.push_back(&F); 249 for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { 250 LoadInst *LoadInst = I->first; 251 unsigned LoadOffset = I->second; 252 unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); 253 unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; 254 Argument *Arg = NF->getArg(Index); 255 LoadInst->replaceAllUsesWith(Arg); 256 } 257 } 258 }; 259 260 } // end anonymous namespace 261 262 char AMDGPUPreloadKernelArgumentsLegacy::ID = 0; 263 264 INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE, 265 "AMDGPU Preload Kernel Arguments", false, false) 266 267 ModulePass * 268 llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) { 269 return new AMDGPUPreloadKernelArgumentsLegacy( 270 static_cast<const GCNTargetMachine *>(TM)); 271 } 272 273 AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( 274 const GCNTargetMachine *TM) 275 : ModulePass(ID), TM(TM) {} 276 277 static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { 278 SmallVector<Function *, 4> FunctionsToErase; 279 bool Changed = false; 280 for (auto &F : M) { 281 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); 282 if (!ST.hasKernargPreload() || 283 F.getCallingConv() != CallingConv::AMDGPU_KERNEL) 284 continue; 285 286 PreloadKernelArgInfo PreloadInfo(F, ST); 287 uint64_t ExplicitArgOffset = 0; 288 const DataLayout &DL = F.getDataLayout(); 289 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); 290 unsigned NumPreloadsRequested = KernargPreloadCount; 291 unsigned NumPreloadedExplicitArgs = 0; 292 for (Argument &Arg : F.args()) { 293 // Avoid incompatible attributes and guard against running this pass 294 // twice. 295 // 296 // TODO: Preload byref kernel arguments 297 if (Arg.hasByRefAttr() || Arg.hasNestAttr() || 298 Arg.hasAttribute("amdgpu-hidden-argument")) 299 break; 300 301 // Inreg may be pre-existing on some arguments, try to preload these. 302 if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr()) 303 break; 304 305 // FIXME: Preload aggregates. 306 if (Arg.getType()->isAggregateType()) 307 break; 308 309 Type *ArgTy = Arg.getType(); 310 Align ABITypeAlign = DL.getABITypeAlign(ArgTy); 311 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 312 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; 313 314 if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset)) 315 break; 316 317 Arg.addAttr(Attribute::InReg); 318 NumPreloadedExplicitArgs++; 319 if (NumPreloadsRequested > 0) 320 NumPreloadsRequested--; 321 } 322 323 // Only try preloading hidden arguments if we can successfully preload the 324 // last explicit argument. 325 if (NumPreloadedExplicitArgs == F.arg_size()) { 326 uint64_t ImplicitArgsBaseOffset = 327 alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + 328 BaseOffset; 329 PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset, 330 FunctionsToErase); 331 } 332 333 Changed |= NumPreloadedExplicitArgs > 0; 334 } 335 336 // Erase cloned functions if we needed to update the kernel signature to 337 // support preloading hidden kernel arguments. 338 for (auto *F : FunctionsToErase) 339 F->eraseFromParent(); 340 341 return Changed; 342 } 343 344 bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) { 345 if (skipModule(M) || !TM) 346 return false; 347 348 return markKernelArgsAsInreg(M, *TM); 349 } 350 351 PreservedAnalyses 352 AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) { 353 bool Changed = markKernelArgsAsInreg(M, TM); 354 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); 355 } 356