//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
/// execution begins. The number of registers available for preloading depends
/// on the number of free user SGPRs, up to the hardware's maximum limit.
/// Implicit arguments enabled in the kernel descriptor are allocated first,
/// followed by SGPRs used for preloaded kernel arguments. (Reference:
/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
/// Additionally, hidden kernel arguments may be preloaded, in which case they
/// are appended to the kernel signature after explicit arguments. Preloaded
/// arguments will be marked with `inreg`.
//
//===----------------------------------------------------------------------===//
20
21 #include "AMDGPU.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IR/PassManager.h"
29 #include "llvm/IR/Verifier.h"
30 #include "llvm/Pass.h"
31
32 #define DEBUG_TYPE "amdgpu-preload-kernel-arguments"
33
34 using namespace llvm;
35
// Command-line override for how many explicit kernel arguments to preload.
// With the default of 0, only arguments that already carry `inreg` are
// considered for preloading (see the hasInRegAttr check in
// markKernelArgsAsInreg).
static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
39
40 namespace {
41
// Legacy pass-manager wrapper for the kernarg preloading transformation.
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  // Target machine providing the GCN subtarget; may be null, in which case
  // runOnModule performs no work.
  const GCNTargetMachine *TM;

public:
  static char ID; // Pass identification, replacement for typeid.

  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};
56
// Gathers kernarg-preload information for one kernel: tracks how many user
// SGPRs are free for preloading and, when possible, clones the kernel with
// hidden (implicit) arguments appended to its signature so those can be
// preloaded as well.
class PreloadKernelArgInfo {
private:
  Function &F;               // Kernel being analyzed/rewritten.
  const GCNSubtarget &ST;    // Subtarget queried for user-SGPR availability.
  unsigned NumFreeUserSGPRs; // User SGPRs available for argument preloading.

  // Hidden (implicit) kernel arguments eligible for preloading, listed in
  // kernarg-segment layout order (see the HiddenArgs table below).
  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS // Sentinel; doubles as "not a recognized hidden arg".
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernearg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  // Layout table for the hidden arguments above, indexed by HiddenArg:
  // {byte offset from the implicitarg pointer, size in bytes, parameter name
  // used when the argument is appended to the cloned kernel's signature}.
  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  // Maps a byte offset from the implicitarg pointer to the hidden argument
  // that starts exactly there, or END_HIDDEN_ARGS if none does.
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  // Returns the integer IR type whose width matches the hidden argument's
  // size in the kernarg segment.
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Returns the parameter name used for the given hidden argument when it is
  // appended to the cloned kernel's signature.
  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated by
  // LastPreloadIndex. Currently preloading is only performed on the totality of
  // sequential data from the kernarg segment including implicit (hidden)
  // arguments. This means that all arguments up to the last preloaded argument
  // will also be preloaded even if that data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    // New parameter list: every existing parameter followed by one parameter
    // per hidden argument in [0, LastPreloadIndex].
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    // Insert the clone just before F, steal F's name and body, and rewire
    // uses of the old explicit arguments to the corresponding new ones.
    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    // Mark each appended hidden argument as preloaded (`inreg`) and tag it
    // with "amdgpu-hidden-argument" so a second run of this pass will skip
    // the function (see the guard in markKernelArgsAsInreg).
    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    // NFArg now points at the first appended (hidden) parameter.
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Caches the number of user SGPRs available to preload arguments for this
  // kernel, as reported by GCNUserSGPRUsageInfo.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  // Returns true if kernarg data ending at byte ExplicitArgOffset still fits
  // in the free user SGPRs (each SGPR holds 4 bytes of kernarg data).
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }

  // Try to allocate SGPRs to preload hidden kernel arguments. Scans F for
  // loads through amdgcn.implicitarg.ptr; if a leading run of recognized
  // hidden arguments fits in the free user SGPRs, clones F with those
  // arguments appended to its signature, rewires the loads to the new
  // parameters, and queues the replaced original function on
  // FunctionsToErase.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      // Only consider calls to the intrinsic inside F itself.
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          // Otherwise accept a constant-offset pointer computation based on
          // the implicitarg pointer.
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          // NOTE(review): only the first user of the offset pointer is
          // inspected here; a GEP with multiple users (or zero users) is not
          // fully handled — confirm this is intentional.
          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand handle merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        // A load is a candidate only if it starts exactly at a known hidden
        // argument's offset and matches its type.
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = llvm::find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    // Clone the kernel with every hidden argument up to (and including) the
    // last preloadable one appended, then replace each candidate load with
    // the matching new parameter.
    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      // Hidden args are the trailing (LastHiddenArgIndex + 1) parameters of
      // NF, in HiddenArg enum order.
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};
259
260 } // end anonymous namespace
261
char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

// Register the legacy pass under DEBUG_TYPE ("amdgpu-preload-kernel-arguments").
INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)
266
267 ModulePass *
createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine * TM)268 llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
269 return new AMDGPUPreloadKernelArgumentsLegacy(
270 static_cast<const GCNTargetMachine *>(TM));
271 }
272
// Legacy pass constructor; TM may be null, in which case runOnModule bails
// out without modifying the module.
AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}
276
markKernelArgsAsInreg(Module & M,const TargetMachine & TM)277 static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
278 SmallVector<Function *, 4> FunctionsToErase;
279 bool Changed = false;
280 for (auto &F : M) {
281 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
282 if (!ST.hasKernargPreload() ||
283 F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
284 continue;
285
286 PreloadKernelArgInfo PreloadInfo(F, ST);
287 uint64_t ExplicitArgOffset = 0;
288 const DataLayout &DL = F.getDataLayout();
289 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
290 unsigned NumPreloadsRequested = KernargPreloadCount;
291 unsigned NumPreloadedExplicitArgs = 0;
292 for (Argument &Arg : F.args()) {
293 // Avoid incompatible attributes and guard against running this pass
294 // twice.
295 //
296 // TODO: Preload byref kernel arguments
297 if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
298 Arg.hasAttribute("amdgpu-hidden-argument"))
299 break;
300
301 // Inreg may be pre-existing on some arguments, try to preload these.
302 if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
303 break;
304
305 // FIXME: Preload aggregates.
306 if (Arg.getType()->isAggregateType())
307 break;
308
309 Type *ArgTy = Arg.getType();
310 Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
311 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
312 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
313
314 if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
315 break;
316
317 Arg.addAttr(Attribute::InReg);
318 NumPreloadedExplicitArgs++;
319 if (NumPreloadsRequested > 0)
320 NumPreloadsRequested--;
321 }
322
323 // Only try preloading hidden arguments if we can successfully preload the
324 // last explicit argument.
325 if (NumPreloadedExplicitArgs == F.arg_size()) {
326 uint64_t ImplicitArgsBaseOffset =
327 alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
328 BaseOffset;
329 PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
330 FunctionsToErase);
331 }
332
333 Changed |= NumPreloadedExplicitArgs > 0;
334 }
335
336 // Erase cloned functions if we needed to update the kernel signature to
337 // support preloading hidden kernel arguments.
338 for (auto *F : FunctionsToErase)
339 F->eraseFromParent();
340
341 return Changed;
342 }
343
runOnModule(Module & M)344 bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
345 if (skipModule(M) || !TM)
346 return false;
347
348 return markKernelArgsAsInreg(M, *TM);
349 }
350
351 PreservedAnalyses
run(Module & M,ModuleAnalysisManager & AM)352 AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
353 bool Changed = markKernelArgsAsInreg(M, TM);
354 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
355 }
356