xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass preloads kernel arguments into user_data SGPRs before kernel
10 /// execution begins. The number of registers available for preloading depends
11 /// on the number of free user SGPRs, up to the hardware's maximum limit.
12 /// Implicit arguments enabled in the kernel descriptor are allocated first,
13 /// followed by SGPRs used for preloaded kernel arguments. (Reference:
14 /// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
15 /// Additionally, hidden kernel arguments may be preloaded, in which case they
16 /// are appended to the kernel signature after explicit arguments. Preloaded
17 /// arguments will be marked with `inreg`.
18 //
19 //===----------------------------------------------------------------------===//
20 
21 #include "AMDGPU.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IR/PassManager.h"
29 #include "llvm/IR/Verifier.h"
30 #include "llvm/Pass.h"
31 
32 #define DEBUG_TYPE "amdgpu-preload-kernel-arguments"
33 
34 using namespace llvm;
35 
// Number of explicit kernel arguments to force-preload, in addition to any
// arguments already carrying `inreg` (see markKernelArgsAsInreg). Defaults
// to 0, i.e. only pre-existing `inreg` arguments are considered.
static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
39 
40 namespace {
41 
// Legacy pass-manager wrapper around the kernel-argument preloading
// transformation (see markKernelArgsAsInreg).
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  // Target machine used to query the per-function GCNSubtarget. May be
  // null when default-constructed (e.g. by the pass registry); in that
  // case runOnModule does nothing.
  const GCNTargetMachine *TM;

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};
56 
// Per-kernel helper that tracks how many user SGPRs are free for argument
// preloading and, when hidden (implicit) arguments can also be preloaded,
// rewrites the kernel signature by cloning the function with the hidden
// arguments appended as explicit `inreg` parameters.
class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  // Number of user SGPRs still unallocated and thus available for
  // preloading kernarg data (each SGPR holds 4 bytes).
  unsigned NumFreeUserSGPRs;

  // Hidden arguments eligible for preloading, in kernarg-segment order.
  // END_HIDDEN_ARGS doubles as the count and as a "not found" sentinel.
  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  // Table indexed by HiddenArg; offsets are contiguous (each entry starts
  // where the previous one ends).
  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  // Maps a byte offset from the implicitarg pointer to the hidden argument
  // that starts exactly there; returns END_HIDDEN_ARGS if no hidden
  // argument begins at \p Offset.
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  // Returns the integer IR type matching the hidden argument's size in bits.
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Returns the parameter name used for the hidden argument in the cloned
  // kernel signature.
  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated by
  // LastPreloadIndex. Currently preloading is only performed on the totality of
  // sequential data from the kernarg segment including implicit (hidden)
  // arguments. This means that all arguments up to the last preloaded argument
  // will also be preloaded even if that data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    // New parameter list = original parameters + hidden args [0,
    // LastPreloadIndex].
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    // Insert the clone next to the original, steal its name and its body
    // (splice moves the basic blocks rather than copying them).
    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    // Rewire uses of the old explicit arguments to the clone's arguments.
    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    // Mark the appended hidden arguments as preloaded ("inreg") and tag them
    // so a second run of the pass recognizes and skips them.
    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Computes and caches the maximum number of user SGPRs that we have
  // available to preload arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  // True if all kernarg data up to byte \p ExplicitArgOffset fits in the
  // free user SGPRs (4 bytes per SGPR).
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    // Hidden arguments are only reachable through amdgcn.implicitarg.ptr; if
    // the intrinsic is never declared in the module there is nothing to do.
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      // Only consider calls to the intrinsic inside this kernel.
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          // Otherwise accept only a constant-offset pointer derived directly
          // from the intrinsic call, loaded from by its first user.
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand handle merged loads.
        // The load must start exactly at a known hidden argument and have
        // its exact type; otherwise skip it.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = llvm::find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    // Clone the kernel with hidden args [0, LastHiddenArgIndex] appended,
    // then forward each preloadable load to the matching new argument. The
    // original function is erased later by the caller.
    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      // Hidden args occupy the trailing (LastHiddenArgIndex + 1) parameters
      // of NF, in HiddenArg order.
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};
259 
260 } // end anonymous namespace
261 
// Pass identification token for the legacy pass manager.
char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)
266 
267 ModulePass *
createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine * TM)268 llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
269   return new AMDGPUPreloadKernelArgumentsLegacy(
270       static_cast<const GCNTargetMachine *>(TM));
271 }
272 
// Constructs the legacy pass. \p TM may be null; runOnModule then bails
// out without modifying the module.
AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}
276 
markKernelArgsAsInreg(Module & M,const TargetMachine & TM)277 static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
278   SmallVector<Function *, 4> FunctionsToErase;
279   bool Changed = false;
280   for (auto &F : M) {
281     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
282     if (!ST.hasKernargPreload() ||
283         F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
284       continue;
285 
286     PreloadKernelArgInfo PreloadInfo(F, ST);
287     uint64_t ExplicitArgOffset = 0;
288     const DataLayout &DL = F.getDataLayout();
289     const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
290     unsigned NumPreloadsRequested = KernargPreloadCount;
291     unsigned NumPreloadedExplicitArgs = 0;
292     for (Argument &Arg : F.args()) {
293       // Avoid incompatible attributes and guard against running this pass
294       // twice.
295       //
296       // TODO: Preload byref kernel arguments
297       if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
298           Arg.hasAttribute("amdgpu-hidden-argument"))
299         break;
300 
301       // Inreg may be pre-existing on some arguments, try to preload these.
302       if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
303         break;
304 
305       // FIXME: Preload aggregates.
306       if (Arg.getType()->isAggregateType())
307         break;
308 
309       Type *ArgTy = Arg.getType();
310       Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
311       uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
312       ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
313 
314       if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
315         break;
316 
317       Arg.addAttr(Attribute::InReg);
318       NumPreloadedExplicitArgs++;
319       if (NumPreloadsRequested > 0)
320         NumPreloadsRequested--;
321     }
322 
323     // Only try preloading hidden arguments if we can successfully preload the
324     // last explicit argument.
325     if (NumPreloadedExplicitArgs == F.arg_size()) {
326       uint64_t ImplicitArgsBaseOffset =
327           alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
328           BaseOffset;
329       PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
330                                                 FunctionsToErase);
331     }
332 
333     Changed |= NumPreloadedExplicitArgs > 0;
334   }
335 
336   // Erase cloned functions if we needed to update the kernel signature to
337   // support preloading hidden kernel arguments.
338   for (auto *F : FunctionsToErase)
339     F->eraseFromParent();
340 
341   return Changed;
342 }
343 
runOnModule(Module & M)344 bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
345   if (skipModule(M) || !TM)
346     return false;
347 
348   return markKernelArgsAsInreg(M, *TM);
349 }
350 
351 PreservedAnalyses
run(Module & M,ModuleAnalysisManager & AM)352 AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
353   bool Changed = markKernelArgsAsInreg(M, TM);
354   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
355 }
356