xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp (revision 725a9f47324d42037db93c27ceb40d4956872f3e)
1 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
/// \file This pass attempts to make use of reqd_work_group_size metadata
10 /// to eliminate loads from the dispatch packet and to constant fold OpenCL
11 /// get_local_size-like functions.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/Analysis/ConstantFolding.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/CodeGen/Passes.h"
20 #include "llvm/CodeGen/TargetPassConfig.h"
21 #include "llvm/IR/Constants.h"
22 #include "llvm/IR/Function.h"
23 #include "llvm/IR/InstIterator.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/IR/IntrinsicsAMDGPU.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/Pass.h"
28 
29 #define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
30 
31 using namespace llvm;
32 
33 namespace {
34 
35 // Field offsets in hsa_kernel_dispatch_packet_t.
36 enum DispatchPackedOffsets {
37   WORKGROUP_SIZE_X = 4,
38   WORKGROUP_SIZE_Y = 6,
39   WORKGROUP_SIZE_Z = 8,
40 
41   GRID_SIZE_X = 12,
42   GRID_SIZE_Y = 16,
43   GRID_SIZE_Z = 20
44 };
45 
46 // Field offsets to implicit kernel argument pointer.
47 enum ImplicitArgOffsets {
48   HIDDEN_BLOCK_COUNT_X = 0,
49   HIDDEN_BLOCK_COUNT_Y = 4,
50   HIDDEN_BLOCK_COUNT_Z = 8,
51 
52   HIDDEN_GROUP_SIZE_X = 12,
53   HIDDEN_GROUP_SIZE_Y = 14,
54   HIDDEN_GROUP_SIZE_Z = 16,
55 
56   HIDDEN_REMAINDER_X = 18,
57   HIDDEN_REMAINDER_Y = 20,
58   HIDDEN_REMAINDER_Z = 22,
59 };
60 
61 class AMDGPULowerKernelAttributes : public ModulePass {
62 public:
63   static char ID;
64 
65   AMDGPULowerKernelAttributes() : ModulePass(ID) {}
66 
67   bool runOnModule(Module &M) override;
68 
69   StringRef getPassName() const override {
70     return "AMDGPU Kernel Attributes";
71   }
72 
73   void getAnalysisUsage(AnalysisUsage &AU) const override {
74     AU.setPreservesAll();
75  }
76 };
77 
78 Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
79   auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
80                                  : Intrinsic::amdgcn_dispatch_ptr;
81   StringRef Name = Intrinsic::getName(IntrinsicId);
82   return M.getFunction(Name);
83 }
84 
85 } // end anonymous namespace
86 
87 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
88   Function *F = CI->getParent()->getParent();
89 
90   auto MD = F->getMetadata("reqd_work_group_size");
91   const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
92 
93   const bool HasUniformWorkGroupSize =
94     F->getFnAttribute("uniform-work-group-size").getValueAsBool();
95 
96   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
97     return false;
98 
99   Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
100   Value *GroupSizes[3]  = {nullptr, nullptr, nullptr};
101   Value *Remainders[3]  = {nullptr, nullptr, nullptr};
102   Value *GridSizes[3]   = {nullptr, nullptr, nullptr};
103 
104   const DataLayout &DL = F->getParent()->getDataLayout();
105 
106   // We expect to see several GEP users, casted to the appropriate type and
107   // loaded.
108   for (User *U : CI->users()) {
109     if (!U->hasOneUse())
110       continue;
111 
112     int64_t Offset = 0;
113     auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
114     auto *BCI = dyn_cast<BitCastInst>(U);
115     if (!Load && !BCI) {
116       if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
117         continue;
118       Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
119       BCI = dyn_cast<BitCastInst>(*U->user_begin());
120     }
121 
122     if (BCI) {
123       if (!BCI->hasOneUse())
124         continue;
125       Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
126     }
127 
128     if (!Load || !Load->isSimple())
129       continue;
130 
131     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
132 
133     // TODO: Handle merged loads.
134     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
135       switch (Offset) {
136       case HIDDEN_BLOCK_COUNT_X:
137         if (LoadSize == 4)
138           BlockCounts[0] = Load;
139         break;
140       case HIDDEN_BLOCK_COUNT_Y:
141         if (LoadSize == 4)
142           BlockCounts[1] = Load;
143         break;
144       case HIDDEN_BLOCK_COUNT_Z:
145         if (LoadSize == 4)
146           BlockCounts[2] = Load;
147         break;
148       case HIDDEN_GROUP_SIZE_X:
149         if (LoadSize == 2)
150           GroupSizes[0] = Load;
151         break;
152       case HIDDEN_GROUP_SIZE_Y:
153         if (LoadSize == 2)
154           GroupSizes[1] = Load;
155         break;
156       case HIDDEN_GROUP_SIZE_Z:
157         if (LoadSize == 2)
158           GroupSizes[2] = Load;
159         break;
160       case HIDDEN_REMAINDER_X:
161         if (LoadSize == 2)
162           Remainders[0] = Load;
163         break;
164       case HIDDEN_REMAINDER_Y:
165         if (LoadSize == 2)
166           Remainders[1] = Load;
167         break;
168       case HIDDEN_REMAINDER_Z:
169         if (LoadSize == 2)
170           Remainders[2] = Load;
171         break;
172       default:
173         break;
174       }
175     } else { // Base is DispatchPtr.
176       switch (Offset) {
177       case WORKGROUP_SIZE_X:
178         if (LoadSize == 2)
179           GroupSizes[0] = Load;
180         break;
181       case WORKGROUP_SIZE_Y:
182         if (LoadSize == 2)
183           GroupSizes[1] = Load;
184         break;
185       case WORKGROUP_SIZE_Z:
186         if (LoadSize == 2)
187           GroupSizes[2] = Load;
188         break;
189       case GRID_SIZE_X:
190         if (LoadSize == 4)
191           GridSizes[0] = Load;
192         break;
193       case GRID_SIZE_Y:
194         if (LoadSize == 4)
195           GridSizes[1] = Load;
196         break;
197       case GRID_SIZE_Z:
198         if (LoadSize == 4)
199           GridSizes[2] = Load;
200         break;
201       default:
202         break;
203       }
204     }
205   }
206 
207   bool MadeChange = false;
208   if (IsV5OrAbove && HasUniformWorkGroupSize) {
209     // Under v5  __ockl_get_local_size returns the value computed by the expression:
210     //
211     //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
212     //
213     // For functions with the attribute uniform-work-group-size=true. we can evaluate
214     // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
215     // for __ockl_get_local_size.
216     for (int I = 0; I < 3; ++I) {
217       Value *BlockCount = BlockCounts[I];
218       if (!BlockCount)
219         continue;
220 
221       using namespace llvm::PatternMatch;
222       auto GroupIDIntrin =
223           I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
224                  : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
225                            : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
226 
227       for (User *ICmp : BlockCount->users()) {
228         ICmpInst::Predicate Pred;
229         if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
230           if (Pred != ICmpInst::ICMP_ULT)
231             continue;
232           ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
233           MadeChange = true;
234         }
235       }
236     }
237 
238     // All remainders should be 0 with uniform work group size.
239     for (Value *Remainder : Remainders) {
240       if (!Remainder)
241         continue;
242       Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
243       MadeChange = true;
244     }
245   } else if (HasUniformWorkGroupSize) { // Pre-V5.
246     // Pattern match the code used to handle partial workgroup dispatches in the
247     // library implementation of get_local_size, so the entire function can be
248     // constant folded with a known group size.
249     //
250     // uint r = grid_size - group_id * group_size;
251     // get_local_size = (r < group_size) ? r : group_size;
252     //
253     // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
254     // the grid_size is required to be a multiple of group_size). In this case:
255     //
256     // grid_size - (group_id * group_size) < group_size
257     // ->
258     // grid_size < group_size + (group_id * group_size)
259     //
260     // (grid_size / group_size) < 1 + group_id
261     //
262     // grid_size / group_size is at least 1, so we can conclude the select
263     // condition is false (except for group_id == 0, where the select result is
264     // the same).
265     for (int I = 0; I < 3; ++I) {
266       Value *GroupSize = GroupSizes[I];
267       Value *GridSize = GridSizes[I];
268       if (!GroupSize || !GridSize)
269         continue;
270 
271       using namespace llvm::PatternMatch;
272       auto GroupIDIntrin =
273           I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
274                  : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
275                            : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
276 
277       for (User *U : GroupSize->users()) {
278         auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
279         if (!ZextGroupSize)
280           continue;
281 
282         for (User *UMin : ZextGroupSize->users()) {
283           if (match(UMin,
284                     m_UMin(m_Sub(m_Specific(GridSize),
285                                  m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
286                            m_Specific(ZextGroupSize)))) {
287             if (HasReqdWorkGroupSize) {
288               ConstantInt *KnownSize
289                 = mdconst::extract<ConstantInt>(MD->getOperand(I));
290               UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
291                   KnownSize, UMin->getType(), false, DL));
292             } else {
293               UMin->replaceAllUsesWith(ZextGroupSize);
294             }
295 
296             MadeChange = true;
297           }
298         }
299       }
300     }
301   }
302 
303   // If reqd_work_group_size is set, we can replace work group size with it.
304   if (!HasReqdWorkGroupSize)
305     return MadeChange;
306 
307   for (int I = 0; I < 3; I++) {
308     Value *GroupSize = GroupSizes[I];
309     if (!GroupSize)
310       continue;
311 
312     ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
313     GroupSize->replaceAllUsesWith(
314         ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
315     MadeChange = true;
316   }
317 
318   return MadeChange;
319 }
320 
321 
322 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
323 // TargetPassConfig for subtarget.
324 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
325   bool MadeChange = false;
326   bool IsV5OrAbove =
327       AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
328   Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
329 
330   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
331     return false;
332 
333   SmallPtrSet<Instruction *, 4> HandledUses;
334   for (auto *U : BasePtr->users()) {
335     CallInst *CI = cast<CallInst>(U);
336     if (HandledUses.insert(CI).second) {
337       if (processUse(CI, IsV5OrAbove))
338         MadeChange = true;
339     }
340   }
341 
342   return MadeChange;
343 }
344 
345 
346 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
347                       "AMDGPU Kernel Attributes", false, false)
348 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
349                     "AMDGPU Kernel Attributes", false, false)
350 
351 char AMDGPULowerKernelAttributes::ID = 0;
352 
353 ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
354   return new AMDGPULowerKernelAttributes();
355 }
356 
357 PreservedAnalyses
358 AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
359   bool IsV5OrAbove =
360       AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
361   Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
362 
363   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
364     return PreservedAnalyses::all();
365 
366   for (Instruction &I : instructions(F)) {
367     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
368       if (CI->getCalledFunction() == BasePtr)
369         processUse(CI, IsV5OrAbove);
370     }
371   }
372 
373   return PreservedAnalyses::all();
374 }
375