//===-- AMDGPULowerKernelAttributes.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
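///
/// For example, given !reqd_work_group_size !{i32 64, i32 1, i32 1}, a load of
/// the dispatch packet's workgroup_size_x field can be folded to the constant
/// 64 instead of being read from memory at runtime.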
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#include <limits>

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
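// Offsets are in bytes. The workgroup sizes are u16 fields and the grid sizes
// u32 fields, which is why the loads below are matched by store size.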
enum DispatchPacketOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets relative to the implicit kernel argument pointer.
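// These are byte offsets into the COV5 hidden kernel arguments: the block
// counts are u32 values, while the group sizes and remainders are u16 values.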
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
}

} // end anonymous namespace

static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
    return;

  if (!Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  MDBuilder MDB(Load->getContext());
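  // The range is half-open: a dispatched grid always has at least one
  // workgroup in each dimension, so the counts lie in [1, MaxNumGroups + 1).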
  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
  Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto *MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
    F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  SmallVector<unsigned> MaxNumWorkgroups =
      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
                                     /*Size=*/3, /*DefaultVal=*/0);
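  // A default of 0 means no bound was provided for that dimension; such
  // entries are ignored both here and when annotating range metadata.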

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3]  = {nullptr, nullptr, nullptr};
  Value *Remainders[3]  = {nullptr, nullptr, nullptr};
  Value *GridSizes[3]   = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
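  // For example (illustrative IR for a group-size load at offset 12):
  //   %gep  = getelementptr i8, ptr addrspace(4) %implicitarg.ptr, i64 12
  //   %load = load i16, ptr addrspace(4) %gep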
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4) {
          BlockCounts[0] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4) {
          BlockCounts[1] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
        }
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4) {
          BlockCounts[2] = Load;
          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
        }
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under V5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, so hidden_group_size
    // is the value returned by __ockl_get_local_size.
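    // Illustrative IR of the comparison that gets folded to true:
    //   %id  = call i32 @llvm.amdgcn.workgroup.id.x()
    //   %cmp = icmp ult i32 %id, %block.count.x   ; replaced with i1 true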
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
                                       m_Specific(BlockCount)))) {
          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(
          Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in the
    // library implementation of get_local_size, so the entire function can be
    // constant folded with a known group size.
    //
    // uint r = grid_size - group_id * group_size;
    // get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    // grid_size - (group_id * group_size) < group_size
    // ->
    // grid_size < group_size + (group_id * group_size)
    // ->
    // (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
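    // Illustrative IR of the pattern being matched (m_UMin also covers the
    // equivalent icmp/select form):
    //   %gs   = zext i16 %group.size to i32
    //   %mul  = mul i32 %group.id, %gs
    //   %sub  = sub i32 %grid.size, %mul
    //   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %gs)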
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the group size loads with
  // the known constant values.
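  // e.g. !reqd_work_group_size !{i32 64, i32 2, i32 1} folds the X, Y and Z
  // group size loads to 64, 2 and 1 respectively.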
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; ++I) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get
// TargetPassConfig for the subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}
397