xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp (revision 2f513db72b034fd5ef7f080b11be5c711c15186a)
//===- AMDGPUAnnotateKernelFeatures.cpp -----------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass adds target attributes to functions which use intrinsics
10 /// which will impact calling convention lowering.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/CallGraph.h"
22 #include "llvm/Analysis/CallGraphSCCPass.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/CallSite.h"
25 #include "llvm/IR/Constant.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Intrinsics.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/IR/Use.h"
34 #include "llvm/Pass.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/ErrorHandling.h"
37 #include "llvm/Target/TargetMachine.h"
38 
39 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
40 
41 using namespace llvm;
42 
43 namespace {
44 
45 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
46 private:
47   const TargetMachine *TM = nullptr;
48   SmallVector<CallGraphNode*, 8> NodeList;
49 
50   bool addFeatureAttributes(Function &F);
51   bool processUniformWorkGroupAttribute();
52   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
53 
54 public:
55   static char ID;
56 
57   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
58 
59   bool doInitialization(CallGraph &CG) override;
60   bool runOnSCC(CallGraphSCC &SCC) override;
61 
62   StringRef getPassName() const override {
63     return "AMDGPU Annotate Kernel Features";
64   }
65 
66   void getAnalysisUsage(AnalysisUsage &AU) const override {
67     AU.setPreservesAll();
68     CallGraphSCCPass::getAnalysisUsage(AU);
69   }
70 
71   static bool visitConstantExpr(const ConstantExpr *CE);
72   static bool visitConstantExprsRecursively(
73     const Constant *EntryC,
74     SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
75 };
76 
77 } // end anonymous namespace
78 
// Definition of the pass identifier that the constructor hands to
// CallGraphSCCPass.
char AMDGPUAnnotateKernelFeatures::ID = 0;

// Exposes the identifier by reference under the llvm namespace.
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

// Registers the pass under the DEBUG_TYPE name with a human readable
// description.
// NOTE(review): the two trailing `false` arguments are presumably the
// "CFG only" and "is analysis" flags -- confirm against INITIALIZE_PASS.
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
85 
86 
87 // The queue ptr is only needed when casting to flat, not from it.
88 static bool castRequiresQueuePtr(unsigned SrcAS) {
89   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
90 }
91 
92 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
93   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
94 }
95 
96 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
97   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
98     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
99     return castRequiresQueuePtr(SrcAS);
100   }
101 
102   return false;
103 }
104 
105 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
106   const Constant *EntryC,
107   SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
108 
109   if (!ConstantExprVisited.insert(EntryC).second)
110     return false;
111 
112   SmallVector<const Constant *, 16> Stack;
113   Stack.push_back(EntryC);
114 
115   while (!Stack.empty()) {
116     const Constant *C = Stack.pop_back_val();
117 
118     // Check this constant expression.
119     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
120       if (visitConstantExpr(CE))
121         return true;
122     }
123 
124     // Visit all sub-expressions.
125     for (const Use &U : C->operands()) {
126       const auto *OpC = dyn_cast<Constant>(U);
127       if (!OpC)
128         continue;
129 
130       if (!ConstantExprVisited.insert(OpC).second)
131         continue;
132 
133       Stack.push_back(OpC);
134     }
135   }
136 
137   return false;
138 }
139 
140 // We do not need to note the x workitem or workgroup id because they are always
141 // initialized.
142 //
143 // TODO: We should not add the attributes if the known compile time workgroup
144 // size is 1 for y/z.
145 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
146                                      bool &NonKernelOnly,
147                                      bool &IsQueuePtr) {
148   switch (ID) {
149   case Intrinsic::amdgcn_workitem_id_x:
150     NonKernelOnly = true;
151     return "amdgpu-work-item-id-x";
152   case Intrinsic::amdgcn_workgroup_id_x:
153     NonKernelOnly = true;
154     return "amdgpu-work-group-id-x";
155   case Intrinsic::amdgcn_workitem_id_y:
156   case Intrinsic::r600_read_tidig_y:
157     return "amdgpu-work-item-id-y";
158   case Intrinsic::amdgcn_workitem_id_z:
159   case Intrinsic::r600_read_tidig_z:
160     return "amdgpu-work-item-id-z";
161   case Intrinsic::amdgcn_workgroup_id_y:
162   case Intrinsic::r600_read_tgid_y:
163     return "amdgpu-work-group-id-y";
164   case Intrinsic::amdgcn_workgroup_id_z:
165   case Intrinsic::r600_read_tgid_z:
166     return "amdgpu-work-group-id-z";
167   case Intrinsic::amdgcn_dispatch_ptr:
168     return "amdgpu-dispatch-ptr";
169   case Intrinsic::amdgcn_dispatch_id:
170     return "amdgpu-dispatch-id";
171   case Intrinsic::amdgcn_kernarg_segment_ptr:
172     return "amdgpu-kernarg-segment-ptr";
173   case Intrinsic::amdgcn_implicitarg_ptr:
174     return "amdgpu-implicitarg-ptr";
175   case Intrinsic::amdgcn_queue_ptr:
176   case Intrinsic::trap:
177   case Intrinsic::debugtrap:
178     IsQueuePtr = true;
179     return "amdgpu-queue-ptr";
180   default:
181     return "";
182   }
183 }
184 
185 static bool handleAttr(Function &Parent, const Function &Callee,
186                        StringRef Name) {
187   if (Callee.hasFnAttribute(Name)) {
188     Parent.addFnAttr(Name);
189     return true;
190   }
191   return false;
192 }
193 
194 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
195                                    bool &NeedQueuePtr) {
196   // X ids unnecessarily propagated to kernels.
197   static const StringRef AttrNames[] = {
198     { "amdgpu-work-item-id-x" },
199     { "amdgpu-work-item-id-y" },
200     { "amdgpu-work-item-id-z" },
201     { "amdgpu-work-group-id-x" },
202     { "amdgpu-work-group-id-y" },
203     { "amdgpu-work-group-id-z" },
204     { "amdgpu-dispatch-ptr" },
205     { "amdgpu-dispatch-id" },
206     { "amdgpu-kernarg-segment-ptr" },
207     { "amdgpu-implicitarg-ptr" }
208   };
209 
210   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
211     NeedQueuePtr = true;
212 
213   for (StringRef AttrName : AttrNames)
214     handleAttr(Parent, Callee, AttrName);
215 }
216 
217 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
218   bool Changed = false;
219 
220   for (auto *Node : reverse(NodeList)) {
221     Function *Caller = Node->getFunction();
222 
223     for (auto I : *Node) {
224       Function *Callee = std::get<1>(I)->getFunction();
225       if (Callee)
226         Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
227     }
228   }
229 
230   return Changed;
231 }
232 
233 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
234        Function &Caller, Function &Callee) {
235 
236   // Check for externally defined function
237   if (!Callee.hasExactDefinition()) {
238     Callee.addFnAttr("uniform-work-group-size", "false");
239     if (!Caller.hasFnAttribute("uniform-work-group-size"))
240       Caller.addFnAttr("uniform-work-group-size", "false");
241 
242     return true;
243   }
244   // Check if the Caller has the attribute
245   if (Caller.hasFnAttribute("uniform-work-group-size")) {
246     // Check if the value of the attribute is true
247     if (Caller.getFnAttribute("uniform-work-group-size")
248         .getValueAsString().equals("true")) {
249       // Propagate the attribute to the Callee, if it does not have it
250       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
251         Callee.addFnAttr("uniform-work-group-size", "true");
252         return true;
253       }
254     } else {
255       Callee.addFnAttr("uniform-work-group-size", "false");
256       return true;
257     }
258   } else {
259     // If the attribute is absent, set it as false
260     Caller.addFnAttr("uniform-work-group-size", "false");
261     Callee.addFnAttr("uniform-work-group-size", "false");
262     return true;
263   }
264   return false;
265 }
266 
267 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
268   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
269   bool HasFlat = ST.hasFlatAddressSpace();
270   bool HasApertureRegs = ST.hasApertureRegs();
271   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
272 
273   bool Changed = false;
274   bool NeedQueuePtr = false;
275   bool HaveCall = false;
276   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
277 
278   for (BasicBlock &BB : F) {
279     for (Instruction &I : BB) {
280       CallSite CS(&I);
281       if (CS) {
282         Function *Callee = CS.getCalledFunction();
283 
284         // TODO: Do something with indirect calls.
285         if (!Callee) {
286           if (!CS.isInlineAsm())
287             HaveCall = true;
288           continue;
289         }
290 
291         Intrinsic::ID IID = Callee->getIntrinsicID();
292         if (IID == Intrinsic::not_intrinsic) {
293           HaveCall = true;
294           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
295           Changed = true;
296         } else {
297           bool NonKernelOnly = false;
298           StringRef AttrName = intrinsicToAttrName(IID,
299                                                    NonKernelOnly, NeedQueuePtr);
300           if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
301             F.addFnAttr(AttrName);
302             Changed = true;
303           }
304         }
305       }
306 
307       if (NeedQueuePtr || HasApertureRegs)
308         continue;
309 
310       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
311         if (castRequiresQueuePtr(ASC)) {
312           NeedQueuePtr = true;
313           continue;
314         }
315       }
316 
317       for (const Use &U : I.operands()) {
318         const auto *OpC = dyn_cast<Constant>(U);
319         if (!OpC)
320           continue;
321 
322         if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
323           NeedQueuePtr = true;
324           break;
325         }
326       }
327     }
328   }
329 
330   if (NeedQueuePtr) {
331     F.addFnAttr("amdgpu-queue-ptr");
332     Changed = true;
333   }
334 
335   // TODO: We could refine this to captured pointers that could possibly be
336   // accessed by flat instructions. For now this is mostly a poor way of
337   // estimating whether there are calls before argument lowering.
338   if (HasFlat && !IsFunc && HaveCall) {
339     F.addFnAttr("amdgpu-flat-scratch");
340     Changed = true;
341   }
342 
343   return Changed;
344 }
345 
346 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
347   bool Changed = false;
348 
349   for (CallGraphNode *I : SCC) {
350     // Build a list of CallGraphNodes from most number of uses to least
351     if (I->getNumReferences())
352       NodeList.push_back(I);
353     else {
354       processUniformWorkGroupAttribute();
355       NodeList.clear();
356     }
357 
358     Function *F = I->getFunction();
359     // Add feature attributes
360     if (!F || F->isDeclaration())
361       continue;
362     Changed |= addFeatureAttributes(*F);
363   }
364 
365   return Changed;
366 }
367 
368 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
369   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
370   if (!TPC)
371     report_fatal_error("TargetMachine is required");
372 
373   TM = &TPC->getTM<TargetMachine>();
374   return false;
375 }
376 
377 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
378   return new AMDGPUAnnotateKernelFeatures();
379 }
380