//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
    bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

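// Returns true if \p C is a global value in the LDS (local) or GDS (region)
// address space.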
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

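// Returns true if \p CE is an addrspacecast whose source address space
// requires the queue pointer, i.e. a cast from local or private to flat.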
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

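// Walks the tree of constants rooted at \p EntryC, sharing the visited set
// across calls, and returns true if any of them requires the queue pointer:
// either an addrspacecast from local/private on a target without aperture
// registers, or a reference to an LDS/GDS global from a non-entry function.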
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

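// If \p Callee is already marked with the attribute \p Name, mark \p Parent
// with it as well. Returns true if the attribute was copied.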
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

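// Copies the implicit-argument attributes that \p Callee requires up to its
// caller \p Parent, and records in \p NeedQueuePtr whether the callee also
// needs the queue pointer.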
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // The x id attributes also get propagated to kernels here, even though
  // kernels never need them (the x ids are always initialized for kernels).
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

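// Propagates the "uniform-work-group-size" attribute along every caller/callee
// edge recorded in NodeList. Returns true if any function was modified.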
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

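// Propagation rules for "uniform-work-group-size": an externally defined
// callee is marked "false" (and the caller too, if unmarked); a caller marked
// "true" passes "true" down to an unmarked callee; a caller marked "false"
// forces the callee to "false"; an unmarked caller makes both "false".
// Returns true if any attribute was added.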
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

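// Scans \p F for intrinsic calls, calls to other functions, addrspace casts,
// constant operands referencing LDS/GDS, and stack objects, and adds the
// corresponding "amdgpu-*" attributes to \p F. Returns true if \p F was
// modified.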
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CB->isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }
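      // Once the queue pointer is known to be needed there is nothing left to
      // find. Entry functions on targets with aperture registers can also skip
      // the scan below; non-entry functions must still be checked for
      // references to DS globals.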
      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  return Changed;
}

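// Per-SCC driver: nodes that are still referenced are queued in NodeList;
// when an unreferenced node is reached, the accumulated list is processed for
// the uniform-work-group-size propagation. Feature attributes are then added
// to every defined function in the SCC.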
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from the most referenced to the
    // least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

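// Caches the TargetMachine from the TargetPassConfig; the per-function
// GCNSubtarget is looked up later in addFeatureAttributes.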
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}