xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass adds target attributes to functions which use intrinsics
10 /// which will impact calling convention lowering.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/CallGraph.h"
22 #include "llvm/Analysis/CallGraphSCCPass.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/Constant.h"
25 #include "llvm/IR/Constants.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/Instruction.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/IR/Intrinsics.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/IR/Use.h"
33 #include "llvm/Pass.h"
34 #include "llvm/Support/Casting.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Target/TargetMachine.h"
37 
38 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
39 
40 using namespace llvm;
41 
42 namespace {
43 
44 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
45 private:
46   const TargetMachine *TM = nullptr;
47   SmallVector<CallGraphNode*, 8> NodeList;
48 
49   bool addFeatureAttributes(Function &F);
50   bool processUniformWorkGroupAttribute();
51   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
52 
53 public:
54   static char ID;
55 
56   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
57 
58   bool doInitialization(CallGraph &CG) override;
59   bool runOnSCC(CallGraphSCC &SCC) override;
60 
61   StringRef getPassName() const override {
62     return "AMDGPU Annotate Kernel Features";
63   }
64 
65   void getAnalysisUsage(AnalysisUsage &AU) const override {
66     AU.setPreservesAll();
67     CallGraphSCCPass::getAnalysisUsage(AU);
68   }
69 
70   static bool visitConstantExpr(const ConstantExpr *CE);
71   static bool visitConstantExprsRecursively(
72     const Constant *EntryC,
73     SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
74     bool HasApertureRegs);
75 };
76 
77 } // end anonymous namespace
78 
79 char AMDGPUAnnotateKernelFeatures::ID = 0;
80 
81 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
82 
83 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
84                 "Add AMDGPU function attributes", false, false)
85 
86 
87 // The queue ptr is only needed when casting to flat, not from it.
88 static bool castRequiresQueuePtr(unsigned SrcAS) {
89   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
90 }
91 
92 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
93   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
94 }
95 
96 static bool isDSAddress(const Constant *C) {
97   const GlobalValue *GV = dyn_cast<GlobalValue>(C);
98   if (!GV)
99     return false;
100   unsigned AS = GV->getAddressSpace();
101   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
102 }
103 
104 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
105   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
106     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
107     return castRequiresQueuePtr(SrcAS);
108   }
109 
110   return false;
111 }
112 
113 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
114   const Constant *EntryC,
115   SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
116   bool IsFunc, bool HasApertureRegs) {
117 
118   if (!ConstantExprVisited.insert(EntryC).second)
119     return false;
120 
121   SmallVector<const Constant *, 16> Stack;
122   Stack.push_back(EntryC);
123 
124   while (!Stack.empty()) {
125     const Constant *C = Stack.pop_back_val();
126 
127     // We need to trap on DS globals in non-entry functions.
128     if (IsFunc && isDSAddress(C))
129       return true;
130 
131     // Check this constant expression.
132     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
133       if (!HasApertureRegs && visitConstantExpr(CE))
134         return true;
135     }
136 
137     // Visit all sub-expressions.
138     for (const Use &U : C->operands()) {
139       const auto *OpC = dyn_cast<Constant>(U);
140       if (!OpC)
141         continue;
142 
143       if (!ConstantExprVisited.insert(OpC).second)
144         continue;
145 
146       Stack.push_back(OpC);
147     }
148   }
149 
150   return false;
151 }
152 
153 // We do not need to note the x workitem or workgroup id because they are always
154 // initialized.
155 //
156 // TODO: We should not add the attributes if the known compile time workgroup
157 // size is 1 for y/z.
158 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
159                                      bool &NonKernelOnly,
160                                      bool &IsQueuePtr) {
161   switch (ID) {
162   case Intrinsic::amdgcn_workitem_id_x:
163     NonKernelOnly = true;
164     return "amdgpu-work-item-id-x";
165   case Intrinsic::amdgcn_workgroup_id_x:
166     NonKernelOnly = true;
167     return "amdgpu-work-group-id-x";
168   case Intrinsic::amdgcn_workitem_id_y:
169   case Intrinsic::r600_read_tidig_y:
170     return "amdgpu-work-item-id-y";
171   case Intrinsic::amdgcn_workitem_id_z:
172   case Intrinsic::r600_read_tidig_z:
173     return "amdgpu-work-item-id-z";
174   case Intrinsic::amdgcn_workgroup_id_y:
175   case Intrinsic::r600_read_tgid_y:
176     return "amdgpu-work-group-id-y";
177   case Intrinsic::amdgcn_workgroup_id_z:
178   case Intrinsic::r600_read_tgid_z:
179     return "amdgpu-work-group-id-z";
180   case Intrinsic::amdgcn_dispatch_ptr:
181     return "amdgpu-dispatch-ptr";
182   case Intrinsic::amdgcn_dispatch_id:
183     return "amdgpu-dispatch-id";
184   case Intrinsic::amdgcn_kernarg_segment_ptr:
185     return "amdgpu-kernarg-segment-ptr";
186   case Intrinsic::amdgcn_implicitarg_ptr:
187     return "amdgpu-implicitarg-ptr";
188   case Intrinsic::amdgcn_queue_ptr:
189   case Intrinsic::amdgcn_is_shared:
190   case Intrinsic::amdgcn_is_private:
191     // TODO: Does not require queue ptr on gfx9+
192   case Intrinsic::trap:
193   case Intrinsic::debugtrap:
194     IsQueuePtr = true;
195     return "amdgpu-queue-ptr";
196   default:
197     return "";
198   }
199 }
200 
201 static bool handleAttr(Function &Parent, const Function &Callee,
202                        StringRef Name) {
203   if (Callee.hasFnAttribute(Name)) {
204     Parent.addFnAttr(Name);
205     return true;
206   }
207   return false;
208 }
209 
210 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
211                                    bool &NeedQueuePtr) {
212   // X ids unnecessarily propagated to kernels.
213   static constexpr StringLiteral AttrNames[] = {
214       "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
215       "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
216       "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
217       "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
218       "amdgpu-implicitarg-ptr"};
219 
220   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
221     NeedQueuePtr = true;
222 
223   for (StringRef AttrName : AttrNames)
224     handleAttr(Parent, Callee, AttrName);
225 }
226 
227 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
228   bool Changed = false;
229 
230   for (auto *Node : reverse(NodeList)) {
231     Function *Caller = Node->getFunction();
232 
233     for (auto I : *Node) {
234       Function *Callee = std::get<1>(I)->getFunction();
235       if (Callee)
236         Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
237     }
238   }
239 
240   return Changed;
241 }
242 
243 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
244        Function &Caller, Function &Callee) {
245 
246   // Check for externally defined function
247   if (!Callee.hasExactDefinition()) {
248     Callee.addFnAttr("uniform-work-group-size", "false");
249     if (!Caller.hasFnAttribute("uniform-work-group-size"))
250       Caller.addFnAttr("uniform-work-group-size", "false");
251 
252     return true;
253   }
254   // Check if the Caller has the attribute
255   if (Caller.hasFnAttribute("uniform-work-group-size")) {
256     // Check if the value of the attribute is true
257     if (Caller.getFnAttribute("uniform-work-group-size")
258         .getValueAsString().equals("true")) {
259       // Propagate the attribute to the Callee, if it does not have it
260       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
261         Callee.addFnAttr("uniform-work-group-size", "true");
262         return true;
263       }
264     } else {
265       Callee.addFnAttr("uniform-work-group-size", "false");
266       return true;
267     }
268   } else {
269     // If the attribute is absent, set it as false
270     Caller.addFnAttr("uniform-work-group-size", "false");
271     Callee.addFnAttr("uniform-work-group-size", "false");
272     return true;
273   }
274   return false;
275 }
276 
277 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
278   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
279   bool HasApertureRegs = ST.hasApertureRegs();
280   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
281 
282   bool HaveStackObjects = false;
283   bool Changed = false;
284   bool NeedQueuePtr = false;
285   bool HaveCall = false;
286   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
287 
288   for (BasicBlock &BB : F) {
289     for (Instruction &I : BB) {
290       if (isa<AllocaInst>(I)) {
291         HaveStackObjects = true;
292         continue;
293       }
294 
295       if (auto *CB = dyn_cast<CallBase>(&I)) {
296         const Function *Callee =
297             dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
298 
299         // TODO: Do something with indirect calls.
300         if (!Callee) {
301           if (!CB->isInlineAsm())
302             HaveCall = true;
303           continue;
304         }
305 
306         Intrinsic::ID IID = Callee->getIntrinsicID();
307         if (IID == Intrinsic::not_intrinsic) {
308           HaveCall = true;
309           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
310           Changed = true;
311         } else {
312           bool NonKernelOnly = false;
313 
314           if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
315             F.addFnAttr("amdgpu-kernarg-segment-ptr");
316           } else {
317             StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
318                                                      NeedQueuePtr);
319             if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
320               F.addFnAttr(AttrName);
321               Changed = true;
322             }
323           }
324         }
325       }
326 
327       if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
328         continue;
329 
330       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
331         if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
332           NeedQueuePtr = true;
333           continue;
334         }
335       }
336 
337       for (const Use &U : I.operands()) {
338         const auto *OpC = dyn_cast<Constant>(U);
339         if (!OpC)
340           continue;
341 
342         if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
343                                           HasApertureRegs)) {
344           NeedQueuePtr = true;
345           break;
346         }
347       }
348     }
349   }
350 
351   if (NeedQueuePtr) {
352     F.addFnAttr("amdgpu-queue-ptr");
353     Changed = true;
354   }
355 
356   // TODO: We could refine this to captured pointers that could possibly be
357   // accessed by flat instructions. For now this is mostly a poor way of
358   // estimating whether there are calls before argument lowering.
359   if (!IsFunc && HaveCall) {
360     F.addFnAttr("amdgpu-calls");
361     Changed = true;
362   }
363 
364   if (HaveStackObjects) {
365     F.addFnAttr("amdgpu-stack-objects");
366     Changed = true;
367   }
368 
369   return Changed;
370 }
371 
372 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
373   bool Changed = false;
374 
375   for (CallGraphNode *I : SCC) {
376     // Build a list of CallGraphNodes from most number of uses to least
377     if (I->getNumReferences())
378       NodeList.push_back(I);
379     else {
380       processUniformWorkGroupAttribute();
381       NodeList.clear();
382     }
383 
384     Function *F = I->getFunction();
385     // Add feature attributes
386     if (!F || F->isDeclaration())
387       continue;
388     Changed |= addFeatureAttributes(*F);
389   }
390 
391   return Changed;
392 }
393 
394 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
395   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
396   if (!TPC)
397     report_fatal_error("TargetMachine is required");
398 
399   TM = &TPC->getTM<TargetMachine>();
400   return false;
401 }
402 
403 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
404   return new AMDGPUAnnotateKernelFeatures();
405 }
406