xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp (revision 3a56015a2f5d630910177fa79a522bb95511ccf7)
1 //===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AMDGPUMemoryUtils.h"
10 #include "AMDGPU.h"
11 #include "AMDGPUBaseInfo.h"
12 #include "llvm/ADT/SetOperations.h"
13 #include "llvm/ADT/SmallSet.h"
14 #include "llvm/Analysis/AliasAnalysis.h"
15 #include "llvm/Analysis/CallGraph.h"
16 #include "llvm/Analysis/MemorySSA.h"
17 #include "llvm/IR/DataLayout.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/IR/Operator.h"
22 #include "llvm/IR/ReplaceConstant.h"
23 
24 #define DEBUG_TYPE "amdgpu-memory-utils"
25 
26 using namespace llvm;
27 
28 namespace llvm::AMDGPU {
29 
30 Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
31   return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
32                                        GV->getValueType());
33 }
34 
35 bool isDynamicLDS(const GlobalVariable &GV) {
36   // external zero size addrspace(3) without initializer is dynlds.
37   const Module *M = GV.getParent();
38   const DataLayout &DL = M->getDataLayout();
39   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
40     return false;
41   return DL.getTypeAllocSize(GV.getValueType()) == 0;
42 }
43 
44 bool isLDSVariableToLower(const GlobalVariable &GV) {
45   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
46     return false;
47   }
48   if (isDynamicLDS(GV)) {
49     return true;
50   }
51   if (GV.isConstant()) {
52     // A constant undef variable can't be written to, and any load is
53     // undef, so it should be eliminated by the optimizer. It could be
54     // dropped by the back end if not. This pass skips over it.
55     return false;
56   }
57   if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
58     // Initializers are unimplemented for LDS address space.
59     // Leave such variables in place for consistent error reporting.
60     return false;
61   }
62   return true;
63 }
64 
65 bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
66   // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
67   // global may have uses from multiple different functions as a result.
68   // This pass specialises LDS variables with respect to the kernel that
69   // allocates them.
70 
71   // This is semantically equivalent to (the unimplemented as slow):
72   // for (auto &F : M.functions())
73   //   for (auto &BB : F)
74   //     for (auto &I : BB)
75   //       for (Use &Op : I.operands())
76   //         if (constantExprUsesLDS(Op))
77   //           replaceConstantExprInFunction(I, Op);
78 
79   SmallVector<Constant *> LDSGlobals;
80   for (auto &GV : M.globals())
81     if (AMDGPU::isLDSVariableToLower(GV))
82       LDSGlobals.push_back(&GV);
83   return convertUsersOfConstantsToInstructions(LDSGlobals);
84 }
85 
86 void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
87                             FunctionVariableMap &kernels,
88                             FunctionVariableMap &Functions) {
89   // Get uses from the current function, excluding uses by called Functions
90   // Two output variables to avoid walking the globals list twice
91   for (auto &GV : M.globals()) {
92     if (!AMDGPU::isLDSVariableToLower(GV))
93       continue;
94     for (User *V : GV.users()) {
95       if (auto *I = dyn_cast<Instruction>(V)) {
96         Function *F = I->getFunction();
97         if (isKernelLDS(F))
98           kernels[F].insert(&GV);
99         else
100           Functions[F].insert(&GV);
101       }
102     }
103   }
104 }
105 
106 bool isKernelLDS(const Function *F) {
107   // Some weirdness here. AMDGPU::isKernelCC does not call into
108   // AMDGPU::isKernel with the calling conv, it instead calls into
109   // isModuleEntryFunction which returns true for more calling conventions
110   // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
111   // There's also a test that checks that the LDS lowering does not hit on
112   // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
113   // Putting LDS in the name of the function to draw attention to this.
114   return AMDGPU::isKernel(F->getCallingConv());
115 }
116 
117 LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
118 
119   FunctionVariableMap DirectMapKernel;
120   FunctionVariableMap DirectMapFunction;
121   getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
122 
123   // Collect variables that are used by functions whose address has escaped
124   DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
125   for (Function &F : M.functions()) {
126     if (!isKernelLDS(&F))
127       if (F.hasAddressTaken(nullptr,
128                             /* IgnoreCallbackUses */ false,
129                             /* IgnoreAssumeLikeCalls */ false,
130                             /* IgnoreLLVMUsed */ true,
131                             /* IgnoreArcAttachedCall */ false)) {
132         set_union(VariablesReachableThroughFunctionPointer,
133                   DirectMapFunction[&F]);
134       }
135   }
136 
137   auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
138     assert(!F->isDeclaration());
139     for (const CallGraphNode::CallRecord &R : *CG[F]) {
140       if (!R.second->getFunction())
141         return true;
142     }
143     return false;
144   };
145 
146   // Work out which variables are reachable through function calls
147   FunctionVariableMap TransitiveMapFunction = DirectMapFunction;
148 
149   // If the function makes any unknown call, assume the worst case that it can
150   // access all variables accessed by functions whose address escaped
151   for (Function &F : M.functions()) {
152     if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
153       if (!isKernelLDS(&F)) {
154         set_union(TransitiveMapFunction[&F],
155                   VariablesReachableThroughFunctionPointer);
156       }
157     }
158   }
159 
160   // Direct implementation of collecting all variables reachable from each
161   // function
162   for (Function &Func : M.functions()) {
163     if (Func.isDeclaration() || isKernelLDS(&Func))
164       continue;
165 
166     DenseSet<Function *> seen; // catches cycles
167     SmallVector<Function *, 4> wip = {&Func};
168 
169     while (!wip.empty()) {
170       Function *F = wip.pop_back_val();
171 
172       // Can accelerate this by referring to transitive map for functions that
173       // have already been computed, with more care than this
174       set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);
175 
176       for (const CallGraphNode::CallRecord &R : *CG[F]) {
177         Function *Ith = R.second->getFunction();
178         if (Ith) {
179           if (!seen.contains(Ith)) {
180             seen.insert(Ith);
181             wip.push_back(Ith);
182           }
183         }
184       }
185     }
186   }
187 
188   // DirectMapKernel lists which variables are used by the kernel
189   // find the variables which are used through a function call
190   FunctionVariableMap IndirectMapKernel;
191 
192   for (Function &Func : M.functions()) {
193     if (Func.isDeclaration() || !isKernelLDS(&Func))
194       continue;
195 
196     for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
197       Function *Ith = R.second->getFunction();
198       if (Ith) {
199         set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
200       } else {
201         set_union(IndirectMapKernel[&Func],
202                   VariablesReachableThroughFunctionPointer);
203       }
204     }
205   }
206 
207   // Verify that we fall into one of 2 cases:
208   //    - All variables are either absolute
209   //      or direct mapped dynamic LDS that is not lowered.
210   //      this is a re-run of the pass
211   //      so we don't have anything to do.
212   //    - No variables are absolute.
213   std::optional<bool> HasAbsoluteGVs;
214   for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
215     for (auto &[Fn, GVs] : Map) {
216       for (auto *GV : GVs) {
217         bool IsAbsolute = GV->isAbsoluteSymbolRef();
218         bool IsDirectMapDynLDSGV = AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
219         if (IsDirectMapDynLDSGV)
220           continue;
221         if (HasAbsoluteGVs.has_value()) {
222           if (*HasAbsoluteGVs != IsAbsolute) {
223             report_fatal_error(
224                 "Module cannot mix absolute and non-absolute LDS GVs");
225           }
226         } else
227           HasAbsoluteGVs = IsAbsolute;
228       }
229     }
230   }
231 
232   // If we only had absolute GVs, we have nothing to do, return an empty
233   // result.
234   if (HasAbsoluteGVs && *HasAbsoluteGVs)
235     return {FunctionVariableMap(), FunctionVariableMap()};
236 
237   return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
238 }
239 
240 void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
241                                ArrayRef<StringRef> FnAttrs) {
242   for (StringRef Attr : FnAttrs)
243     KernelRoot->removeFnAttr(Attr);
244 
245   SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
246   SmallPtrSet<Function *, 8> Visited;
247   bool SeenUnknownCall = false;
248 
249   while (!WorkList.empty()) {
250     Function *F = WorkList.pop_back_val();
251 
252     for (auto &CallRecord : *CG[F]) {
253       if (!CallRecord.second)
254         continue;
255 
256       Function *Callee = CallRecord.second->getFunction();
257       if (!Callee) {
258         if (!SeenUnknownCall) {
259           SeenUnknownCall = true;
260 
261           // If we see any indirect calls, assume nothing about potential
262           // targets.
263           // TODO: This could be refined to possible LDS global users.
264           for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
265             Function *PotentialCallee =
266                 ExternalCallRecord.second->getFunction();
267             assert(PotentialCallee);
268             if (!isKernelLDS(PotentialCallee)) {
269               for (StringRef Attr : FnAttrs)
270                 PotentialCallee->removeFnAttr(Attr);
271             }
272           }
273         }
274       } else {
275         for (StringRef Attr : FnAttrs)
276           Callee->removeFnAttr(Attr);
277         if (Visited.insert(Callee).second)
278           WorkList.push_back(Callee);
279       }
280     }
281   }
282 }
283 
284 bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
285   Instruction *DefInst = Def->getMemoryInst();
286 
287   if (isa<FenceInst>(DefInst))
288     return false;
289 
290   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
291     switch (II->getIntrinsicID()) {
292     case Intrinsic::amdgcn_s_barrier:
293     case Intrinsic::amdgcn_s_barrier_signal:
294     case Intrinsic::amdgcn_s_barrier_signal_var:
295     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
296     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
297     case Intrinsic::amdgcn_s_barrier_init:
298     case Intrinsic::amdgcn_s_barrier_join:
299     case Intrinsic::amdgcn_s_barrier_wait:
300     case Intrinsic::amdgcn_s_barrier_leave:
301     case Intrinsic::amdgcn_s_get_barrier_state:
302     case Intrinsic::amdgcn_s_wakeup_barrier:
303     case Intrinsic::amdgcn_wave_barrier:
304     case Intrinsic::amdgcn_sched_barrier:
305     case Intrinsic::amdgcn_sched_group_barrier:
306       return false;
307     default:
308       break;
309     }
310   }
311 
312   // Ignore atomics not aliasing with the original load, any atomic is a
313   // universal MemoryDef from MSSA's point of view too, just like a fence.
314   const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
315     return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
316   };
317 
318   if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
319       checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
320     return false;
321 
322   return true;
323 }
324 
325 bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
326                            AAResults *AA) {
327   MemorySSAWalker *Walker = MSSA->getWalker();
328   SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
329   SmallSet<MemoryAccess *, 8> Visited;
330   MemoryLocation Loc(MemoryLocation::get(Load));
331 
332   LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
333 
334   // Start with a nearest dominating clobbering access, it will be either
335   // live on entry (nothing to do, load is not clobbered), MemoryDef, or
336   // MemoryPhi if several MemoryDefs can define this memory state. In that
337   // case add all Defs to WorkList and continue going up and checking all
338   // the definitions of this memory location until the root. When all the
339   // defs are exhausted and came to the entry state we have no clobber.
340   // Along the scan ignore barriers and fences which are considered clobbers
341   // by the MemorySSA, but not really writing anything into the memory.
342   while (!WorkList.empty()) {
343     MemoryAccess *MA = WorkList.pop_back_val();
344     if (!Visited.insert(MA).second)
345       continue;
346 
347     if (MSSA->isLiveOnEntryDef(MA))
348       continue;
349 
350     if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
351       LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
352 
353       if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
354         LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
355         return true;
356       }
357 
358       WorkList.push_back(
359           Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
360       continue;
361     }
362 
363     const MemoryPhi *Phi = cast<MemoryPhi>(MA);
364     for (const auto &Use : Phi->incoming_values())
365       WorkList.push_back(cast<MemoryAccess>(&Use));
366   }
367 
368   LLVM_DEBUG(dbgs() << "      -> no clobber\n");
369   return false;
370 }
371 
372 } // end namespace llvm::AMDGPU
373