//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may have uses from multiple different functions as a result.
  // This pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to (but not implemented that way, as it
  // would be slow):
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &Kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          Kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

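// Compute which LDS variables each kernel and each non-kernel function uses,
// both directly and transitively through the call graph. Functions whose
// address escapes and unknown (indirect) calls are handled conservatively:
// an unknown call is assumed to reach every LDS variable used by any
// address-taken function.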
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  DirectMapFunction[&F]);
      }
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If the function makes any unknown call, assume the worst case that it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> Seen; // catches cycles
    SmallVector<Function *, 4> WorkList = {&Func};

    while (!WorkList.empty()) {
      Function *F = WorkList.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, but that needs more care
      // than this direct walk.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!Seen.contains(Ith)) {
            Seen.insert(Ith);
            WorkList.push_back(Ith);
          }
        }
      }
    }
  }

  // DirectMapKernel lists which variables are used directly by the kernel;
  // now find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      } else {
        set_union(IndirectMapKernel[&Func],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Verify that we fall into one of two cases:
  //   - All variables are either absolute or direct-mapped dynamic LDS that
  //     is not lowered. This is a re-run of the pass, so there is nothing to
  //     do.
  //   - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, there is nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap()};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
}

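// Strip the given function attributes from \p KernelRoot and from every
// function reachable from it through the call graph. If an indirect call is
// encountered, conservatively strip the attributes from every address-taken
// non-kernel function in the module.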
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

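// Return true if \p Def is a MemoryDef that may actually clobber \p Ptr.
// Fences and AMDGPU barrier/scheduling intrinsics do not write memory, and
// atomics that do not alias \p Ptr are ignored, even though MemorySSA treats
// all of these as clobbers.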
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics not aliasing with the original load. Any atomic is a
  // universal MemoryDef from MSSA's point of view too, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

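// Walk the MemorySSA def chain to decide whether \p Load may be clobbered by
// any real memory write between its defining access and the function entry.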
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access. It will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all Defs to the WorkList and continue going up, checking all the
  // definitions of this memory location until the root. When all the defs are
  // exhausted and we have reached the entry state, there is no clobber. Along
  // the scan, ignore barriers and fences, which are considered clobbers by
  // MemorySSA but do not actually write anything into memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU