//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-size addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}
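
// Illustrative examples (hypothetical IR, not taken from this repository): a
// statically sized module-scope variable such as
//   @lds.static = internal addrspace(3) global [64 x i32] poison, align 16
// is selected by isLDSVariableToLower above, while
//   @lds.dyn = external addrspace(3) global [0 x i32], align 4
// has zero allocation size and is additionally classified as dynamic LDS by
// isDynamicLDS.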

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may have uses from multiple different functions as a result.
  // This pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to the following, which is not
  // implemented that way because it would be slow:
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv; it instead calls into
  // isModuleEntryFunction, which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}
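
// Summary of the analysis below: build the per-function direct-use maps, then
// propagate uses along the call graph so that each kernel ends up with the set
// of LDS variables it may reach through calls. Functions whose address is
// taken, and call sites whose callee is unknown, are handled conservatively by
// treating every escaped function's variables as potentially reachable.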
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  DirectMapFunction[&F]);
      }
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If the function makes any unknown call, assume the worst case: it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by consulting the transitive map for
      // functions that have already been computed, at the cost of more
      // bookkeeping.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // DirectMapKernel lists which variables are used directly by each kernel;
  // now find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      } else {
        set_union(IndirectMapKernel[&Func],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }

  // Verify that we fall into one of two cases:
  //    - All variables are either absolute or direct-mapped dynamic LDS that
  //      is not lowered. This is a re-run of the pass, so there is nothing to
  //      do.
  //    - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            report_fatal_error(
                "Module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap()};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
}
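
// Removes the given function attributes from KernelRoot and from every
// function the call graph says KernelRoot may reach. A minimal usage sketch
// (hypothetical call site, not part of this file):
//   CallGraph CG(M);
//   removeFnAttrFromReachable(CG, Kernel, {"some-hypothetical-attr"});
// If any call with an unknown callee is encountered, the attributes are also
// conservatively removed from every non-kernel function reachable from the
// call graph's external calling node.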
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_barrier_leave:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_wakeup_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load. Any atomic is a
  // universal MemoryDef from MSSA's point of view too, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}
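
// A minimal usage sketch (hypothetical caller, not part of this file): a pass
// that has new-pass-manager analysis results for the enclosing function F
// could query whether an LDS load may observe an earlier write as follows:
//   auto &MSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
//   auto &AA = FAM.getResult<AAManager>(F);
//   bool MayBeClobbered = isClobberedInFunction(Load, &MSSA, &AA);
// where FAM is a FunctionAnalysisManager and Load is the LoadInst of interest.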
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all the Defs to the WorkList and continue going up, checking all
  // the definitions of this memory location until the root. When all the defs
  // are exhausted and we have reached the entry state, there is no clobber.
  // Along the scan, ignore barriers and fences, which are considered clobbers
  // by MemorySSA but do not really write anything into the memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU