//===-- AMDGPUMemoryUtils.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm::AMDGPU {

Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
  // TODO: Allow arrays and structs, if all members are barriers
  // in the same scope.
  // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
  // - Structs containing barriers in different scopes.
  // - Structs containing a mixture of barriers and other data.
  // - Globals in other address spaces.
  // - Allocas.
  Type *Ty = GV.getValueType();
  while (true) {
    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
      return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
    if (auto *STy = dyn_cast<StructType>(Ty)) {
      if (STy->getNumElements() == 0)
        return nullptr;
      Ty = STy->getElementType(0);
      continue;
    }
    return nullptr;
  }
}

bool isDynamicLDS(const GlobalVariable &GV) {
  // An external, zero-sized addrspace(3) variable without an initializer is
  // dynamic LDS.
  const Module *M = GV.getParent();
  const DataLayout &DL = M->getDataLayout();
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return false;
  return DL.getTypeAllocSize(GV.getValueType()) == 0;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (isDynamicLDS(GV)) {
    return true;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is
    // undef, so it should be eliminated by the optimizer. It could be
    // dropped by the back end if not. This pass skips over it.
    return false;
  }
  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for the LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  return true;
}
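
// For illustration only (hypothetical caller, not used by this file): a
// minimal sketch of how a pass holding a Module &M might partition its LDS
// globals using the two predicates above. The vector names are assumptions.
//
//   SmallVector<GlobalVariable *> StaticLDS, DynamicLDS;
//   for (GlobalVariable &GV : M.globals()) {
//     if (!AMDGPU::isLDSVariableToLower(GV))
//       continue;
//     if (AMDGPU::isDynamicLDS(GV))
//       DynamicLDS.push_back(&GV); // size chosen at dispatch time
//     else
//       StaticLDS.push_back(&GV); // size known at compile time
//   }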

bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
  // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
  // global may therefore have uses in multiple different functions. This
  // pass specialises LDS variables with respect to the kernel that
  // allocates them.

  // This is semantically equivalent to (but not implemented as, since it
  // would be slow):
  //
  // for (auto &F : M.functions())
  //   for (auto &BB : F)
  //     for (auto &I : BB)
  //       for (Use &Op : I.operands())
  //         if (constantExprUsesLDS(Op))
  //           replaceConstantExprInFunction(I, Op);

  SmallVector<Constant *> LDSGlobals;
  for (auto &GV : M.globals())
    if (AMDGPU::isLDSVariableToLower(GV))
      LDSGlobals.push_back(&GV);
  return convertUsersOfConstantsToInstructions(LDSGlobals);
}

void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
                            FunctionVariableMap &kernels,
                            FunctionVariableMap &Functions) {
  // Get uses from the current function, excluding uses by called functions.
  // Two output variables to avoid walking the globals list twice.
  for (auto &GV : M.globals()) {
    if (!AMDGPU::isLDSVariableToLower(GV))
      continue;
    for (User *V : GV.users()) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        Function *F = I->getFunction();
        if (isKernelLDS(F))
          kernels[F].insert(&GV);
        else
          Functions[F].insert(&GV);
      }
    }
  }
}

bool isKernelLDS(const Function *F) {
  return AMDGPU::isKernel(F->getCallingConv());
}

LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {

  FunctionVariableMap DirectMapKernel;
  FunctionVariableMap DirectMapFunction;
  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);

  // Collect functions whose address has escaped.
  DenseSet<Function *> AddressTakenFuncs;
  for (Function &F : M.functions()) {
    if (!isKernelLDS(&F))
      if (F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        AddressTakenFuncs.insert(&F);
      }
  }

  // Collect variables that are used by functions whose address has escaped.
  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
  }

  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
    assert(!F->isDeclaration());
    for (const CallGraphNode::CallRecord &R : *CG[F]) {
      if (!R.second->getFunction())
        return true;
    }
    return false;
  };

  // Work out which variables are reachable through function calls.
  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;

  // If the function makes any unknown call, assume the worst case: it can
  // access all variables accessed by functions whose address escaped.
  for (Function &F : M.functions()) {
    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
      if (!isKernelLDS(&F)) {
        set_union(TransitiveMapFunction[&F],
                  VariablesReachableThroughFunctionPointer);
      }
    }
  }
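
  // Worked example (illustrative only, names hypothetical). Given IR such as:
  //
  //   @lds = internal addrspace(3) global i32 poison
  //   define void @g() { %v = load i32, ptr addrspace(3) @lds ... }
  //   define void @f() { call void @g() ... }
  //   define amdgpu_kernel void @k() { call void @f() ... }
  //
  // the traversal below yields TransitiveMapFunction[@f] == {@lds} and
  // TransitiveMapFunction[@g] == {@lds}, and the kernel-facing
  // IndirectMapKernel built afterwards records @lds against @k.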

  // Direct implementation of collecting all variables reachable from each
  // function.
  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || isKernelLDS(&Func))
      continue;

    DenseSet<Function *> seen; // catches cycles
    SmallVector<Function *, 4> wip = {&Func};

    while (!wip.empty()) {
      Function *F = wip.pop_back_val();

      // This could be accelerated by reusing the transitive maps of functions
      // that have already been computed, with more care than this.
      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);

      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        Function *Ith = R.second->getFunction();
        if (Ith) {
          if (!seen.contains(Ith)) {
            seen.insert(Ith);
            wip.push_back(Ith);
          }
        }
      }
    }
  }

  // Collect variables that are transitively used by functions whose address
  // has escaped.
  for (Function *F : AddressTakenFuncs) {
    set_union(VariablesReachableThroughFunctionPointer,
              TransitiveMapFunction[F]);
  }

  // DirectMapKernel lists which variables are used by the kernel directly;
  // find the variables which are used through a function call.
  FunctionVariableMap IndirectMapKernel;

  for (Function &Func : M.functions()) {
    if (Func.isDeclaration() || !isKernelLDS(&Func))
      continue;

    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
      Function *Ith = R.second->getFunction();
      if (Ith) {
        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
      }
    }

    // Check if the kernel encounters unknown calls, whether directly or
    // indirectly.
    bool SeesUnknownCalls = [&]() {
      SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
      SmallPtrSet<Function *, 8> Visited;

      while (!WorkList.empty()) {
        Function *F = WorkList.pop_back_val();

        for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
          if (!CallRecord.second)
            continue;

          Function *Callee = CallRecord.second->getFunction();
          if (!Callee)
            return true;

          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
      return false;
    }();

    if (SeesUnknownCalls) {
      set_union(IndirectMapKernel[&Func],
                VariablesReachableThroughFunctionPointer);
    }
  }

  // Verify that we fall into one of 2 cases:
  // - All variables are either absolute
  //   or direct mapped dynamic LDS that is not lowered.
  //   This is a re-run of the pass, so we don't have anything to do.
  // - No variables are absolute.
  std::optional<bool> HasAbsoluteGVs;
  bool HasSpecialGVs = false;
  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
    for (auto &[Fn, GVs] : Map) {
      for (auto *GV : GVs) {
        bool IsAbsolute = GV->isAbsoluteSymbolRef();
        bool IsDirectMapDynLDSGV =
            AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
        if (IsDirectMapDynLDSGV)
          continue;
        if (isNamedBarrier(*GV)) {
          HasSpecialGVs = true;
          continue;
        }
        if (HasAbsoluteGVs.has_value()) {
          if (*HasAbsoluteGVs != IsAbsolute) {
            reportFatalUsageError(
                "module cannot mix absolute and non-absolute LDS GVs");
          }
        } else
          HasAbsoluteGVs = IsAbsolute;
      }
    }
  }
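
  // Illustrative IR sketch (an assumption for this comment, not taken from
  // this file): an LDS variable reads as "absolute" once it carries
  // !absolute_symbol range metadata, which is what isAbsoluteSymbolRef()
  // detects above on a re-run of the lowering:
  //
  //   @lds = addrspace(3) global i32 poison, !absolute_symbol !0
  //   !0 = !{i32 8, i32 9} ; assigned the fixed address range [8, 9)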

  // If we only had absolute GVs, we have nothing to do; return an empty
  // result.
  if (HasAbsoluteGVs && *HasAbsoluteGVs)
    return {FunctionVariableMap(), FunctionVariableMap(), false};

  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
          HasSpecialGVs};
}

void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
                               ArrayRef<StringRef> FnAttrs) {
  for (StringRef Attr : FnAttrs)
    KernelRoot->removeFnAttr(Attr);

  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
  SmallPtrSet<Function *, 8> Visited;
  bool SeenUnknownCall = false;

  while (!WorkList.empty()) {
    Function *F = WorkList.pop_back_val();

    for (auto &CallRecord : *CG[F]) {
      if (!CallRecord.second)
        continue;

      Function *Callee = CallRecord.second->getFunction();
      if (!Callee) {
        if (!SeenUnknownCall) {
          SeenUnknownCall = true;

          // If we see any indirect calls, assume nothing about potential
          // targets.
          // TODO: This could be refined to possible LDS global users.
          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
            Function *PotentialCallee =
                ExternalCallRecord.second->getFunction();
            assert(PotentialCallee);
            if (!isKernelLDS(PotentialCallee)) {
              for (StringRef Attr : FnAttrs)
                PotentialCallee->removeFnAttr(Attr);
            }
          }
        }
      } else {
        for (StringRef Attr : FnAttrs)
          Callee->removeFnAttr(Attr);
        if (Visited.insert(Callee).second)
          WorkList.push_back(Callee);
      }
    }
  }
}

bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_signal:
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_wait:
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
    case Intrinsic::amdgcn_iglp_opt:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that don't alias the original load; any atomic is a
  // universal MemoryDef from MSSA's point of view too, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}
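
// Minimal usage sketch (hypothetical caller, not part of this file): a pass
// holding MemorySSA and AliasAnalysis results can combine the query defined
// below with the filtering of isReallyAClobber above to decide whether a
// load's memory may be written to elsewhere in the function:
//
//   if (!AMDGPU::isClobberedInFunction(&Load, &MSSA, &AA)) {
//     // Safe to treat the loaded value as unchanging in this function.
//   }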

bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access: it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all Defs to the WorkList and continue going up, checking all
  // the definitions of this memory location until the root. When all the
  // defs are exhausted and we have reached the entry state, there is no
  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
  // considers clobbers but which do not really write anything into memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
  return false;
}

} // end namespace llvm::AMDGPU