//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm {

namespace AMDGPU {

Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

// Return true if GV should be included in the LDS struct: for kernel LDS
// lowering (F non-null) when the kernel F uses it, and for module LDS
// lowering (F null) when it is used from any non-kernel function.
static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                                   const Function *F) {
  // We are not interested in kernel LDS lowering for module LDS itself.
  if (F && GV.getName() == "llvm.amdgcn.module.lds")
    return false;

  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (isa<GlobalValue>(V)) {
      // This use of the LDS variable is the initializer of a global variable.
      // This is ill-formed. The address of an LDS variable is kernel-dependent
      // and unknown until runtime. It can't be written to a global variable.
      continue;
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, so we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        // For module LDS lowering, lowering is required if the user
        // instruction is in a non-kernel function.
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    // User V should be a constant; recursively visit the users of V.
    assert(isa<Constant>(V) && "Expected a constant.");
    append_range(Stack, V->users());
  }

  return Ret;
}

bool isLDSVariableToLower(const GlobalVariable &GV) {
  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
    return false;
  }
  if (!GV.hasInitializer()) {
    // An addrspace(3) variable without an initializer implies cuda/hip extern
    // __shared__. The semantics for such a variable appear to be that all
    // extern __shared__ variables alias one another, in which case this
    // transform is not required.
    return false;
  }
  if (!isa<UndefValue>(GV.getInitializer())) {
    // Initializers are unimplemented for the LDS address space.
    // Leave such variables in place for consistent error reporting.
    return false;
  }
  if (GV.isConstant()) {
    // A constant undef variable can't be written to, and any load is undef,
    // so it should be eliminated by the optimizer. It could be dropped by the
    // back end if not. This pass skips over it.
    return false;
  }
  return true;
}

std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
                                                      const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (!isLDSVariableToLower(GV)) {
      continue;
    }
    if (!shouldLowerLDSToStruct(GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
  }
  return LocalVars;
}

// A MemoryDef that MemorySSA reports as clobbering Ptr is not a real clobber
// if it is a fence, an AMDGPU barrier/scheduling intrinsic, or an atomic that
// provably does not alias Ptr.
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
    case Intrinsic::amdgcn_sched_group_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics that do not alias the original load; from MemorySSA's
  // point of view any atomic is a universal MemoryDef, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In that
  // case add all the Defs to the worklist and keep walking up, checking all
  // the definitions of this memory location until the root. If all the defs
  // are exhausted and we reach the live-on-entry state, there is no clobber.
  // During the scan, ignore barriers and fences, which MemorySSA considers
  // clobbers but which do not actually write anything to memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (const auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace AMDGPU

} // end namespace llvm
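
//===----------------------------------------------------------------------===//
// Usage sketch (illustrative only, not part of this file's interface): a
// legacy pass that already has MemorySSA and AliasAnalysis results could
// query isClobberedInFunction() roughly as follows; the load LI below is
// hypothetical.
//
//   MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
//   AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
//   if (!AMDGPU::isClobberedInFunction(&LI, &MSSA, &AA)) {
//     // The loaded location is not overwritten on any path from function
//     // entry to this load (ignoring barriers, fences, and atomics that do
//     // not alias it), so the value can be treated as not clobbered.
//   }
//===----------------------------------------------------------------------===//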