1fe6060f1SDimitry Andric //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=// 2fe6060f1SDimitry Andric // 3fe6060f1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4fe6060f1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5fe6060f1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6fe6060f1SDimitry Andric // 7fe6060f1SDimitry Andric //===----------------------------------------------------------------------===// 8fe6060f1SDimitry Andric // 9fe6060f1SDimitry Andric // This pass eliminates LDS uses from non-kernel functions. 10fe6060f1SDimitry Andric // 11fe6060f1SDimitry Andric // The strategy is to create a new struct with a field for each LDS variable 12fe6060f1SDimitry Andric // and allocate that struct at the same address for every kernel. Uses of the 13fe6060f1SDimitry Andric // original LDS variables are then replaced with compile time offsets from that 14fe6060f1SDimitry Andric // known address. AMDGPUMachineFunction allocates the LDS global. 15fe6060f1SDimitry Andric // 16fe6060f1SDimitry Andric // Local variables with constant annotation or non-undef initializer are passed 1781ad6265SDimitry Andric // through unchanged for simplification or error diagnostics in later passes. 18fe6060f1SDimitry Andric // 19fe6060f1SDimitry Andric // To reduce the memory overhead variables that are only used by kernels are 20fe6060f1SDimitry Andric // excluded from this transform. The analysis to determine whether a variable 21fe6060f1SDimitry Andric // is only used by a kernel is cheap and conservative so this may allocate 22fe6060f1SDimitry Andric // a variable in every kernel when it was not strictly necessary to do so. 23fe6060f1SDimitry Andric // 24fe6060f1SDimitry Andric // A possible future refinement is to specialise the structure per-kernel, so 25fe6060f1SDimitry Andric // that fields can be elided based on more expensive analysis. 26fe6060f1SDimitry Andric // 27fe6060f1SDimitry Andric //===----------------------------------------------------------------------===// 28fe6060f1SDimitry Andric 29fe6060f1SDimitry Andric #include "AMDGPU.h" 30fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h" 3181ad6265SDimitry Andric #include "Utils/AMDGPUMemoryUtils.h" 32*972a253aSDimitry Andric #include "llvm/ADT/BitVector.h" 33*972a253aSDimitry Andric #include "llvm/ADT/DenseMap.h" 34fe6060f1SDimitry Andric #include "llvm/ADT/STLExtras.h" 3581ad6265SDimitry Andric #include "llvm/Analysis/CallGraph.h" 36fe6060f1SDimitry Andric #include "llvm/IR/Constants.h" 37fe6060f1SDimitry Andric #include "llvm/IR/DerivedTypes.h" 38fe6060f1SDimitry Andric #include "llvm/IR/IRBuilder.h" 39fe6060f1SDimitry Andric #include "llvm/IR/InlineAsm.h" 40fe6060f1SDimitry Andric #include "llvm/IR/Instructions.h" 41349cc55cSDimitry Andric #include "llvm/IR/MDBuilder.h" 42fe6060f1SDimitry Andric #include "llvm/InitializePasses.h" 43fe6060f1SDimitry Andric #include "llvm/Pass.h" 44fe6060f1SDimitry Andric #include "llvm/Support/CommandLine.h" 45fe6060f1SDimitry Andric #include "llvm/Support/Debug.h" 46fe6060f1SDimitry Andric #include "llvm/Support/OptimizedStructLayout.h" 47fe6060f1SDimitry Andric #include "llvm/Transforms/Utils/ModuleUtils.h" 48*972a253aSDimitry Andric #include <tuple> 49fe6060f1SDimitry Andric #include <vector> 50fe6060f1SDimitry Andric 51fe6060f1SDimitry Andric #define DEBUG_TYPE "amdgpu-lower-module-lds" 52fe6060f1SDimitry Andric 53fe6060f1SDimitry Andric using namespace llvm; 54fe6060f1SDimitry Andric 55fe6060f1SDimitry Andric static cl::opt<bool> SuperAlignLDSGlobals( 56fe6060f1SDimitry Andric "amdgpu-super-align-lds-globals", 57fe6060f1SDimitry Andric cl::desc("Increase alignment of LDS if it is not on align boundary"), 58fe6060f1SDimitry Andric cl::init(true), cl::Hidden); 59fe6060f1SDimitry Andric 60fe6060f1SDimitry Andric namespace { 61fe6060f1SDimitry Andric class AMDGPULowerModuleLDS : public ModulePass { 62fe6060f1SDimitry Andric 63fe6060f1SDimitry Andric static void removeFromUsedList(Module &M, StringRef Name, 64fe6060f1SDimitry Andric SmallPtrSetImpl<Constant *> &ToRemove) { 65fe6060f1SDimitry Andric GlobalVariable *GV = M.getNamedGlobal(Name); 66fe6060f1SDimitry Andric if (!GV || ToRemove.empty()) { 67fe6060f1SDimitry Andric return; 68fe6060f1SDimitry Andric } 69fe6060f1SDimitry Andric 70fe6060f1SDimitry Andric SmallVector<Constant *, 16> Init; 71fe6060f1SDimitry Andric auto *CA = cast<ConstantArray>(GV->getInitializer()); 72fe6060f1SDimitry Andric for (auto &Op : CA->operands()) { 73fe6060f1SDimitry Andric // ModuleUtils::appendToUsed only inserts Constants 74fe6060f1SDimitry Andric Constant *C = cast<Constant>(Op); 75fe6060f1SDimitry Andric if (!ToRemove.contains(C->stripPointerCasts())) { 76fe6060f1SDimitry Andric Init.push_back(C); 77fe6060f1SDimitry Andric } 78fe6060f1SDimitry Andric } 79fe6060f1SDimitry Andric 80fe6060f1SDimitry Andric if (Init.size() == CA->getNumOperands()) { 81fe6060f1SDimitry Andric return; // none to remove 82fe6060f1SDimitry Andric } 83fe6060f1SDimitry Andric 84fe6060f1SDimitry Andric GV->eraseFromParent(); 85fe6060f1SDimitry Andric 86fe6060f1SDimitry Andric for (Constant *C : ToRemove) { 87fe6060f1SDimitry Andric C->removeDeadConstantUsers(); 88fe6060f1SDimitry Andric } 89fe6060f1SDimitry Andric 90fe6060f1SDimitry Andric if (!Init.empty()) { 91fe6060f1SDimitry Andric ArrayType *ATy = 92fe6060f1SDimitry Andric ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); 93fe6060f1SDimitry Andric GV = 94fe6060f1SDimitry Andric new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, 95fe6060f1SDimitry Andric ConstantArray::get(ATy, Init), Name); 96fe6060f1SDimitry Andric GV->setSection("llvm.metadata"); 97fe6060f1SDimitry Andric } 98fe6060f1SDimitry Andric } 99fe6060f1SDimitry Andric 100fe6060f1SDimitry Andric static void 101fe6060f1SDimitry Andric removeFromUsedLists(Module &M, 102fe6060f1SDimitry Andric const std::vector<GlobalVariable *> &LocalVars) { 103*972a253aSDimitry Andric // The verifier rejects used lists containing an inttoptr of a constant 104*972a253aSDimitry Andric // so remove the variables from these lists before replaceAllUsesWith 105*972a253aSDimitry Andric 106fe6060f1SDimitry Andric SmallPtrSet<Constant *, 32> LocalVarsSet; 1070eae32dcSDimitry Andric for (GlobalVariable *LocalVar : LocalVars) 1080eae32dcSDimitry Andric if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) 109fe6060f1SDimitry Andric LocalVarsSet.insert(C); 110fe6060f1SDimitry Andric removeFromUsedList(M, "llvm.used", LocalVarsSet); 111fe6060f1SDimitry Andric removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); 112fe6060f1SDimitry Andric } 113fe6060f1SDimitry Andric 114fe6060f1SDimitry Andric static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, 115fe6060f1SDimitry Andric GlobalVariable *SGV) { 116fe6060f1SDimitry Andric // The llvm.amdgcn.module.lds instance is implicitly used by all kernels 117fe6060f1SDimitry Andric // that might call a function which accesses a field within it. This is 118fe6060f1SDimitry Andric // presently approximated to 'all kernels' if there are any such functions 119349cc55cSDimitry Andric // in the module. This implicit use is redefined as an explicit use here so 120fe6060f1SDimitry Andric // that later passes, specifically PromoteAlloca, account for the required 121fe6060f1SDimitry Andric // memory without any knowledge of this transform. 122fe6060f1SDimitry Andric 123fe6060f1SDimitry Andric // An operand bundle on llvm.donothing works because the call instruction 124fe6060f1SDimitry Andric // survives until after the last pass that needs to account for LDS. It is 125fe6060f1SDimitry Andric // better than inline asm as the latter survives until the end of codegen. A 126fe6060f1SDimitry Andric // totally robust solution would be a function with the same semantics as 127fe6060f1SDimitry Andric // llvm.donothing that takes a pointer to the instance and is lowered to a 128fe6060f1SDimitry Andric // no-op after LDS is allocated, but that is not presently necessary. 129fe6060f1SDimitry Andric 130fe6060f1SDimitry Andric LLVMContext &Ctx = Func->getContext(); 131fe6060f1SDimitry Andric 132fe6060f1SDimitry Andric Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); 133fe6060f1SDimitry Andric 134fe6060f1SDimitry Andric FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {}); 135fe6060f1SDimitry Andric 136fe6060f1SDimitry Andric Function *Decl = 137fe6060f1SDimitry Andric Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); 138fe6060f1SDimitry Andric 139fe6060f1SDimitry Andric Value *UseInstance[1] = {Builder.CreateInBoundsGEP( 140fe6060f1SDimitry Andric SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))}; 141fe6060f1SDimitry Andric 142fe6060f1SDimitry Andric Builder.CreateCall(FTy, Decl, {}, 143fe6060f1SDimitry Andric {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}, 144fe6060f1SDimitry Andric ""); 145fe6060f1SDimitry Andric } 146fe6060f1SDimitry Andric 147fe6060f1SDimitry Andric public: 148fe6060f1SDimitry Andric static char ID; 149fe6060f1SDimitry Andric 150fe6060f1SDimitry Andric AMDGPULowerModuleLDS() : ModulePass(ID) { 151fe6060f1SDimitry Andric initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry()); 152fe6060f1SDimitry Andric } 153fe6060f1SDimitry Andric 154fe6060f1SDimitry Andric bool runOnModule(Module &M) override { 155*972a253aSDimitry Andric LLVMContext &Ctx = M.getContext(); 15681ad6265SDimitry Andric CallGraph CG = CallGraph(M); 1570eae32dcSDimitry Andric bool Changed = superAlignLDSGlobals(M); 158*972a253aSDimitry Andric 159*972a253aSDimitry Andric // Move variables used by functions into amdgcn.module.lds 160fcaf7f86SDimitry Andric std::vector<GlobalVariable *> ModuleScopeVariables = 161fcaf7f86SDimitry Andric AMDGPU::findVariablesToLower(M, nullptr); 162*972a253aSDimitry Andric if (!ModuleScopeVariables.empty()) { 163*972a253aSDimitry Andric std::string VarName = "llvm.amdgcn.module.lds"; 164fe6060f1SDimitry Andric 165*972a253aSDimitry Andric GlobalVariable *SGV; 166*972a253aSDimitry Andric DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; 167*972a253aSDimitry Andric std::tie(SGV, LDSVarsToConstantGEP) = 168*972a253aSDimitry Andric createLDSVariableReplacement(M, VarName, ModuleScopeVariables); 169*972a253aSDimitry Andric 170*972a253aSDimitry Andric appendToCompilerUsed( 171*972a253aSDimitry Andric M, {static_cast<GlobalValue *>( 172*972a253aSDimitry Andric ConstantExpr::getPointerBitCastOrAddrSpaceCast( 173*972a253aSDimitry Andric cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); 174*972a253aSDimitry Andric 175*972a253aSDimitry Andric removeFromUsedLists(M, ModuleScopeVariables); 176*972a253aSDimitry Andric replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV, 177*972a253aSDimitry Andric LDSVarsToConstantGEP, 178*972a253aSDimitry Andric [](Use &) { return true; }); 179*972a253aSDimitry Andric 180*972a253aSDimitry Andric // This ensures the variable is allocated when called functions access it. 181*972a253aSDimitry Andric // It also lets other passes, specifically PromoteAlloca, accurately 182*972a253aSDimitry Andric // calculate how much LDS will be used by the kernel after lowering. 183*972a253aSDimitry Andric 184*972a253aSDimitry Andric IRBuilder<> Builder(Ctx); 185*972a253aSDimitry Andric for (Function &Func : M.functions()) { 186*972a253aSDimitry Andric if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { 187*972a253aSDimitry Andric const CallGraphNode *N = CG[&Func]; 188*972a253aSDimitry Andric const bool CalleesRequireModuleLDS = N->size() > 0; 189*972a253aSDimitry Andric 190*972a253aSDimitry Andric if (CalleesRequireModuleLDS) { 191*972a253aSDimitry Andric // If a function this kernel might call requires module LDS, 192*972a253aSDimitry Andric // annotate the kernel to let later passes know it will allocate 193*972a253aSDimitry Andric // this structure, even if not apparent from the IR. 194*972a253aSDimitry Andric markUsedByKernel(Builder, &Func, SGV); 195*972a253aSDimitry Andric } else { 196*972a253aSDimitry Andric // However if we are certain this kernel cannot call a function that 197*972a253aSDimitry Andric // requires module LDS, annotate the kernel so the backend can elide 198*972a253aSDimitry Andric // the allocation without repeating callgraph walks. 199*972a253aSDimitry Andric Func.addFnAttr("amdgpu-elide-module-lds"); 200*972a253aSDimitry Andric } 201*972a253aSDimitry Andric } 202*972a253aSDimitry Andric } 203*972a253aSDimitry Andric 204*972a253aSDimitry Andric Changed = true; 205*972a253aSDimitry Andric } 206*972a253aSDimitry Andric 207*972a253aSDimitry Andric // Move variables used by kernels into per-kernel instances 208fe6060f1SDimitry Andric for (Function &F : M.functions()) { 209349cc55cSDimitry Andric if (F.isDeclaration()) 210349cc55cSDimitry Andric continue; 211349cc55cSDimitry Andric 212fe6060f1SDimitry Andric // Only lower compute kernels' LDS. 213fe6060f1SDimitry Andric if (!AMDGPU::isKernel(F.getCallingConv())) 214fe6060f1SDimitry Andric continue; 215*972a253aSDimitry Andric 216fcaf7f86SDimitry Andric std::vector<GlobalVariable *> KernelUsedVariables = 217fcaf7f86SDimitry Andric AMDGPU::findVariablesToLower(M, &F); 218*972a253aSDimitry Andric 219*972a253aSDimitry Andric // Replace all constant uses with instructions if they belong to the 220*972a253aSDimitry Andric // current kernel. Unnecessary, removing will cause test churn. 221*972a253aSDimitry Andric for (size_t I = 0; I < KernelUsedVariables.size(); I++) { 222*972a253aSDimitry Andric GlobalVariable *GV = KernelUsedVariables[I]; 223*972a253aSDimitry Andric for (User *U : make_early_inc_range(GV->users())) { 224*972a253aSDimitry Andric if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) 225*972a253aSDimitry Andric AMDGPU::replaceConstantUsesInFunction(C, &F); 226*972a253aSDimitry Andric } 227*972a253aSDimitry Andric GV->removeDeadConstantUsers(); 228*972a253aSDimitry Andric } 229*972a253aSDimitry Andric 230*972a253aSDimitry Andric if (!KernelUsedVariables.empty()) { 231*972a253aSDimitry Andric std::string VarName = 232*972a253aSDimitry Andric (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str(); 233*972a253aSDimitry Andric GlobalVariable *SGV; 234*972a253aSDimitry Andric DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; 235*972a253aSDimitry Andric std::tie(SGV, LDSVarsToConstantGEP) = 236*972a253aSDimitry Andric createLDSVariableReplacement(M, VarName, KernelUsedVariables); 237*972a253aSDimitry Andric 238*972a253aSDimitry Andric removeFromUsedLists(M, KernelUsedVariables); 239*972a253aSDimitry Andric replaceLDSVariablesWithStruct( 240*972a253aSDimitry Andric M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) { 241*972a253aSDimitry Andric Instruction *I = dyn_cast<Instruction>(U.getUser()); 242*972a253aSDimitry Andric return I && I->getFunction() == &F; 243*972a253aSDimitry Andric }); 244*972a253aSDimitry Andric Changed = true; 245*972a253aSDimitry Andric } 246fe6060f1SDimitry Andric } 247fe6060f1SDimitry Andric 248fe6060f1SDimitry Andric return Changed; 249fe6060f1SDimitry Andric } 250fe6060f1SDimitry Andric 251fe6060f1SDimitry Andric private: 252fe6060f1SDimitry Andric // Increase the alignment of LDS globals if necessary to maximise the chance 253fe6060f1SDimitry Andric // that we can use aligned LDS instructions to access them. 2540eae32dcSDimitry Andric static bool superAlignLDSGlobals(Module &M) { 2550eae32dcSDimitry Andric const DataLayout &DL = M.getDataLayout(); 2560eae32dcSDimitry Andric bool Changed = false; 2570eae32dcSDimitry Andric if (!SuperAlignLDSGlobals) { 2580eae32dcSDimitry Andric return Changed; 2590eae32dcSDimitry Andric } 2600eae32dcSDimitry Andric 2610eae32dcSDimitry Andric for (auto &GV : M.globals()) { 2620eae32dcSDimitry Andric if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { 2630eae32dcSDimitry Andric // Only changing alignment of LDS variables 2640eae32dcSDimitry Andric continue; 2650eae32dcSDimitry Andric } 2660eae32dcSDimitry Andric if (!GV.hasInitializer()) { 2670eae32dcSDimitry Andric // cuda/hip extern __shared__ variable, leave alignment alone 2680eae32dcSDimitry Andric continue; 2690eae32dcSDimitry Andric } 2700eae32dcSDimitry Andric 2710eae32dcSDimitry Andric Align Alignment = AMDGPU::getAlign(DL, &GV); 2720eae32dcSDimitry Andric TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); 273fe6060f1SDimitry Andric 274fe6060f1SDimitry Andric if (GVSize > 8) { 275fe6060f1SDimitry Andric // We might want to use a b96 or b128 load/store 276fe6060f1SDimitry Andric Alignment = std::max(Alignment, Align(16)); 277fe6060f1SDimitry Andric } else if (GVSize > 4) { 278fe6060f1SDimitry Andric // We might want to use a b64 load/store 279fe6060f1SDimitry Andric Alignment = std::max(Alignment, Align(8)); 280fe6060f1SDimitry Andric } else if (GVSize > 2) { 281fe6060f1SDimitry Andric // We might want to use a b32 load/store 282fe6060f1SDimitry Andric Alignment = std::max(Alignment, Align(4)); 283fe6060f1SDimitry Andric } else if (GVSize > 1) { 284fe6060f1SDimitry Andric // We might want to use a b16 load/store 285fe6060f1SDimitry Andric Alignment = std::max(Alignment, Align(2)); 286fe6060f1SDimitry Andric } 287fe6060f1SDimitry Andric 2880eae32dcSDimitry Andric if (Alignment != AMDGPU::getAlign(DL, &GV)) { 2890eae32dcSDimitry Andric Changed = true; 2900eae32dcSDimitry Andric GV.setAlignment(Alignment); 291fe6060f1SDimitry Andric } 292fe6060f1SDimitry Andric } 2930eae32dcSDimitry Andric return Changed; 2940eae32dcSDimitry Andric } 2950eae32dcSDimitry Andric 296*972a253aSDimitry Andric std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>> 297*972a253aSDimitry Andric createLDSVariableReplacement( 298*972a253aSDimitry Andric Module &M, std::string VarName, 299*972a253aSDimitry Andric std::vector<GlobalVariable *> const &LDSVarsToTransform) { 300*972a253aSDimitry Andric // Create a struct instance containing LDSVarsToTransform and map from those 301*972a253aSDimitry Andric // variables to ConstantExprGEP 302*972a253aSDimitry Andric // Variables may be introduced to meet alignment requirements. No aliasing 303*972a253aSDimitry Andric // metadata is useful for these as they have no uses. Erased before return. 304*972a253aSDimitry Andric 3050eae32dcSDimitry Andric LLVMContext &Ctx = M.getContext(); 3060eae32dcSDimitry Andric const DataLayout &DL = M.getDataLayout(); 307*972a253aSDimitry Andric assert(!LDSVarsToTransform.empty()); 308fe6060f1SDimitry Andric 309fe6060f1SDimitry Andric SmallVector<OptimizedStructLayoutField, 8> LayoutFields; 310fcaf7f86SDimitry Andric LayoutFields.reserve(LDSVarsToTransform.size()); 311fcaf7f86SDimitry Andric for (GlobalVariable *GV : LDSVarsToTransform) { 312fe6060f1SDimitry Andric OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()), 313fe6060f1SDimitry Andric AMDGPU::getAlign(DL, GV)); 314fe6060f1SDimitry Andric LayoutFields.emplace_back(F); 315fe6060f1SDimitry Andric } 316fe6060f1SDimitry Andric 317fe6060f1SDimitry Andric performOptimizedStructLayout(LayoutFields); 318fe6060f1SDimitry Andric 319fe6060f1SDimitry Andric std::vector<GlobalVariable *> LocalVars; 320*972a253aSDimitry Andric BitVector IsPaddingField; 321fcaf7f86SDimitry Andric LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large 322*972a253aSDimitry Andric IsPaddingField.reserve(LDSVarsToTransform.size()); 323fe6060f1SDimitry Andric { 324fe6060f1SDimitry Andric uint64_t CurrentOffset = 0; 325fe6060f1SDimitry Andric for (size_t I = 0; I < LayoutFields.size(); I++) { 326fe6060f1SDimitry Andric GlobalVariable *FGV = static_cast<GlobalVariable *>( 327fe6060f1SDimitry Andric const_cast<void *>(LayoutFields[I].Id)); 328fe6060f1SDimitry Andric Align DataAlign = LayoutFields[I].Alignment; 329fe6060f1SDimitry Andric 330fe6060f1SDimitry Andric uint64_t DataAlignV = DataAlign.value(); 331fe6060f1SDimitry Andric if (uint64_t Rem = CurrentOffset % DataAlignV) { 332fe6060f1SDimitry Andric uint64_t Padding = DataAlignV - Rem; 333fe6060f1SDimitry Andric 334fe6060f1SDimitry Andric // Append an array of padding bytes to meet alignment requested 335fe6060f1SDimitry Andric // Note (o + (a - (o % a)) ) % a == 0 336fe6060f1SDimitry Andric // (offset + Padding ) % align == 0 337fe6060f1SDimitry Andric 338fe6060f1SDimitry Andric Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding); 339fe6060f1SDimitry Andric LocalVars.push_back(new GlobalVariable( 340fe6060f1SDimitry Andric M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), 341fe6060f1SDimitry Andric "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, 342fe6060f1SDimitry Andric false)); 343*972a253aSDimitry Andric IsPaddingField.push_back(true); 344fe6060f1SDimitry Andric CurrentOffset += Padding; 345fe6060f1SDimitry Andric } 346fe6060f1SDimitry Andric 347fe6060f1SDimitry Andric LocalVars.push_back(FGV); 348*972a253aSDimitry Andric IsPaddingField.push_back(false); 349fe6060f1SDimitry Andric CurrentOffset += LayoutFields[I].Size; 350fe6060f1SDimitry Andric } 351fe6060f1SDimitry Andric } 352fe6060f1SDimitry Andric 353fe6060f1SDimitry Andric std::vector<Type *> LocalVarTypes; 354fe6060f1SDimitry Andric LocalVarTypes.reserve(LocalVars.size()); 355fe6060f1SDimitry Andric std::transform( 356fe6060f1SDimitry Andric LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes), 357fe6060f1SDimitry Andric [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); 358fe6060f1SDimitry Andric 359fe6060f1SDimitry Andric StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); 360fe6060f1SDimitry Andric 361fe6060f1SDimitry Andric Align StructAlign = 362fe6060f1SDimitry Andric AMDGPU::getAlign(DL, LocalVars[0]); 363fe6060f1SDimitry Andric 364fe6060f1SDimitry Andric GlobalVariable *SGV = new GlobalVariable( 365fe6060f1SDimitry Andric M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy), 366fe6060f1SDimitry Andric VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, 367fe6060f1SDimitry Andric false); 368fe6060f1SDimitry Andric SGV->setAlignment(StructAlign); 369*972a253aSDimitry Andric 370*972a253aSDimitry Andric DenseMap<GlobalVariable *, Constant *> Map; 371*972a253aSDimitry Andric Type *I32 = Type::getInt32Ty(Ctx); 372*972a253aSDimitry Andric for (size_t I = 0; I < LocalVars.size(); I++) { 373*972a253aSDimitry Andric GlobalVariable *GV = LocalVars[I]; 374*972a253aSDimitry Andric Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; 375*972a253aSDimitry Andric Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true); 376*972a253aSDimitry Andric if (IsPaddingField[I]) { 377*972a253aSDimitry Andric assert(GV->use_empty()); 378*972a253aSDimitry Andric GV->eraseFromParent(); 379*972a253aSDimitry Andric } else { 380*972a253aSDimitry Andric Map[GV] = GEP; 381*972a253aSDimitry Andric } 382*972a253aSDimitry Andric } 383*972a253aSDimitry Andric assert(Map.size() == LDSVarsToTransform.size()); 384*972a253aSDimitry Andric return {SGV, std::move(Map)}; 385fe6060f1SDimitry Andric } 386fe6060f1SDimitry Andric 387*972a253aSDimitry Andric template <typename PredicateTy> 388*972a253aSDimitry Andric void replaceLDSVariablesWithStruct( 389*972a253aSDimitry Andric Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform, 390*972a253aSDimitry Andric GlobalVariable *SGV, 391*972a253aSDimitry Andric DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP, 392*972a253aSDimitry Andric PredicateTy Predicate) { 393*972a253aSDimitry Andric LLVMContext &Ctx = M.getContext(); 394*972a253aSDimitry Andric const DataLayout &DL = M.getDataLayout(); 395fe6060f1SDimitry Andric 396349cc55cSDimitry Andric // Create alias.scope and their lists. Each field in the new structure 397349cc55cSDimitry Andric // does not alias with all other fields. 398349cc55cSDimitry Andric SmallVector<MDNode *> AliasScopes; 399349cc55cSDimitry Andric SmallVector<Metadata *> NoAliasList; 400*972a253aSDimitry Andric const size_t NumberVars = LDSVarsToTransform.size(); 401*972a253aSDimitry Andric if (NumberVars > 1) { 402349cc55cSDimitry Andric MDBuilder MDB(Ctx); 403*972a253aSDimitry Andric AliasScopes.reserve(NumberVars); 404349cc55cSDimitry Andric MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); 405*972a253aSDimitry Andric for (size_t I = 0; I < NumberVars; I++) { 406349cc55cSDimitry Andric MDNode *Scope = MDB.createAnonymousAliasScope(Domain); 407349cc55cSDimitry Andric AliasScopes.push_back(Scope); 408349cc55cSDimitry Andric } 409349cc55cSDimitry Andric NoAliasList.append(&AliasScopes[1], AliasScopes.end()); 410349cc55cSDimitry Andric } 411349cc55cSDimitry Andric 412*972a253aSDimitry Andric // Replace uses of ith variable with a constantexpr to the corresponding 413*972a253aSDimitry Andric // field of the instance that will be allocated by AMDGPUMachineFunction 414*972a253aSDimitry Andric for (size_t I = 0; I < NumberVars; I++) { 415*972a253aSDimitry Andric GlobalVariable *GV = LDSVarsToTransform[I]; 416*972a253aSDimitry Andric Constant *GEP = LDSVarsToConstantGEP[GV]; 417fe6060f1SDimitry Andric 418*972a253aSDimitry Andric GV->replaceUsesWithIf(GEP, Predicate); 419fe6060f1SDimitry Andric if (GV->use_empty()) { 420fe6060f1SDimitry Andric GV->eraseFromParent(); 421fe6060f1SDimitry Andric } 422fe6060f1SDimitry Andric 423*972a253aSDimitry Andric APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0); 424*972a253aSDimitry Andric GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff); 425*972a253aSDimitry Andric uint64_t Offset = APOff.getZExtValue(); 426*972a253aSDimitry Andric 427*972a253aSDimitry Andric Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset); 428349cc55cSDimitry Andric 429349cc55cSDimitry Andric if (I) 430349cc55cSDimitry Andric NoAliasList[I - 1] = AliasScopes[I - 1]; 431349cc55cSDimitry Andric MDNode *NoAlias = 432349cc55cSDimitry Andric NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList); 433349cc55cSDimitry Andric MDNode *AliasScope = 434349cc55cSDimitry Andric AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]}); 435349cc55cSDimitry Andric 436349cc55cSDimitry Andric refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); 437fe6060f1SDimitry Andric } 438fe6060f1SDimitry Andric } 439fe6060f1SDimitry Andric 440349cc55cSDimitry Andric void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL, 441349cc55cSDimitry Andric MDNode *AliasScope, MDNode *NoAlias, 442fe6060f1SDimitry Andric unsigned MaxDepth = 5) { 443349cc55cSDimitry Andric if (!MaxDepth || (A == 1 && !AliasScope)) 444fe6060f1SDimitry Andric return; 445fe6060f1SDimitry Andric 446fe6060f1SDimitry Andric for (User *U : Ptr->users()) { 447349cc55cSDimitry Andric if (auto *I = dyn_cast<Instruction>(U)) { 448349cc55cSDimitry Andric if (AliasScope && I->mayReadOrWriteMemory()) { 449349cc55cSDimitry Andric MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope); 450349cc55cSDimitry Andric AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope) 451349cc55cSDimitry Andric : AliasScope); 452349cc55cSDimitry Andric I->setMetadata(LLVMContext::MD_alias_scope, AS); 453349cc55cSDimitry Andric 454349cc55cSDimitry Andric MDNode *NA = I->getMetadata(LLVMContext::MD_noalias); 455349cc55cSDimitry Andric NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias); 456349cc55cSDimitry Andric I->setMetadata(LLVMContext::MD_noalias, NA); 457349cc55cSDimitry Andric } 458349cc55cSDimitry Andric } 459349cc55cSDimitry Andric 460fe6060f1SDimitry Andric if (auto *LI = dyn_cast<LoadInst>(U)) { 461fe6060f1SDimitry Andric LI->setAlignment(std::max(A, LI->getAlign())); 462fe6060f1SDimitry Andric continue; 463fe6060f1SDimitry Andric } 464fe6060f1SDimitry Andric if (auto *SI = dyn_cast<StoreInst>(U)) { 465fe6060f1SDimitry Andric if (SI->getPointerOperand() == Ptr) 466fe6060f1SDimitry Andric SI->setAlignment(std::max(A, SI->getAlign())); 467fe6060f1SDimitry Andric continue; 468fe6060f1SDimitry Andric } 469fe6060f1SDimitry Andric if (auto *AI = dyn_cast<AtomicRMWInst>(U)) { 470fe6060f1SDimitry Andric // None of atomicrmw operations can work on pointers, but let's 471fe6060f1SDimitry Andric // check it anyway in case it will or we will process ConstantExpr. 472fe6060f1SDimitry Andric if (AI->getPointerOperand() == Ptr) 473fe6060f1SDimitry Andric AI->setAlignment(std::max(A, AI->getAlign())); 474fe6060f1SDimitry Andric continue; 475fe6060f1SDimitry Andric } 476fe6060f1SDimitry Andric if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) { 477fe6060f1SDimitry Andric if (AI->getPointerOperand() == Ptr) 478fe6060f1SDimitry Andric AI->setAlignment(std::max(A, AI->getAlign())); 479fe6060f1SDimitry Andric continue; 480fe6060f1SDimitry Andric } 481fe6060f1SDimitry Andric if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) { 482fe6060f1SDimitry Andric unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); 483fe6060f1SDimitry Andric APInt Off(BitWidth, 0); 484349cc55cSDimitry Andric if (GEP->getPointerOperand() == Ptr) { 485349cc55cSDimitry Andric Align GA; 486349cc55cSDimitry Andric if (GEP->accumulateConstantOffset(DL, Off)) 487349cc55cSDimitry Andric GA = commonAlignment(A, Off.getLimitedValue()); 488349cc55cSDimitry Andric refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias, 489349cc55cSDimitry Andric MaxDepth - 1); 490fe6060f1SDimitry Andric } 491fe6060f1SDimitry Andric continue; 492fe6060f1SDimitry Andric } 493fe6060f1SDimitry Andric if (auto *I = dyn_cast<Instruction>(U)) { 494fe6060f1SDimitry Andric if (I->getOpcode() == Instruction::BitCast || 495fe6060f1SDimitry Andric I->getOpcode() == Instruction::AddrSpaceCast) 496349cc55cSDimitry Andric refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1); 497fe6060f1SDimitry Andric } 498fe6060f1SDimitry Andric } 499fe6060f1SDimitry Andric } 500fe6060f1SDimitry Andric }; 501fe6060f1SDimitry Andric 502fe6060f1SDimitry Andric } // namespace 503fe6060f1SDimitry Andric char AMDGPULowerModuleLDS::ID = 0; 504fe6060f1SDimitry Andric 505fe6060f1SDimitry Andric char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID; 506fe6060f1SDimitry Andric 507fe6060f1SDimitry Andric INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE, 508fe6060f1SDimitry Andric "Lower uses of LDS variables from non-kernel functions", false, 509fe6060f1SDimitry Andric false) 510fe6060f1SDimitry Andric 511fe6060f1SDimitry Andric ModulePass *llvm::createAMDGPULowerModuleLDSPass() { 512fe6060f1SDimitry Andric return new AMDGPULowerModuleLDS(); 513fe6060f1SDimitry Andric } 514fe6060f1SDimitry Andric 515fe6060f1SDimitry Andric PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, 516fe6060f1SDimitry Andric ModuleAnalysisManager &) { 517fe6060f1SDimitry Andric return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none() 518fe6060f1SDimitry Andric : PreservedAnalyses::all(); 519fe6060f1SDimitry Andric } 520