xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (revision 972a253a57b6f144b0e4a3e2080a2a0076ec55a0)
1fe6060f1SDimitry Andric //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2fe6060f1SDimitry Andric //
3fe6060f1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4fe6060f1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5fe6060f1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6fe6060f1SDimitry Andric //
7fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
8fe6060f1SDimitry Andric //
9fe6060f1SDimitry Andric // This pass eliminates LDS uses from non-kernel functions.
10fe6060f1SDimitry Andric //
11fe6060f1SDimitry Andric // The strategy is to create a new struct with a field for each LDS variable
12fe6060f1SDimitry Andric // and allocate that struct at the same address for every kernel. Uses of the
13fe6060f1SDimitry Andric // original LDS variables are then replaced with compile time offsets from that
14fe6060f1SDimitry Andric // known address. AMDGPUMachineFunction allocates the LDS global.
15fe6060f1SDimitry Andric //
16fe6060f1SDimitry Andric // Local variables with constant annotation or non-undef initializer are passed
1781ad6265SDimitry Andric // through unchanged for simplification or error diagnostics in later passes.
18fe6060f1SDimitry Andric //
19fe6060f1SDimitry Andric // To reduce the memory overhead variables that are only used by kernels are
20fe6060f1SDimitry Andric // excluded from this transform. The analysis to determine whether a variable
21fe6060f1SDimitry Andric // is only used by a kernel is cheap and conservative so this may allocate
22fe6060f1SDimitry Andric // a variable in every kernel when it was not strictly necessary to do so.
23fe6060f1SDimitry Andric //
24fe6060f1SDimitry Andric // A possible future refinement is to specialise the structure per-kernel, so
25fe6060f1SDimitry Andric // that fields can be elided based on more expensive analysis.
26fe6060f1SDimitry Andric //
27fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
28fe6060f1SDimitry Andric 
29fe6060f1SDimitry Andric #include "AMDGPU.h"
30fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
3181ad6265SDimitry Andric #include "Utils/AMDGPUMemoryUtils.h"
32*972a253aSDimitry Andric #include "llvm/ADT/BitVector.h"
33*972a253aSDimitry Andric #include "llvm/ADT/DenseMap.h"
34fe6060f1SDimitry Andric #include "llvm/ADT/STLExtras.h"
3581ad6265SDimitry Andric #include "llvm/Analysis/CallGraph.h"
36fe6060f1SDimitry Andric #include "llvm/IR/Constants.h"
37fe6060f1SDimitry Andric #include "llvm/IR/DerivedTypes.h"
38fe6060f1SDimitry Andric #include "llvm/IR/IRBuilder.h"
39fe6060f1SDimitry Andric #include "llvm/IR/InlineAsm.h"
40fe6060f1SDimitry Andric #include "llvm/IR/Instructions.h"
41349cc55cSDimitry Andric #include "llvm/IR/MDBuilder.h"
42fe6060f1SDimitry Andric #include "llvm/InitializePasses.h"
43fe6060f1SDimitry Andric #include "llvm/Pass.h"
44fe6060f1SDimitry Andric #include "llvm/Support/CommandLine.h"
45fe6060f1SDimitry Andric #include "llvm/Support/Debug.h"
46fe6060f1SDimitry Andric #include "llvm/Support/OptimizedStructLayout.h"
47fe6060f1SDimitry Andric #include "llvm/Transforms/Utils/ModuleUtils.h"
48*972a253aSDimitry Andric #include <tuple>
49fe6060f1SDimitry Andric #include <vector>
50fe6060f1SDimitry Andric 
51fe6060f1SDimitry Andric #define DEBUG_TYPE "amdgpu-lower-module-lds"
52fe6060f1SDimitry Andric 
53fe6060f1SDimitry Andric using namespace llvm;
54fe6060f1SDimitry Andric 
55fe6060f1SDimitry Andric static cl::opt<bool> SuperAlignLDSGlobals(
56fe6060f1SDimitry Andric     "amdgpu-super-align-lds-globals",
57fe6060f1SDimitry Andric     cl::desc("Increase alignment of LDS if it is not on align boundary"),
58fe6060f1SDimitry Andric     cl::init(true), cl::Hidden);
59fe6060f1SDimitry Andric 
60fe6060f1SDimitry Andric namespace {
61fe6060f1SDimitry Andric class AMDGPULowerModuleLDS : public ModulePass {
62fe6060f1SDimitry Andric 
63fe6060f1SDimitry Andric   static void removeFromUsedList(Module &M, StringRef Name,
64fe6060f1SDimitry Andric                                  SmallPtrSetImpl<Constant *> &ToRemove) {
65fe6060f1SDimitry Andric     GlobalVariable *GV = M.getNamedGlobal(Name);
66fe6060f1SDimitry Andric     if (!GV || ToRemove.empty()) {
67fe6060f1SDimitry Andric       return;
68fe6060f1SDimitry Andric     }
69fe6060f1SDimitry Andric 
70fe6060f1SDimitry Andric     SmallVector<Constant *, 16> Init;
71fe6060f1SDimitry Andric     auto *CA = cast<ConstantArray>(GV->getInitializer());
72fe6060f1SDimitry Andric     for (auto &Op : CA->operands()) {
73fe6060f1SDimitry Andric       // ModuleUtils::appendToUsed only inserts Constants
74fe6060f1SDimitry Andric       Constant *C = cast<Constant>(Op);
75fe6060f1SDimitry Andric       if (!ToRemove.contains(C->stripPointerCasts())) {
76fe6060f1SDimitry Andric         Init.push_back(C);
77fe6060f1SDimitry Andric       }
78fe6060f1SDimitry Andric     }
79fe6060f1SDimitry Andric 
80fe6060f1SDimitry Andric     if (Init.size() == CA->getNumOperands()) {
81fe6060f1SDimitry Andric       return; // none to remove
82fe6060f1SDimitry Andric     }
83fe6060f1SDimitry Andric 
84fe6060f1SDimitry Andric     GV->eraseFromParent();
85fe6060f1SDimitry Andric 
86fe6060f1SDimitry Andric     for (Constant *C : ToRemove) {
87fe6060f1SDimitry Andric       C->removeDeadConstantUsers();
88fe6060f1SDimitry Andric     }
89fe6060f1SDimitry Andric 
90fe6060f1SDimitry Andric     if (!Init.empty()) {
91fe6060f1SDimitry Andric       ArrayType *ATy =
92fe6060f1SDimitry Andric           ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
93fe6060f1SDimitry Andric       GV =
94fe6060f1SDimitry Andric           new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
95fe6060f1SDimitry Andric                                    ConstantArray::get(ATy, Init), Name);
96fe6060f1SDimitry Andric       GV->setSection("llvm.metadata");
97fe6060f1SDimitry Andric     }
98fe6060f1SDimitry Andric   }
99fe6060f1SDimitry Andric 
100fe6060f1SDimitry Andric   static void
101fe6060f1SDimitry Andric   removeFromUsedLists(Module &M,
102fe6060f1SDimitry Andric                       const std::vector<GlobalVariable *> &LocalVars) {
103*972a253aSDimitry Andric     // The verifier rejects used lists containing an inttoptr of a constant
104*972a253aSDimitry Andric     // so remove the variables from these lists before replaceAllUsesWith
105*972a253aSDimitry Andric 
106fe6060f1SDimitry Andric     SmallPtrSet<Constant *, 32> LocalVarsSet;
1070eae32dcSDimitry Andric     for (GlobalVariable *LocalVar : LocalVars)
1080eae32dcSDimitry Andric       if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
109fe6060f1SDimitry Andric         LocalVarsSet.insert(C);
110fe6060f1SDimitry Andric     removeFromUsedList(M, "llvm.used", LocalVarsSet);
111fe6060f1SDimitry Andric     removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
112fe6060f1SDimitry Andric   }
113fe6060f1SDimitry Andric 
114fe6060f1SDimitry Andric   static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
115fe6060f1SDimitry Andric                                GlobalVariable *SGV) {
116fe6060f1SDimitry Andric     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
117fe6060f1SDimitry Andric     // that might call a function which accesses a field within it. This is
118fe6060f1SDimitry Andric     // presently approximated to 'all kernels' if there are any such functions
119349cc55cSDimitry Andric     // in the module. This implicit use is redefined as an explicit use here so
120fe6060f1SDimitry Andric     // that later passes, specifically PromoteAlloca, account for the required
121fe6060f1SDimitry Andric     // memory without any knowledge of this transform.
122fe6060f1SDimitry Andric 
123fe6060f1SDimitry Andric     // An operand bundle on llvm.donothing works because the call instruction
124fe6060f1SDimitry Andric     // survives until after the last pass that needs to account for LDS. It is
125fe6060f1SDimitry Andric     // better than inline asm as the latter survives until the end of codegen. A
126fe6060f1SDimitry Andric     // totally robust solution would be a function with the same semantics as
127fe6060f1SDimitry Andric     // llvm.donothing that takes a pointer to the instance and is lowered to a
128fe6060f1SDimitry Andric     // no-op after LDS is allocated, but that is not presently necessary.
129fe6060f1SDimitry Andric 
130fe6060f1SDimitry Andric     LLVMContext &Ctx = Func->getContext();
131fe6060f1SDimitry Andric 
132fe6060f1SDimitry Andric     Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
133fe6060f1SDimitry Andric 
134fe6060f1SDimitry Andric     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
135fe6060f1SDimitry Andric 
136fe6060f1SDimitry Andric     Function *Decl =
137fe6060f1SDimitry Andric         Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
138fe6060f1SDimitry Andric 
139fe6060f1SDimitry Andric     Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
140fe6060f1SDimitry Andric         SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
141fe6060f1SDimitry Andric 
142fe6060f1SDimitry Andric     Builder.CreateCall(FTy, Decl, {},
143fe6060f1SDimitry Andric                        {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
144fe6060f1SDimitry Andric                        "");
145fe6060f1SDimitry Andric   }
146fe6060f1SDimitry Andric 
147fe6060f1SDimitry Andric public:
148fe6060f1SDimitry Andric   static char ID;
149fe6060f1SDimitry Andric 
150fe6060f1SDimitry Andric   AMDGPULowerModuleLDS() : ModulePass(ID) {
151fe6060f1SDimitry Andric     initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
152fe6060f1SDimitry Andric   }
153fe6060f1SDimitry Andric 
154fe6060f1SDimitry Andric   bool runOnModule(Module &M) override {
155*972a253aSDimitry Andric     LLVMContext &Ctx = M.getContext();
15681ad6265SDimitry Andric     CallGraph CG = CallGraph(M);
1570eae32dcSDimitry Andric     bool Changed = superAlignLDSGlobals(M);
158*972a253aSDimitry Andric 
159*972a253aSDimitry Andric     // Move variables used by functions into amdgcn.module.lds
160fcaf7f86SDimitry Andric     std::vector<GlobalVariable *> ModuleScopeVariables =
161fcaf7f86SDimitry Andric         AMDGPU::findVariablesToLower(M, nullptr);
162*972a253aSDimitry Andric     if (!ModuleScopeVariables.empty()) {
163*972a253aSDimitry Andric       std::string VarName = "llvm.amdgcn.module.lds";
164fe6060f1SDimitry Andric 
165*972a253aSDimitry Andric       GlobalVariable *SGV;
166*972a253aSDimitry Andric       DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
167*972a253aSDimitry Andric       std::tie(SGV, LDSVarsToConstantGEP) =
168*972a253aSDimitry Andric           createLDSVariableReplacement(M, VarName, ModuleScopeVariables);
169*972a253aSDimitry Andric 
170*972a253aSDimitry Andric       appendToCompilerUsed(
171*972a253aSDimitry Andric           M, {static_cast<GlobalValue *>(
172*972a253aSDimitry Andric                  ConstantExpr::getPointerBitCastOrAddrSpaceCast(
173*972a253aSDimitry Andric                      cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
174*972a253aSDimitry Andric 
175*972a253aSDimitry Andric       removeFromUsedLists(M, ModuleScopeVariables);
176*972a253aSDimitry Andric       replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV,
177*972a253aSDimitry Andric                                     LDSVarsToConstantGEP,
178*972a253aSDimitry Andric                                     [](Use &) { return true; });
179*972a253aSDimitry Andric 
180*972a253aSDimitry Andric       // This ensures the variable is allocated when called functions access it.
181*972a253aSDimitry Andric       // It also lets other passes, specifically PromoteAlloca, accurately
182*972a253aSDimitry Andric       // calculate how much LDS will be used by the kernel after lowering.
183*972a253aSDimitry Andric 
184*972a253aSDimitry Andric       IRBuilder<> Builder(Ctx);
185*972a253aSDimitry Andric       for (Function &Func : M.functions()) {
186*972a253aSDimitry Andric         if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
187*972a253aSDimitry Andric           const CallGraphNode *N = CG[&Func];
188*972a253aSDimitry Andric           const bool CalleesRequireModuleLDS = N->size() > 0;
189*972a253aSDimitry Andric 
190*972a253aSDimitry Andric           if (CalleesRequireModuleLDS) {
191*972a253aSDimitry Andric             // If a function this kernel might call requires module LDS,
192*972a253aSDimitry Andric             // annotate the kernel to let later passes know it will allocate
193*972a253aSDimitry Andric             // this structure, even if not apparent from the IR.
194*972a253aSDimitry Andric             markUsedByKernel(Builder, &Func, SGV);
195*972a253aSDimitry Andric           } else {
196*972a253aSDimitry Andric             // However if we are certain this kernel cannot call a function that
197*972a253aSDimitry Andric             // requires module LDS, annotate the kernel so the backend can elide
198*972a253aSDimitry Andric             // the allocation without repeating callgraph walks.
199*972a253aSDimitry Andric             Func.addFnAttr("amdgpu-elide-module-lds");
200*972a253aSDimitry Andric           }
201*972a253aSDimitry Andric         }
202*972a253aSDimitry Andric       }
203*972a253aSDimitry Andric 
204*972a253aSDimitry Andric       Changed = true;
205*972a253aSDimitry Andric     }
206*972a253aSDimitry Andric 
207*972a253aSDimitry Andric     // Move variables used by kernels into per-kernel instances
208fe6060f1SDimitry Andric     for (Function &F : M.functions()) {
209349cc55cSDimitry Andric       if (F.isDeclaration())
210349cc55cSDimitry Andric         continue;
211349cc55cSDimitry Andric 
212fe6060f1SDimitry Andric       // Only lower compute kernels' LDS.
213fe6060f1SDimitry Andric       if (!AMDGPU::isKernel(F.getCallingConv()))
214fe6060f1SDimitry Andric         continue;
215*972a253aSDimitry Andric 
216fcaf7f86SDimitry Andric       std::vector<GlobalVariable *> KernelUsedVariables =
217fcaf7f86SDimitry Andric           AMDGPU::findVariablesToLower(M, &F);
218*972a253aSDimitry Andric 
219*972a253aSDimitry Andric       // Replace all constant uses with instructions if they belong to the
220*972a253aSDimitry Andric       // current kernel. Unnecessary, removing will cause test churn.
221*972a253aSDimitry Andric       for (size_t I = 0; I < KernelUsedVariables.size(); I++) {
222*972a253aSDimitry Andric         GlobalVariable *GV = KernelUsedVariables[I];
223*972a253aSDimitry Andric         for (User *U : make_early_inc_range(GV->users())) {
224*972a253aSDimitry Andric           if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
225*972a253aSDimitry Andric             AMDGPU::replaceConstantUsesInFunction(C, &F);
226*972a253aSDimitry Andric         }
227*972a253aSDimitry Andric         GV->removeDeadConstantUsers();
228*972a253aSDimitry Andric       }
229*972a253aSDimitry Andric 
230*972a253aSDimitry Andric       if (!KernelUsedVariables.empty()) {
231*972a253aSDimitry Andric         std::string VarName =
232*972a253aSDimitry Andric             (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
233*972a253aSDimitry Andric         GlobalVariable *SGV;
234*972a253aSDimitry Andric         DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
235*972a253aSDimitry Andric         std::tie(SGV, LDSVarsToConstantGEP) =
236*972a253aSDimitry Andric             createLDSVariableReplacement(M, VarName, KernelUsedVariables);
237*972a253aSDimitry Andric 
238*972a253aSDimitry Andric         removeFromUsedLists(M, KernelUsedVariables);
239*972a253aSDimitry Andric         replaceLDSVariablesWithStruct(
240*972a253aSDimitry Andric             M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) {
241*972a253aSDimitry Andric               Instruction *I = dyn_cast<Instruction>(U.getUser());
242*972a253aSDimitry Andric               return I && I->getFunction() == &F;
243*972a253aSDimitry Andric             });
244*972a253aSDimitry Andric         Changed = true;
245*972a253aSDimitry Andric       }
246fe6060f1SDimitry Andric     }
247fe6060f1SDimitry Andric 
248fe6060f1SDimitry Andric     return Changed;
249fe6060f1SDimitry Andric   }
250fe6060f1SDimitry Andric 
251fe6060f1SDimitry Andric private:
252fe6060f1SDimitry Andric   // Increase the alignment of LDS globals if necessary to maximise the chance
253fe6060f1SDimitry Andric   // that we can use aligned LDS instructions to access them.
2540eae32dcSDimitry Andric   static bool superAlignLDSGlobals(Module &M) {
2550eae32dcSDimitry Andric     const DataLayout &DL = M.getDataLayout();
2560eae32dcSDimitry Andric     bool Changed = false;
2570eae32dcSDimitry Andric     if (!SuperAlignLDSGlobals) {
2580eae32dcSDimitry Andric       return Changed;
2590eae32dcSDimitry Andric     }
2600eae32dcSDimitry Andric 
2610eae32dcSDimitry Andric     for (auto &GV : M.globals()) {
2620eae32dcSDimitry Andric       if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
2630eae32dcSDimitry Andric         // Only changing alignment of LDS variables
2640eae32dcSDimitry Andric         continue;
2650eae32dcSDimitry Andric       }
2660eae32dcSDimitry Andric       if (!GV.hasInitializer()) {
2670eae32dcSDimitry Andric         // cuda/hip extern __shared__ variable, leave alignment alone
2680eae32dcSDimitry Andric         continue;
2690eae32dcSDimitry Andric       }
2700eae32dcSDimitry Andric 
2710eae32dcSDimitry Andric       Align Alignment = AMDGPU::getAlign(DL, &GV);
2720eae32dcSDimitry Andric       TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
273fe6060f1SDimitry Andric 
274fe6060f1SDimitry Andric       if (GVSize > 8) {
275fe6060f1SDimitry Andric         // We might want to use a b96 or b128 load/store
276fe6060f1SDimitry Andric         Alignment = std::max(Alignment, Align(16));
277fe6060f1SDimitry Andric       } else if (GVSize > 4) {
278fe6060f1SDimitry Andric         // We might want to use a b64 load/store
279fe6060f1SDimitry Andric         Alignment = std::max(Alignment, Align(8));
280fe6060f1SDimitry Andric       } else if (GVSize > 2) {
281fe6060f1SDimitry Andric         // We might want to use a b32 load/store
282fe6060f1SDimitry Andric         Alignment = std::max(Alignment, Align(4));
283fe6060f1SDimitry Andric       } else if (GVSize > 1) {
284fe6060f1SDimitry Andric         // We might want to use a b16 load/store
285fe6060f1SDimitry Andric         Alignment = std::max(Alignment, Align(2));
286fe6060f1SDimitry Andric       }
287fe6060f1SDimitry Andric 
2880eae32dcSDimitry Andric       if (Alignment != AMDGPU::getAlign(DL, &GV)) {
2890eae32dcSDimitry Andric         Changed = true;
2900eae32dcSDimitry Andric         GV.setAlignment(Alignment);
291fe6060f1SDimitry Andric       }
292fe6060f1SDimitry Andric     }
2930eae32dcSDimitry Andric     return Changed;
2940eae32dcSDimitry Andric   }
2950eae32dcSDimitry Andric 
296*972a253aSDimitry Andric   std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>>
297*972a253aSDimitry Andric   createLDSVariableReplacement(
298*972a253aSDimitry Andric       Module &M, std::string VarName,
299*972a253aSDimitry Andric       std::vector<GlobalVariable *> const &LDSVarsToTransform) {
300*972a253aSDimitry Andric     // Create a struct instance containing LDSVarsToTransform and map from those
301*972a253aSDimitry Andric     // variables to ConstantExprGEP
302*972a253aSDimitry Andric     // Variables may be introduced to meet alignment requirements. No aliasing
303*972a253aSDimitry Andric     // metadata is useful for these as they have no uses. Erased before return.
304*972a253aSDimitry Andric 
3050eae32dcSDimitry Andric     LLVMContext &Ctx = M.getContext();
3060eae32dcSDimitry Andric     const DataLayout &DL = M.getDataLayout();
307*972a253aSDimitry Andric     assert(!LDSVarsToTransform.empty());
308fe6060f1SDimitry Andric 
309fe6060f1SDimitry Andric     SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
310fcaf7f86SDimitry Andric     LayoutFields.reserve(LDSVarsToTransform.size());
311fcaf7f86SDimitry Andric     for (GlobalVariable *GV : LDSVarsToTransform) {
312fe6060f1SDimitry Andric       OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
313fe6060f1SDimitry Andric                                    AMDGPU::getAlign(DL, GV));
314fe6060f1SDimitry Andric       LayoutFields.emplace_back(F);
315fe6060f1SDimitry Andric     }
316fe6060f1SDimitry Andric 
317fe6060f1SDimitry Andric     performOptimizedStructLayout(LayoutFields);
318fe6060f1SDimitry Andric 
319fe6060f1SDimitry Andric     std::vector<GlobalVariable *> LocalVars;
320*972a253aSDimitry Andric     BitVector IsPaddingField;
321fcaf7f86SDimitry Andric     LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
322*972a253aSDimitry Andric     IsPaddingField.reserve(LDSVarsToTransform.size());
323fe6060f1SDimitry Andric     {
324fe6060f1SDimitry Andric       uint64_t CurrentOffset = 0;
325fe6060f1SDimitry Andric       for (size_t I = 0; I < LayoutFields.size(); I++) {
326fe6060f1SDimitry Andric         GlobalVariable *FGV = static_cast<GlobalVariable *>(
327fe6060f1SDimitry Andric             const_cast<void *>(LayoutFields[I].Id));
328fe6060f1SDimitry Andric         Align DataAlign = LayoutFields[I].Alignment;
329fe6060f1SDimitry Andric 
330fe6060f1SDimitry Andric         uint64_t DataAlignV = DataAlign.value();
331fe6060f1SDimitry Andric         if (uint64_t Rem = CurrentOffset % DataAlignV) {
332fe6060f1SDimitry Andric           uint64_t Padding = DataAlignV - Rem;
333fe6060f1SDimitry Andric 
334fe6060f1SDimitry Andric           // Append an array of padding bytes to meet alignment requested
335fe6060f1SDimitry Andric           // Note (o +      (a - (o % a)) ) % a == 0
336fe6060f1SDimitry Andric           //      (offset + Padding       ) % align == 0
337fe6060f1SDimitry Andric 
338fe6060f1SDimitry Andric           Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
339fe6060f1SDimitry Andric           LocalVars.push_back(new GlobalVariable(
340fe6060f1SDimitry Andric               M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
341fe6060f1SDimitry Andric               "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
342fe6060f1SDimitry Andric               false));
343*972a253aSDimitry Andric           IsPaddingField.push_back(true);
344fe6060f1SDimitry Andric           CurrentOffset += Padding;
345fe6060f1SDimitry Andric         }
346fe6060f1SDimitry Andric 
347fe6060f1SDimitry Andric         LocalVars.push_back(FGV);
348*972a253aSDimitry Andric         IsPaddingField.push_back(false);
349fe6060f1SDimitry Andric         CurrentOffset += LayoutFields[I].Size;
350fe6060f1SDimitry Andric       }
351fe6060f1SDimitry Andric     }
352fe6060f1SDimitry Andric 
353fe6060f1SDimitry Andric     std::vector<Type *> LocalVarTypes;
354fe6060f1SDimitry Andric     LocalVarTypes.reserve(LocalVars.size());
355fe6060f1SDimitry Andric     std::transform(
356fe6060f1SDimitry Andric         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
357fe6060f1SDimitry Andric         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
358fe6060f1SDimitry Andric 
359fe6060f1SDimitry Andric     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
360fe6060f1SDimitry Andric 
361fe6060f1SDimitry Andric     Align StructAlign =
362fe6060f1SDimitry Andric         AMDGPU::getAlign(DL, LocalVars[0]);
363fe6060f1SDimitry Andric 
364fe6060f1SDimitry Andric     GlobalVariable *SGV = new GlobalVariable(
365fe6060f1SDimitry Andric         M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
366fe6060f1SDimitry Andric         VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
367fe6060f1SDimitry Andric         false);
368fe6060f1SDimitry Andric     SGV->setAlignment(StructAlign);
369*972a253aSDimitry Andric 
370*972a253aSDimitry Andric     DenseMap<GlobalVariable *, Constant *> Map;
371*972a253aSDimitry Andric     Type *I32 = Type::getInt32Ty(Ctx);
372*972a253aSDimitry Andric     for (size_t I = 0; I < LocalVars.size(); I++) {
373*972a253aSDimitry Andric       GlobalVariable *GV = LocalVars[I];
374*972a253aSDimitry Andric       Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
375*972a253aSDimitry Andric       Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
376*972a253aSDimitry Andric       if (IsPaddingField[I]) {
377*972a253aSDimitry Andric         assert(GV->use_empty());
378*972a253aSDimitry Andric         GV->eraseFromParent();
379*972a253aSDimitry Andric       } else {
380*972a253aSDimitry Andric         Map[GV] = GEP;
381*972a253aSDimitry Andric       }
382*972a253aSDimitry Andric     }
383*972a253aSDimitry Andric     assert(Map.size() == LDSVarsToTransform.size());
384*972a253aSDimitry Andric     return {SGV, std::move(Map)};
385fe6060f1SDimitry Andric   }
386fe6060f1SDimitry Andric 
387*972a253aSDimitry Andric   template <typename PredicateTy>
388*972a253aSDimitry Andric   void replaceLDSVariablesWithStruct(
389*972a253aSDimitry Andric       Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform,
390*972a253aSDimitry Andric       GlobalVariable *SGV,
391*972a253aSDimitry Andric       DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP,
392*972a253aSDimitry Andric       PredicateTy Predicate) {
393*972a253aSDimitry Andric     LLVMContext &Ctx = M.getContext();
394*972a253aSDimitry Andric     const DataLayout &DL = M.getDataLayout();
395fe6060f1SDimitry Andric 
396349cc55cSDimitry Andric     // Create alias.scope and their lists. Each field in the new structure
397349cc55cSDimitry Andric     // does not alias with all other fields.
398349cc55cSDimitry Andric     SmallVector<MDNode *> AliasScopes;
399349cc55cSDimitry Andric     SmallVector<Metadata *> NoAliasList;
400*972a253aSDimitry Andric     const size_t NumberVars = LDSVarsToTransform.size();
401*972a253aSDimitry Andric     if (NumberVars > 1) {
402349cc55cSDimitry Andric       MDBuilder MDB(Ctx);
403*972a253aSDimitry Andric       AliasScopes.reserve(NumberVars);
404349cc55cSDimitry Andric       MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
405*972a253aSDimitry Andric       for (size_t I = 0; I < NumberVars; I++) {
406349cc55cSDimitry Andric         MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
407349cc55cSDimitry Andric         AliasScopes.push_back(Scope);
408349cc55cSDimitry Andric       }
409349cc55cSDimitry Andric       NoAliasList.append(&AliasScopes[1], AliasScopes.end());
410349cc55cSDimitry Andric     }
411349cc55cSDimitry Andric 
412*972a253aSDimitry Andric     // Replace uses of ith variable with a constantexpr to the corresponding
413*972a253aSDimitry Andric     // field of the instance that will be allocated by AMDGPUMachineFunction
414*972a253aSDimitry Andric     for (size_t I = 0; I < NumberVars; I++) {
415*972a253aSDimitry Andric       GlobalVariable *GV = LDSVarsToTransform[I];
416*972a253aSDimitry Andric       Constant *GEP = LDSVarsToConstantGEP[GV];
417fe6060f1SDimitry Andric 
418*972a253aSDimitry Andric       GV->replaceUsesWithIf(GEP, Predicate);
419fe6060f1SDimitry Andric       if (GV->use_empty()) {
420fe6060f1SDimitry Andric         GV->eraseFromParent();
421fe6060f1SDimitry Andric       }
422fe6060f1SDimitry Andric 
423*972a253aSDimitry Andric       APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
424*972a253aSDimitry Andric       GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
425*972a253aSDimitry Andric       uint64_t Offset = APOff.getZExtValue();
426*972a253aSDimitry Andric 
427*972a253aSDimitry Andric       Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset);
428349cc55cSDimitry Andric 
429349cc55cSDimitry Andric       if (I)
430349cc55cSDimitry Andric         NoAliasList[I - 1] = AliasScopes[I - 1];
431349cc55cSDimitry Andric       MDNode *NoAlias =
432349cc55cSDimitry Andric           NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
433349cc55cSDimitry Andric       MDNode *AliasScope =
434349cc55cSDimitry Andric           AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
435349cc55cSDimitry Andric 
436349cc55cSDimitry Andric       refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
437fe6060f1SDimitry Andric     }
438fe6060f1SDimitry Andric   }
439fe6060f1SDimitry Andric 
440349cc55cSDimitry Andric   void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
441349cc55cSDimitry Andric                                 MDNode *AliasScope, MDNode *NoAlias,
442fe6060f1SDimitry Andric                                 unsigned MaxDepth = 5) {
443349cc55cSDimitry Andric     if (!MaxDepth || (A == 1 && !AliasScope))
444fe6060f1SDimitry Andric       return;
445fe6060f1SDimitry Andric 
446fe6060f1SDimitry Andric     for (User *U : Ptr->users()) {
447349cc55cSDimitry Andric       if (auto *I = dyn_cast<Instruction>(U)) {
448349cc55cSDimitry Andric         if (AliasScope && I->mayReadOrWriteMemory()) {
449349cc55cSDimitry Andric           MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
450349cc55cSDimitry Andric           AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
451349cc55cSDimitry Andric                    : AliasScope);
452349cc55cSDimitry Andric           I->setMetadata(LLVMContext::MD_alias_scope, AS);
453349cc55cSDimitry Andric 
454349cc55cSDimitry Andric           MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
455349cc55cSDimitry Andric           NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
456349cc55cSDimitry Andric           I->setMetadata(LLVMContext::MD_noalias, NA);
457349cc55cSDimitry Andric         }
458349cc55cSDimitry Andric       }
459349cc55cSDimitry Andric 
460fe6060f1SDimitry Andric       if (auto *LI = dyn_cast<LoadInst>(U)) {
461fe6060f1SDimitry Andric         LI->setAlignment(std::max(A, LI->getAlign()));
462fe6060f1SDimitry Andric         continue;
463fe6060f1SDimitry Andric       }
464fe6060f1SDimitry Andric       if (auto *SI = dyn_cast<StoreInst>(U)) {
465fe6060f1SDimitry Andric         if (SI->getPointerOperand() == Ptr)
466fe6060f1SDimitry Andric           SI->setAlignment(std::max(A, SI->getAlign()));
467fe6060f1SDimitry Andric         continue;
468fe6060f1SDimitry Andric       }
469fe6060f1SDimitry Andric       if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
470fe6060f1SDimitry Andric         // None of atomicrmw operations can work on pointers, but let's
471fe6060f1SDimitry Andric         // check it anyway in case it will or we will process ConstantExpr.
472fe6060f1SDimitry Andric         if (AI->getPointerOperand() == Ptr)
473fe6060f1SDimitry Andric           AI->setAlignment(std::max(A, AI->getAlign()));
474fe6060f1SDimitry Andric         continue;
475fe6060f1SDimitry Andric       }
476fe6060f1SDimitry Andric       if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
477fe6060f1SDimitry Andric         if (AI->getPointerOperand() == Ptr)
478fe6060f1SDimitry Andric           AI->setAlignment(std::max(A, AI->getAlign()));
479fe6060f1SDimitry Andric         continue;
480fe6060f1SDimitry Andric       }
481fe6060f1SDimitry Andric       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
482fe6060f1SDimitry Andric         unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
483fe6060f1SDimitry Andric         APInt Off(BitWidth, 0);
484349cc55cSDimitry Andric         if (GEP->getPointerOperand() == Ptr) {
485349cc55cSDimitry Andric           Align GA;
486349cc55cSDimitry Andric           if (GEP->accumulateConstantOffset(DL, Off))
487349cc55cSDimitry Andric             GA = commonAlignment(A, Off.getLimitedValue());
488349cc55cSDimitry Andric           refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
489349cc55cSDimitry Andric                                    MaxDepth - 1);
490fe6060f1SDimitry Andric         }
491fe6060f1SDimitry Andric         continue;
492fe6060f1SDimitry Andric       }
493fe6060f1SDimitry Andric       if (auto *I = dyn_cast<Instruction>(U)) {
494fe6060f1SDimitry Andric         if (I->getOpcode() == Instruction::BitCast ||
495fe6060f1SDimitry Andric             I->getOpcode() == Instruction::AddrSpaceCast)
496349cc55cSDimitry Andric           refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
497fe6060f1SDimitry Andric       }
498fe6060f1SDimitry Andric     }
499fe6060f1SDimitry Andric   }
500fe6060f1SDimitry Andric };
501fe6060f1SDimitry Andric 
502fe6060f1SDimitry Andric } // namespace
503fe6060f1SDimitry Andric char AMDGPULowerModuleLDS::ID = 0;
504fe6060f1SDimitry Andric 
505fe6060f1SDimitry Andric char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
506fe6060f1SDimitry Andric 
507fe6060f1SDimitry Andric INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
508fe6060f1SDimitry Andric                 "Lower uses of LDS variables from non-kernel functions", false,
509fe6060f1SDimitry Andric                 false)
510fe6060f1SDimitry Andric 
511fe6060f1SDimitry Andric ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
512fe6060f1SDimitry Andric   return new AMDGPULowerModuleLDS();
513fe6060f1SDimitry Andric }
514fe6060f1SDimitry Andric 
515fe6060f1SDimitry Andric PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
516fe6060f1SDimitry Andric                                                 ModuleAnalysisManager &) {
517fe6060f1SDimitry Andric   return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
518fe6060f1SDimitry Andric                                                : PreservedAnalyses::all();
519fe6060f1SDimitry Andric }
520