xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
10b57cec5SDimitry Andric //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
906c3fb27SDimitry Andric // Eliminates allocas by either converting them into vectors or by migrating
1006c3fb27SDimitry Andric // them to the local address space (LDS).
1106c3fb27SDimitry Andric //
1206c3fb27SDimitry Andric // Two passes are exposed by this file:
1306c3fb27SDimitry Andric //    - "promote-alloca-to-vector", which runs early in the pipeline and only
1406c3fb27SDimitry Andric //      promotes to vector. Promotion to vector is almost always profitable
1506c3fb27SDimitry Andric //      except when the alloca is too big and the promotion would result in
1606c3fb27SDimitry Andric //      very high register pressure.
1706c3fb27SDimitry Andric //    - "promote-alloca", which does both promotion to vector and to LDS and
1806c3fb27SDimitry Andric //      runs much later in the pipeline. It runs after SROA because promoting
1906c3fb27SDimitry Andric //      to LDS is less profitable than getting rid of the alloca or
2006c3fb27SDimitry Andric //      vectorizing it, so we only want to do it when the only alternative is
2106c3fb27SDimitry Andric //      lowering the alloca to the stack.
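//
//      An illustrative sketch (assumed IR, not taken from this file) of what
//      promotion to vector does: a private array that is only accessed at
//      element granularity, e.g.
//
//        %buf = alloca [4 x float], addrspace(5)
//        %p   = getelementptr [4 x float], ptr addrspace(5) %buf, i32 0, i32 %i
//        store float %v, ptr addrspace(5) %p
//        %r   = load float, ptr addrspace(5) %p
//
//      is rewritten into <4 x float> SSA values kept in VGPRs, with the loads
//      and stores replaced by extractelement/insertelement at index %i.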
2206c3fb27SDimitry Andric //
2306c3fb27SDimitry Andric // Note that both of them exist for the old and new PMs. The new PM passes are
2406c3fb27SDimitry Andric // declared in AMDGPU.h and the legacy PM ones are declared here.
250b57cec5SDimitry Andric //
260b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
270b57cec5SDimitry Andric 
280b57cec5SDimitry Andric #include "AMDGPU.h"
29e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
301fd87a68SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
3106c3fb27SDimitry Andric #include "llvm/ADT/STLExtras.h"
320b57cec5SDimitry Andric #include "llvm/Analysis/CaptureTracking.h"
3306c3fb27SDimitry Andric #include "llvm/Analysis/InstSimplifyFolder.h"
3406c3fb27SDimitry Andric #include "llvm/Analysis/InstructionSimplify.h"
350b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
360b57cec5SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
370b57cec5SDimitry Andric #include "llvm/IR/IRBuilder.h"
381fd87a68SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
39480093f4SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
40480093f4SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
4106c3fb27SDimitry Andric #include "llvm/IR/PatternMatch.h"
420b57cec5SDimitry Andric #include "llvm/Pass.h"
430b57cec5SDimitry Andric #include "llvm/Target/TargetMachine.h"
4406c3fb27SDimitry Andric #include "llvm/Transforms/Utils/SSAUpdater.h"
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-promote-alloca"
470b57cec5SDimitry Andric 
480b57cec5SDimitry Andric using namespace llvm;
490b57cec5SDimitry Andric 
500b57cec5SDimitry Andric namespace {
510b57cec5SDimitry Andric 
5206c3fb27SDimitry Andric static cl::opt<bool>
5306c3fb27SDimitry Andric     DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
540b57cec5SDimitry Andric                                  cl::desc("Disable promote alloca to vector"),
550b57cec5SDimitry Andric                                  cl::init(false));
560b57cec5SDimitry Andric 
5706c3fb27SDimitry Andric static cl::opt<bool>
5806c3fb27SDimitry Andric     DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
590b57cec5SDimitry Andric                               cl::desc("Disable promote alloca to LDS"),
600b57cec5SDimitry Andric                               cl::init(false));
610b57cec5SDimitry Andric 
625ffd83dbSDimitry Andric static cl::opt<unsigned> PromoteAllocaToVectorLimit(
635ffd83dbSDimitry Andric     "amdgpu-promote-alloca-to-vector-limit",
645ffd83dbSDimitry Andric     cl::desc("Maximum byte size to consider promote alloca to vector"),
655ffd83dbSDimitry Andric     cl::init(0));
665ffd83dbSDimitry Andric 
6706c3fb27SDimitry Andric // Shared implementation which can do both promotion to vector and to LDS.
68e8d8bef9SDimitry Andric class AMDGPUPromoteAllocaImpl {
690b57cec5SDimitry Andric private:
70e8d8bef9SDimitry Andric   const TargetMachine &TM;
710b57cec5SDimitry Andric   Module *Mod = nullptr;
720b57cec5SDimitry Andric   const DataLayout *DL = nullptr;
730b57cec5SDimitry Andric 
740b57cec5SDimitry Andric   // FIXME: This should be per-kernel.
750b57cec5SDimitry Andric   uint32_t LocalMemLimit = 0;
760b57cec5SDimitry Andric   uint32_t CurrentLocalMemUsage = 0;
775ffd83dbSDimitry Andric   unsigned MaxVGPRs;
780b57cec5SDimitry Andric 
790b57cec5SDimitry Andric   bool IsAMDGCN = false;
800b57cec5SDimitry Andric   bool IsAMDHSA = false;
810b57cec5SDimitry Andric 
820b57cec5SDimitry Andric   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
830b57cec5SDimitry Andric   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
840b57cec5SDimitry Andric 
850b57cec5SDimitry Andric   /// BaseAlloca is the alloca root the search started from.
860b57cec5SDimitry Andric   /// Val may be that alloca or a recursive user of it.
8706c3fb27SDimitry Andric   bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
880b57cec5SDimitry Andric                                std::vector<Value *> &WorkList) const;
890b57cec5SDimitry Andric 
900b57cec5SDimitry Andric   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
910b57cec5SDimitry Andric   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
920b57cec5SDimitry Andric   /// Returns true if both operands are derived from the same alloca. Val should
930b57cec5SDimitry Andric   /// be the same value as one of the input operands of UseInst.
940b57cec5SDimitry Andric   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
9506c3fb27SDimitry Andric                                        Instruction *UseInst, int OpIdx0,
9606c3fb27SDimitry Andric                                        int OpIdx1) const;
970b57cec5SDimitry Andric 
980b57cec5SDimitry Andric   /// Check whether we have enough local memory for promotion.
990b57cec5SDimitry Andric   bool hasSufficientLocalMem(const Function &F);
1000b57cec5SDimitry Andric 
10106c3fb27SDimitry Andric   bool tryPromoteAllocaToVector(AllocaInst &I);
10206c3fb27SDimitry Andric   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
1030b57cec5SDimitry Andric 
104e8d8bef9SDimitry Andric public:
10506c3fb27SDimitry Andric   AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
10606c3fb27SDimitry Andric     const Triple &TT = TM.getTargetTriple();
10706c3fb27SDimitry Andric     IsAMDGCN = TT.getArch() == Triple::amdgcn;
10806c3fb27SDimitry Andric     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
10906c3fb27SDimitry Andric   }
11006c3fb27SDimitry Andric 
11106c3fb27SDimitry Andric   bool run(Function &F, bool PromoteToLDS);
11206c3fb27SDimitry Andric };
11306c3fb27SDimitry Andric 
11406c3fb27SDimitry Andric // FIXME: This can create globals so should be a module pass.
11506c3fb27SDimitry Andric class AMDGPUPromoteAlloca : public FunctionPass {
11606c3fb27SDimitry Andric public:
11706c3fb27SDimitry Andric   static char ID;
11806c3fb27SDimitry Andric 
11906c3fb27SDimitry Andric   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
12006c3fb27SDimitry Andric 
12106c3fb27SDimitry Andric   bool runOnFunction(Function &F) override {
12206c3fb27SDimitry Andric     if (skipFunction(F))
12306c3fb27SDimitry Andric       return false;
12406c3fb27SDimitry Andric     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
12506c3fb27SDimitry Andric       return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
12606c3fb27SDimitry Andric           .run(F, /*PromoteToLDS*/ true);
12706c3fb27SDimitry Andric     return false;
12806c3fb27SDimitry Andric   }
12906c3fb27SDimitry Andric 
13006c3fb27SDimitry Andric   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
13106c3fb27SDimitry Andric 
13206c3fb27SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
13306c3fb27SDimitry Andric     AU.setPreservesCFG();
13406c3fb27SDimitry Andric     FunctionPass::getAnalysisUsage(AU);
13506c3fb27SDimitry Andric   }
1360b57cec5SDimitry Andric };
1370b57cec5SDimitry Andric 
1385ffd83dbSDimitry Andric class AMDGPUPromoteAllocaToVector : public FunctionPass {
1395ffd83dbSDimitry Andric public:
1405ffd83dbSDimitry Andric   static char ID;
1415ffd83dbSDimitry Andric 
1425ffd83dbSDimitry Andric   AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
1435ffd83dbSDimitry Andric 
14406c3fb27SDimitry Andric   bool runOnFunction(Function &F) override {
14506c3fb27SDimitry Andric     if (skipFunction(F))
14606c3fb27SDimitry Andric       return false;
14706c3fb27SDimitry Andric     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
14806c3fb27SDimitry Andric       return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
14906c3fb27SDimitry Andric           .run(F, /*PromoteToLDS*/ false);
15006c3fb27SDimitry Andric     return false;
15106c3fb27SDimitry Andric   }
1525ffd83dbSDimitry Andric 
1535ffd83dbSDimitry Andric   StringRef getPassName() const override {
1545ffd83dbSDimitry Andric     return "AMDGPU Promote Alloca to vector";
1555ffd83dbSDimitry Andric   }
1565ffd83dbSDimitry Andric 
1575ffd83dbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
1585ffd83dbSDimitry Andric     AU.setPreservesCFG();
1595ffd83dbSDimitry Andric     FunctionPass::getAnalysisUsage(AU);
1605ffd83dbSDimitry Andric   }
1615ffd83dbSDimitry Andric };
1625ffd83dbSDimitry Andric 
16306c3fb27SDimitry Andric unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
16406c3fb27SDimitry Andric   if (!TM.getTargetTriple().isAMDGCN())
16506c3fb27SDimitry Andric     return 128;
16606c3fb27SDimitry Andric 
16706c3fb27SDimitry Andric   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
16806c3fb27SDimitry Andric   unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
16906c3fb27SDimitry Andric 
17006c3fb27SDimitry Andric   // A non-entry function has only 32 caller-preserved registers.
17106c3fb27SDimitry Andric   // Do not promote an alloca if that would force spilling, unless we know the
17206c3fb27SDimitry Andric   // function will be inlined.
17306c3fb27SDimitry Andric   if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
17406c3fb27SDimitry Andric       !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
17506c3fb27SDimitry Andric     MaxVGPRs = std::min(MaxVGPRs, 32u);
17606c3fb27SDimitry Andric   return MaxVGPRs;
17706c3fb27SDimitry Andric }
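// Illustrative numbers (assumed subtarget values, not asserted by this code):
// if the subtarget allows 256 VGPRs at the function's minimum waves-per-EU, a
// kernel gets MaxVGPRs = 256, while a callable function that is not marked
// always_inline is clamped to 32.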
17806c3fb27SDimitry Andric 
1790b57cec5SDimitry Andric } // end anonymous namespace
1800b57cec5SDimitry Andric 
1810b57cec5SDimitry Andric char AMDGPUPromoteAlloca::ID = 0;
1825ffd83dbSDimitry Andric char AMDGPUPromoteAllocaToVector::ID = 0;
1830b57cec5SDimitry Andric 
184fe6060f1SDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
185fe6060f1SDimitry Andric                       "AMDGPU promote alloca to vector or LDS", false, false)
186fe6060f1SDimitry Andric // Move LDS uses from functions to kernels before promoting allocas so that
187fe6060f1SDimitry Andric // the amount of available LDS can be estimated accurately.
188*5f757f3fSDimitry Andric INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
189fe6060f1SDimitry Andric INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
1900b57cec5SDimitry Andric                     "AMDGPU promote alloca to vector or LDS", false, false)
1910b57cec5SDimitry Andric 
1925ffd83dbSDimitry Andric INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
1935ffd83dbSDimitry Andric                 "AMDGPU promote alloca to vector", false, false)
1945ffd83dbSDimitry Andric 
1950b57cec5SDimitry Andric char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
1965ffd83dbSDimitry Andric char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
1970b57cec5SDimitry Andric 
198e8d8bef9SDimitry Andric PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
199e8d8bef9SDimitry Andric                                                FunctionAnalysisManager &AM) {
20006c3fb27SDimitry Andric   bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
201e8d8bef9SDimitry Andric   if (Changed) {
202e8d8bef9SDimitry Andric     PreservedAnalyses PA;
203e8d8bef9SDimitry Andric     PA.preserveSet<CFGAnalyses>();
204e8d8bef9SDimitry Andric     return PA;
205e8d8bef9SDimitry Andric   }
206e8d8bef9SDimitry Andric   return PreservedAnalyses::all();
207e8d8bef9SDimitry Andric }
208e8d8bef9SDimitry Andric 
20906c3fb27SDimitry Andric PreservedAnalyses
21006c3fb27SDimitry Andric AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
21106c3fb27SDimitry Andric   bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
21206c3fb27SDimitry Andric   if (Changed) {
21306c3fb27SDimitry Andric     PreservedAnalyses PA;
21406c3fb27SDimitry Andric     PA.preserveSet<CFGAnalyses>();
21506c3fb27SDimitry Andric     return PA;
21606c3fb27SDimitry Andric   }
21706c3fb27SDimitry Andric   return PreservedAnalyses::all();
21806c3fb27SDimitry Andric }
21906c3fb27SDimitry Andric 
22006c3fb27SDimitry Andric FunctionPass *llvm::createAMDGPUPromoteAlloca() {
22106c3fb27SDimitry Andric   return new AMDGPUPromoteAlloca();
22206c3fb27SDimitry Andric }
22306c3fb27SDimitry Andric 
22406c3fb27SDimitry Andric FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
22506c3fb27SDimitry Andric   return new AMDGPUPromoteAllocaToVector();
22606c3fb27SDimitry Andric }
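// Illustrative usage sketch (assumed, not from this file): a legacy pass
// manager pipeline would schedule these passes via the factory functions
// above, e.g.
//
//   legacy::PassManager PM;
//   PM.add(llvm::createAMDGPUPromoteAllocaToVector());
//
// while new-PM pipelines use AMDGPUPromoteAllocaPass and
// AMDGPUPromoteAllocaToVectorPass (declared in AMDGPU.h) directly.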
22706c3fb27SDimitry Andric 
22806c3fb27SDimitry Andric bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
229e8d8bef9SDimitry Andric   Mod = F.getParent();
230e8d8bef9SDimitry Andric   DL = &Mod->getDataLayout();
231e8d8bef9SDimitry Andric 
232e8d8bef9SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
2330b57cec5SDimitry Andric   if (!ST.isPromoteAllocaEnabled())
2340b57cec5SDimitry Andric     return false;
2350b57cec5SDimitry Andric 
23606c3fb27SDimitry Andric   MaxVGPRs = getMaxVGPRs(TM, F);
2375ffd83dbSDimitry Andric 
23806c3fb27SDimitry Andric   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
2390b57cec5SDimitry Andric 
2400b57cec5SDimitry Andric   SmallVector<AllocaInst *, 16> Allocas;
24106c3fb27SDimitry Andric   for (Instruction &I : F.getEntryBlock()) {
24206c3fb27SDimitry Andric     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
24306c3fb27SDimitry Andric       // Array allocations are probably not worth handling, since an allocation
24406c3fb27SDimitry Andric       // of the array type is the canonical form.
24506c3fb27SDimitry Andric       if (!AI->isStaticAlloca() || AI->isArrayAllocation())
24606c3fb27SDimitry Andric         continue;
2470b57cec5SDimitry Andric       Allocas.push_back(AI);
2480b57cec5SDimitry Andric     }
24906c3fb27SDimitry Andric   }
2500b57cec5SDimitry Andric 
25106c3fb27SDimitry Andric   bool Changed = false;
2520b57cec5SDimitry Andric   for (AllocaInst *AI : Allocas) {
25306c3fb27SDimitry Andric     if (tryPromoteAllocaToVector(*AI))
25406c3fb27SDimitry Andric       Changed = true;
25506c3fb27SDimitry Andric     else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
2560b57cec5SDimitry Andric       Changed = true;
2570b57cec5SDimitry Andric   }
2580b57cec5SDimitry Andric 
25906c3fb27SDimitry Andric   // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
26006c3fb27SDimitry Andric   // dangling pointers. If we want to reuse it past this point, the loop above
26106c3fb27SDimitry Andric   // would need to be updated to remove successfully promoted allocas.
26206c3fb27SDimitry Andric 
2630b57cec5SDimitry Andric   return Changed;
2640b57cec5SDimitry Andric }
2650b57cec5SDimitry Andric 
26606c3fb27SDimitry Andric struct MemTransferInfo {
26706c3fb27SDimitry Andric   ConstantInt *SrcIndex = nullptr;
26806c3fb27SDimitry Andric   ConstantInt *DestIndex = nullptr;
26906c3fb27SDimitry Andric };
27006c3fb27SDimitry Andric 
27106c3fb27SDimitry Andric // Checks if the instruction I is a memset user of the alloca AI that we can
27206c3fb27SDimitry Andric // deal with. Currently, only non-volatile memsets that affect the whole alloca
27306c3fb27SDimitry Andric // are handled.
27406c3fb27SDimitry Andric static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
27506c3fb27SDimitry Andric                               const DataLayout &DL) {
27606c3fb27SDimitry Andric   using namespace PatternMatch;
27706c3fb27SDimitry Andric   // For now we only care about non-volatile memsets that affect the whole type
27806c3fb27SDimitry Andric   // (start at index 0 and fill the whole alloca).
27906c3fb27SDimitry Andric   //
28006c3fb27SDimitry Andric   // TODO: Now that we moved to PromoteAlloca we could handle any memsets
28106c3fb27SDimitry Andric   // (except maybe volatile ones?) - we just need to use shufflevector if it
28206c3fb27SDimitry Andric   // only affects a subset of the vector.
28306c3fb27SDimitry Andric   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
28406c3fb27SDimitry Andric   return I->getOperand(0) == AI &&
28506c3fb27SDimitry Andric          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
28606c3fb27SDimitry Andric }
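// Illustrative example (assumed IR): for "%a = alloca [4 x i32], addrspace(5)"
// (16 bytes), a non-volatile memset of %a with length 16 is supported, while a
// memset of only the first 8 bytes, or one whose destination is a GEP into %a,
// is rejected by the check above.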
28706c3fb27SDimitry Andric 
28806c3fb27SDimitry Andric static Value *
28906c3fb27SDimitry Andric calculateVectorIndex(Value *Ptr,
29006c3fb27SDimitry Andric                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
29106c3fb27SDimitry Andric   auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
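  // A pointer that is not a GEP (i.e. the alloca itself, possibly behind
  // pointer casts) is treated as addressing element 0 of the vector.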
29206c3fb27SDimitry Andric   if (!GEP)
29306c3fb27SDimitry Andric     return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
29406c3fb27SDimitry Andric 
29506c3fb27SDimitry Andric   auto I = GEPIdx.find(GEP);
29606c3fb27SDimitry Andric   assert(I != GEPIdx.end() && "Must have entry for GEP!");
29706c3fb27SDimitry Andric   return I->second;
29806c3fb27SDimitry Andric }
29906c3fb27SDimitry Andric 
30006c3fb27SDimitry Andric static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
30106c3fb27SDimitry Andric                                Type *VecElemTy, const DataLayout &DL) {
30206c3fb27SDimitry Andric   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
30306c3fb27SDimitry Andric   // helper.
30406c3fb27SDimitry Andric   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
30506c3fb27SDimitry Andric   MapVector<Value *, APInt> VarOffsets;
30606c3fb27SDimitry Andric   APInt ConstOffset(BW, 0);
30706c3fb27SDimitry Andric   if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
30806c3fb27SDimitry Andric       !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
30906c3fb27SDimitry Andric     return nullptr;
31006c3fb27SDimitry Andric 
31106c3fb27SDimitry Andric   unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
31206c3fb27SDimitry Andric   if (VarOffsets.size() > 1)
31306c3fb27SDimitry Andric     return nullptr;
31406c3fb27SDimitry Andric 
31506c3fb27SDimitry Andric   if (VarOffsets.size() == 1) {
31606c3fb27SDimitry Andric     // Only handle cases where we don't need to insert extra arithmetic
31706c3fb27SDimitry Andric     // instructions.
31806c3fb27SDimitry Andric     const auto &VarOffset = VarOffsets.front();
31906c3fb27SDimitry Andric     if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
32006c3fb27SDimitry Andric       return nullptr;
32106c3fb27SDimitry Andric     return VarOffset.first;
32206c3fb27SDimitry Andric   }
32306c3fb27SDimitry Andric 
32406c3fb27SDimitry Andric   APInt Quot;
32506c3fb27SDimitry Andric   uint64_t Rem;
32606c3fb27SDimitry Andric   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
32706c3fb27SDimitry Andric   if (Rem != 0)
32806c3fb27SDimitry Andric     return nullptr;
32906c3fb27SDimitry Andric 
33006c3fb27SDimitry Andric   return ConstantInt::get(GEP->getContext(), Quot);
33106c3fb27SDimitry Andric }
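// Illustrative examples (assumed IR, with VecElemTy = i32 so VecElemSize = 4):
//   getelementptr i8,  ptr addrspace(5) %alloca, i64 8  -> vector index 2
//   getelementptr i32, ptr addrspace(5) %alloca, i64 %i -> vector index %i
//   getelementptr i8,  ptr addrspace(5) %alloca, i64 6  -> rejected, 6 is not
//                                                          a multiple of 4
// A GEP mixing a variable offset with a nonzero constant offset is also
// rejected, since lowering it would need extra index arithmetic.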
33206c3fb27SDimitry Andric 
33306c3fb27SDimitry Andric /// Promotes a single user of the alloca to a vector form.
33406c3fb27SDimitry Andric ///
33506c3fb27SDimitry Andric /// \param Inst           Instruction to be promoted.
33606c3fb27SDimitry Andric /// \param DL             Module Data Layout.
33706c3fb27SDimitry Andric /// \param VectorTy       Vectorized Type.
33806c3fb27SDimitry Andric /// \param VecStoreSize   Size of \p VectorTy in bytes.
33906c3fb27SDimitry Andric /// \param ElementSize    Size of \p VectorTy element type in bytes.
34006c3fb27SDimitry Andric /// \param TransferInfo   MemTransferInst info map.
34106c3fb27SDimitry Andric /// \param GEPVectorIdx   GEP -> VectorIdx cache.
34206c3fb27SDimitry Andric /// \param CurVal         Current value of the vector (e.g. last stored value)
34306c3fb27SDimitry Andric /// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
34406c3fb27SDimitry Andric ///              be promoted now. This happens when promoting requires \p
34506c3fb27SDimitry Andric ///              CurVal, but \p CurVal is nullptr.
34606c3fb27SDimitry Andric /// \return the stored value if \p Inst would have written to the alloca, or
34706c3fb27SDimitry Andric ///         nullptr otherwise.
34806c3fb27SDimitry Andric static Value *promoteAllocaUserToVector(
34906c3fb27SDimitry Andric     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
35006c3fb27SDimitry Andric     unsigned VecStoreSize, unsigned ElementSize,
35106c3fb27SDimitry Andric     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
35206c3fb27SDimitry Andric     std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
35306c3fb27SDimitry Andric     SmallVectorImpl<LoadInst *> &DeferredLoads) {
35406c3fb27SDimitry Andric   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
35506c3fb27SDimitry Andric   // to do more folding, especially in the case of vector splats.
35606c3fb27SDimitry Andric   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
35706c3fb27SDimitry Andric                                         InstSimplifyFolder(DL));
35806c3fb27SDimitry Andric   Builder.SetInsertPoint(Inst);
35906c3fb27SDimitry Andric 
36006c3fb27SDimitry Andric   const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
36106c3fb27SDimitry Andric     if (CurVal)
36206c3fb27SDimitry Andric       return CurVal;
36306c3fb27SDimitry Andric 
36406c3fb27SDimitry Andric     // If the current value is not known, insert a dummy load and lower it on
36506c3fb27SDimitry Andric     // the second pass.
36606c3fb27SDimitry Andric     LoadInst *Dummy =
36706c3fb27SDimitry Andric         Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
36806c3fb27SDimitry Andric                            "promotealloca.dummyload");
36906c3fb27SDimitry Andric     DeferredLoads.push_back(Dummy);
37006c3fb27SDimitry Andric     return Dummy;
37106c3fb27SDimitry Andric   };
37206c3fb27SDimitry Andric 
37306c3fb27SDimitry Andric   const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
37406c3fb27SDimitry Andric                                                    Type *PtrTy) -> Value * {
37506c3fb27SDimitry Andric     assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
37606c3fb27SDimitry Andric     const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
37706c3fb27SDimitry Andric     if (!PtrTy->isVectorTy())
37806c3fb27SDimitry Andric       return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
37906c3fb27SDimitry Andric     const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
38006c3fb27SDimitry Andric     // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
38106c3fb27SDimitry Andric     // cast the ptr vector to <2 x i64>.
38206c3fb27SDimitry Andric     assert((Size % NumPtrElts == 0) && "Vector size not divisible");
38306c3fb27SDimitry Andric     Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
38406c3fb27SDimitry Andric     return Builder.CreateBitOrPointerCast(
38506c3fb27SDimitry Andric         Val, FixedVectorType::get(EltTy, NumPtrElts));
38606c3fb27SDimitry Andric   };
38706c3fb27SDimitry Andric 
38806c3fb27SDimitry Andric   Type *VecEltTy = VectorTy->getElementType();
3898a4dda33SDimitry Andric 
39006c3fb27SDimitry Andric   switch (Inst->getOpcode()) {
39106c3fb27SDimitry Andric   case Instruction::Load: {
39206c3fb27SDimitry Andric     // Loads can only be lowered if the value is known.
39306c3fb27SDimitry Andric     if (!CurVal) {
39406c3fb27SDimitry Andric       DeferredLoads.push_back(cast<LoadInst>(Inst));
39506c3fb27SDimitry Andric       return nullptr;
39606c3fb27SDimitry Andric     }
39706c3fb27SDimitry Andric 
39806c3fb27SDimitry Andric     Value *Index = calculateVectorIndex(
39906c3fb27SDimitry Andric         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
40006c3fb27SDimitry Andric 
40106c3fb27SDimitry Andric     // We're loading the full vector.
40206c3fb27SDimitry Andric     Type *AccessTy = Inst->getType();
40306c3fb27SDimitry Andric     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
40406c3fb27SDimitry Andric     if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
40506c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
40606c3fb27SDimitry Andric         CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
40706c3fb27SDimitry Andric       else if (CurVal->getType()->isPtrOrPtrVectorTy())
40806c3fb27SDimitry Andric         CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
40906c3fb27SDimitry Andric       Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
41006c3fb27SDimitry Andric       Inst->replaceAllUsesWith(NewVal);
41106c3fb27SDimitry Andric       return nullptr;
41206c3fb27SDimitry Andric     }
41306c3fb27SDimitry Andric 
41406c3fb27SDimitry Andric     // Loading a subvector.
41506c3fb27SDimitry Andric     if (isa<FixedVectorType>(AccessTy)) {
41606c3fb27SDimitry Andric       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
4178a4dda33SDimitry Andric       const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
4188a4dda33SDimitry Andric       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
41906c3fb27SDimitry Andric       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
42006c3fb27SDimitry Andric 
42106c3fb27SDimitry Andric       Value *SubVec = PoisonValue::get(SubVecTy);
4228a4dda33SDimitry Andric       for (unsigned K = 0; K < NumLoadedElts; ++K) {
423*5f757f3fSDimitry Andric         Value *CurIdx =
424*5f757f3fSDimitry Andric             Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
42506c3fb27SDimitry Andric         SubVec = Builder.CreateInsertElement(
426*5f757f3fSDimitry Andric             SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
42706c3fb27SDimitry Andric       }
42806c3fb27SDimitry Andric 
42906c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
43006c3fb27SDimitry Andric         SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
43106c3fb27SDimitry Andric       else if (SubVecTy->isPtrOrPtrVectorTy())
43206c3fb27SDimitry Andric         SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
43306c3fb27SDimitry Andric 
43406c3fb27SDimitry Andric       SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
43506c3fb27SDimitry Andric       Inst->replaceAllUsesWith(SubVec);
43606c3fb27SDimitry Andric       return nullptr;
43706c3fb27SDimitry Andric     }
43806c3fb27SDimitry Andric 
43906c3fb27SDimitry Andric     // We're loading one element.
44006c3fb27SDimitry Andric     Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
44106c3fb27SDimitry Andric     if (AccessTy != VecEltTy)
44206c3fb27SDimitry Andric       ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
44306c3fb27SDimitry Andric 
44406c3fb27SDimitry Andric     Inst->replaceAllUsesWith(ExtractElement);
44506c3fb27SDimitry Andric     return nullptr;
44606c3fb27SDimitry Andric   }
44706c3fb27SDimitry Andric   case Instruction::Store: {
44806c3fb27SDimitry Andric     // For stores, it's a bit trickier and it depends on whether we're storing
44906c3fb27SDimitry Andric     // the full vector or not. If we're storing the full vector, we don't need
45006c3fb27SDimitry Andric     // to know the current value. If this is a store of a single element, we
45106c3fb27SDimitry Andric     // need to know the value.
45206c3fb27SDimitry Andric     StoreInst *SI = cast<StoreInst>(Inst);
45306c3fb27SDimitry Andric     Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
45406c3fb27SDimitry Andric     Value *Val = SI->getValueOperand();
45506c3fb27SDimitry Andric 
45606c3fb27SDimitry Andric     // We're storing the full vector, we can handle this without knowing CurVal.
45706c3fb27SDimitry Andric     // If we're storing the full vector, we can handle it without knowing CurVal.
45806c3fb27SDimitry Andric     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
45906c3fb27SDimitry Andric     if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
46006c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
46106c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, AccessTy);
46206c3fb27SDimitry Andric       else if (VectorTy->isPtrOrPtrVectorTy())
46306c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, VectorTy);
46406c3fb27SDimitry Andric       return Builder.CreateBitOrPointerCast(Val, VectorTy);
46506c3fb27SDimitry Andric     }
46606c3fb27SDimitry Andric 
46706c3fb27SDimitry Andric     // Storing a subvector.
46806c3fb27SDimitry Andric     if (isa<FixedVectorType>(AccessTy)) {
46906c3fb27SDimitry Andric       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
4708a4dda33SDimitry Andric       const unsigned NumWrittenElts =
4718a4dda33SDimitry Andric           AccessSize / DL.getTypeStoreSize(VecEltTy);
472*5f757f3fSDimitry Andric       const unsigned NumVecElts = VectorTy->getNumElements();
4738a4dda33SDimitry Andric       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
47406c3fb27SDimitry Andric       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
47506c3fb27SDimitry Andric 
47606c3fb27SDimitry Andric       if (SubVecTy->isPtrOrPtrVectorTy())
47706c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, SubVecTy);
47806c3fb27SDimitry Andric       else if (AccessTy->isPtrOrPtrVectorTy())
47906c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, AccessTy);
48006c3fb27SDimitry Andric 
48106c3fb27SDimitry Andric       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
48206c3fb27SDimitry Andric 
48306c3fb27SDimitry Andric       Value *CurVec = GetOrLoadCurrentVectorValue();
484*5f757f3fSDimitry Andric       for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
485*5f757f3fSDimitry Andric            K < NumElts; ++K) {
486*5f757f3fSDimitry Andric         Value *CurIdx =
487*5f757f3fSDimitry Andric             Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
48806c3fb27SDimitry Andric         CurVec = Builder.CreateInsertElement(
489*5f757f3fSDimitry Andric             CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
49006c3fb27SDimitry Andric       }
49106c3fb27SDimitry Andric       return CurVec;
49206c3fb27SDimitry Andric     }
49306c3fb27SDimitry Andric 
49406c3fb27SDimitry Andric     if (Val->getType() != VecEltTy)
49506c3fb27SDimitry Andric       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
49606c3fb27SDimitry Andric     return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
49706c3fb27SDimitry Andric                                        Index);
49806c3fb27SDimitry Andric   }
49906c3fb27SDimitry Andric   case Instruction::Call: {
50006c3fb27SDimitry Andric     if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
50106c3fb27SDimitry Andric       // For memcpy, we need to know the current value (CurVal).
50206c3fb27SDimitry Andric       ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
50306c3fb27SDimitry Andric       unsigned NumCopied = Length->getZExtValue() / ElementSize;
50406c3fb27SDimitry Andric       MemTransferInfo *TI = &TransferInfo[MTI];
50506c3fb27SDimitry Andric       unsigned SrcBegin = TI->SrcIndex->getZExtValue();
50606c3fb27SDimitry Andric       unsigned DestBegin = TI->DestIndex->getZExtValue();
50706c3fb27SDimitry Andric 
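      // Illustrative example (assumed values): in an 8-element vector, copying
      // NumCopied = 2 elements from SrcBegin = 1 to DestBegin = 3 yields the
      // shuffle mask <0, 1, 2, 1, 2, 5, 6, 7>: lanes 3 and 4 take the old
      // lanes 1 and 2, and every other lane keeps its original value.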
50806c3fb27SDimitry Andric       SmallVector<int> Mask;
50906c3fb27SDimitry Andric       for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
51006c3fb27SDimitry Andric         if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
51106c3fb27SDimitry Andric           Mask.push_back(SrcBegin++);
51206c3fb27SDimitry Andric         } else {
51306c3fb27SDimitry Andric           Mask.push_back(Idx);
51406c3fb27SDimitry Andric         }
51506c3fb27SDimitry Andric       }
51606c3fb27SDimitry Andric 
51706c3fb27SDimitry Andric       return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
51806c3fb27SDimitry Andric     }
51906c3fb27SDimitry Andric 
52006c3fb27SDimitry Andric     if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
52106c3fb27SDimitry Andric       // For memset, we don't need to know the previous value because we
52206c3fb27SDimitry Andric       // currently only allow memsets that cover the whole alloca.
52306c3fb27SDimitry Andric       Value *Elt = MSI->getOperand(1);
52406c3fb27SDimitry Andric       if (DL.getTypeStoreSize(VecEltTy) > 1) {
52506c3fb27SDimitry Andric         Value *EltBytes =
52606c3fb27SDimitry Andric             Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
52706c3fb27SDimitry Andric         Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
52806c3fb27SDimitry Andric       }
52906c3fb27SDimitry Andric 
53006c3fb27SDimitry Andric       return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
53106c3fb27SDimitry Andric     }
53206c3fb27SDimitry Andric 
53306c3fb27SDimitry Andric     llvm_unreachable("Unsupported call when promoting alloca to vector");
53406c3fb27SDimitry Andric   }
53506c3fb27SDimitry Andric 
53606c3fb27SDimitry Andric   default:
53706c3fb27SDimitry Andric     llvm_unreachable("Inconsistency in instructions promotable to vector");
53806c3fb27SDimitry Andric   }
53906c3fb27SDimitry Andric 
54006c3fb27SDimitry Andric   llvm_unreachable("Did not return after promoting instruction!");
54106c3fb27SDimitry Andric }
54206c3fb27SDimitry Andric 
54306c3fb27SDimitry Andric static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
54406c3fb27SDimitry Andric                                   const DataLayout &DL) {
54506c3fb27SDimitry Andric   // Access as a vector type can work if the size of the access vector is a
54606c3fb27SDimitry Andric   // multiple of the size of the alloca's vector element type.
54706c3fb27SDimitry Andric   //
54806c3fb27SDimitry Andric   // Examples:
54906c3fb27SDimitry Andric   //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
55006c3fb27SDimitry Andric   //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
55106c3fb27SDimitry Andric   //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
55206c3fb27SDimitry Andric   //        - 3*32 is not a multiple of 64
55306c3fb27SDimitry Andric   //
55406c3fb27SDimitry Andric   // We could handle more complicated cases, but it'd make things a lot more
55506c3fb27SDimitry Andric   // complicated.
55606c3fb27SDimitry Andric   if (isa<FixedVectorType>(AccessTy)) {
55706c3fb27SDimitry Andric     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
55806c3fb27SDimitry Andric     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
55906c3fb27SDimitry Andric     return AccTS.isKnownMultipleOf(VecTS);
56006c3fb27SDimitry Andric   }
56106c3fb27SDimitry Andric 
56206c3fb27SDimitry Andric   return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
56306c3fb27SDimitry Andric                                               DL);
56406c3fb27SDimitry Andric }
56506c3fb27SDimitry Andric 
56606c3fb27SDimitry Andric /// Iterates over an instruction worklist that may contain multiple instructions
56706c3fb27SDimitry Andric /// from the same basic block, visiting them in program order within each block.
56806c3fb27SDimitry Andric template <typename InstContainer>
56906c3fb27SDimitry Andric static void forEachWorkListItem(const InstContainer &WorkList,
57006c3fb27SDimitry Andric                                 std::function<void(Instruction *)> Fn) {
57106c3fb27SDimitry Andric   // Bucket up uses of the alloca by the block they occur in.
57206c3fb27SDimitry Andric   // This is important because we have to handle multiple defs/uses in a block
57306c3fb27SDimitry Andric   // ourselves: SSAUpdater is purely for cross-block references.
57406c3fb27SDimitry Andric   DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
57506c3fb27SDimitry Andric   for (Instruction *User : WorkList)
57606c3fb27SDimitry Andric     UsesByBlock[User->getParent()].insert(User);
57706c3fb27SDimitry Andric 
57806c3fb27SDimitry Andric   for (Instruction *User : WorkList) {
57906c3fb27SDimitry Andric     BasicBlock *BB = User->getParent();
58006c3fb27SDimitry Andric     auto &BlockUses = UsesByBlock[BB];
58106c3fb27SDimitry Andric 
58206c3fb27SDimitry Andric     // Already processed, skip.
58306c3fb27SDimitry Andric     if (BlockUses.empty())
58406c3fb27SDimitry Andric       continue;
58506c3fb27SDimitry Andric 
58606c3fb27SDimitry Andric     // Only user in the block, directly process it.
58706c3fb27SDimitry Andric     if (BlockUses.size() == 1) {
58806c3fb27SDimitry Andric       Fn(User);
58906c3fb27SDimitry Andric       continue;
59006c3fb27SDimitry Andric     }
59106c3fb27SDimitry Andric 
59206c3fb27SDimitry Andric     // Multiple users in the block: do a linear scan to visit them in order.
59306c3fb27SDimitry Andric     for (Instruction &Inst : *BB) {
59406c3fb27SDimitry Andric       if (!BlockUses.contains(&Inst))
59506c3fb27SDimitry Andric         continue;
59606c3fb27SDimitry Andric 
59706c3fb27SDimitry Andric       Fn(&Inst);
59806c3fb27SDimitry Andric     }
59906c3fb27SDimitry Andric 
60006c3fb27SDimitry Andric     // Clear the block so we know it's been processed.
60106c3fb27SDimitry Andric     BlockUses.clear();
60206c3fb27SDimitry Andric   }
60306c3fb27SDimitry Andric }
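// Example of why the ordering matters (illustrative): if a block contains two
// stores to the promoted alloca and the worklist happens to hold the later
// store first, visiting them in worklist order would propagate a stale
// "current vector value"; the linear scan above promotes the earlier store
// first.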
60406c3fb27SDimitry Andric 
60506c3fb27SDimitry Andric // FIXME: Should try to pick the most likely to be profitable allocas first.
60606c3fb27SDimitry Andric bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
60706c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
60806c3fb27SDimitry Andric 
60906c3fb27SDimitry Andric   if (DisablePromoteAllocaToVector) {
61006c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
61106c3fb27SDimitry Andric     return false;
61206c3fb27SDimitry Andric   }
61306c3fb27SDimitry Andric 
61406c3fb27SDimitry Andric   Type *AllocaTy = Alloca.getAllocatedType();
61506c3fb27SDimitry Andric   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
61606c3fb27SDimitry Andric   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
61706c3fb27SDimitry Andric     if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
61806c3fb27SDimitry Andric         ArrayTy->getNumElements() > 0)
61906c3fb27SDimitry Andric       VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
62006c3fb27SDimitry Andric                                       ArrayTy->getNumElements());
62106c3fb27SDimitry Andric   }
62206c3fb27SDimitry Andric 
62306c3fb27SDimitry Andric   // Use up to 1/4 of available register budget for vectorization.
62406c3fb27SDimitry Andric   unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
62506c3fb27SDimitry Andric                                               : (MaxVGPRs * 32);
62606c3fb27SDimitry Andric 
62706c3fb27SDimitry Andric   if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
62806c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with " << MaxVGPRs
62906c3fb27SDimitry Andric                       << " registers available\n");
63006c3fb27SDimitry Andric     return false;
63106c3fb27SDimitry Andric   }
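  // For example (illustrative numbers): with MaxVGPRs = 256 the limit is
  // 256 * 32 = 8192, so any alloca wider than 8192 / 4 = 2048 bits (256 bytes,
  // i.e. 64 VGPRs' worth of data) is rejected here.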
63206c3fb27SDimitry Andric 
63306c3fb27SDimitry Andric   // FIXME: There is no reason why we can't support larger arrays; we
63406c3fb27SDimitry Andric   // are just being conservative for now.
63506c3fb27SDimitry Andric   // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or
63606c3fb27SDimitry Andric   // equivalent. Potentially these could also be promoted, but we don't
63706c3fb27SDimitry Andric   // currently handle this case.
63806c3fb27SDimitry Andric   if (!VectorTy) {
63906c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
64006c3fb27SDimitry Andric     return false;
64106c3fb27SDimitry Andric   }
64206c3fb27SDimitry Andric 
64306c3fb27SDimitry Andric   if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
64406c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  " << *VectorTy
64506c3fb27SDimitry Andric                       << " has an unsupported number of elements\n");
64606c3fb27SDimitry Andric     return false;
64706c3fb27SDimitry Andric   }
64806c3fb27SDimitry Andric 
64906c3fb27SDimitry Andric   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
65006c3fb27SDimitry Andric   SmallVector<Instruction *> WorkList;
65106c3fb27SDimitry Andric   SmallVector<Instruction *> UsersToRemove;
65206c3fb27SDimitry Andric   SmallVector<Instruction *> DeferredInsts;
65306c3fb27SDimitry Andric   SmallVector<Use *, 8> Uses;
65406c3fb27SDimitry Andric   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
65506c3fb27SDimitry Andric 
65606c3fb27SDimitry Andric   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
65706c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
65806c3fb27SDimitry Andric                       << "    " << *Inst << "\n");
65906c3fb27SDimitry Andric     return false;
66006c3fb27SDimitry Andric   };
66106c3fb27SDimitry Andric 
66206c3fb27SDimitry Andric   for (Use &U : Alloca.uses())
66306c3fb27SDimitry Andric     Uses.push_back(&U);
66406c3fb27SDimitry Andric 
66506c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
66606c3fb27SDimitry Andric 
66706c3fb27SDimitry Andric   Type *VecEltTy = VectorTy->getElementType();
66806c3fb27SDimitry Andric   unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
66906c3fb27SDimitry Andric   while (!Uses.empty()) {
67006c3fb27SDimitry Andric     Use *U = Uses.pop_back_val();
67106c3fb27SDimitry Andric     Instruction *Inst = cast<Instruction>(U->getUser());
67206c3fb27SDimitry Andric 
67306c3fb27SDimitry Andric     if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
67406c3fb27SDimitry Andric       // This is a store of the pointer, not to the pointer.
67506c3fb27SDimitry Andric       if (isa<StoreInst>(Inst) &&
67606c3fb27SDimitry Andric           U->getOperandNo() != StoreInst::getPointerOperandIndex())
67706c3fb27SDimitry Andric         return RejectUser(Inst, "pointer is being stored");
67806c3fb27SDimitry Andric 
67906c3fb27SDimitry Andric       Type *AccessTy = getLoadStoreType(Inst);
68006c3fb27SDimitry Andric       if (AccessTy->isAggregateType())
68106c3fb27SDimitry Andric         return RejectUser(Inst, "unsupported load/store as aggregate");
68206c3fb27SDimitry Andric       assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
68306c3fb27SDimitry Andric 
684*5f757f3fSDimitry Andric       // Check that this is a simple access of a vector element.
685*5f757f3fSDimitry Andric       bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
686*5f757f3fSDimitry Andric                                           : cast<StoreInst>(Inst)->isSimple();
687*5f757f3fSDimitry Andric       if (!IsSimple)
688*5f757f3fSDimitry Andric         return RejectUser(Inst, "not a simple load or store");
689*5f757f3fSDimitry Andric 
69006c3fb27SDimitry Andric       Ptr = Ptr->stripPointerCasts();
69106c3fb27SDimitry Andric 
69206c3fb27SDimitry Andric       // Alloca already accessed as vector.
69306c3fb27SDimitry Andric       if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
69406c3fb27SDimitry Andric                                 DL->getTypeStoreSize(AccessTy)) {
69506c3fb27SDimitry Andric         WorkList.push_back(Inst);
69606c3fb27SDimitry Andric         continue;
69706c3fb27SDimitry Andric       }
69806c3fb27SDimitry Andric 
69906c3fb27SDimitry Andric       if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
70006c3fb27SDimitry Andric         return RejectUser(Inst, "not a supported access type");
70106c3fb27SDimitry Andric 
70206c3fb27SDimitry Andric       WorkList.push_back(Inst);
70306c3fb27SDimitry Andric       continue;
70406c3fb27SDimitry Andric     }
70506c3fb27SDimitry Andric 
70606c3fb27SDimitry Andric     if (isa<BitCastInst>(Inst)) {
70706c3fb27SDimitry Andric       // Look through bitcasts.
70806c3fb27SDimitry Andric       for (Use &U : Inst->uses())
70906c3fb27SDimitry Andric         Uses.push_back(&U);
71006c3fb27SDimitry Andric       UsersToRemove.push_back(Inst);
71106c3fb27SDimitry Andric       continue;
71206c3fb27SDimitry Andric     }
71306c3fb27SDimitry Andric 
71406c3fb27SDimitry Andric     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
71506c3fb27SDimitry Andric       // If we can't compute a vector index from this GEP, then we can't
71606c3fb27SDimitry Andric       // promote this alloca to vector.
71706c3fb27SDimitry Andric       Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
71806c3fb27SDimitry Andric       if (!Index)
71906c3fb27SDimitry Andric         return RejectUser(Inst, "cannot compute vector index for GEP");
72006c3fb27SDimitry Andric 
72106c3fb27SDimitry Andric       GEPVectorIdx[GEP] = Index;
72206c3fb27SDimitry Andric       for (Use &U : Inst->uses())
72306c3fb27SDimitry Andric         Uses.push_back(&U);
72406c3fb27SDimitry Andric       UsersToRemove.push_back(Inst);
72506c3fb27SDimitry Andric       continue;
72606c3fb27SDimitry Andric     }
72706c3fb27SDimitry Andric 
72806c3fb27SDimitry Andric     if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
72906c3fb27SDimitry Andric         MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
73006c3fb27SDimitry Andric       WorkList.push_back(Inst);
73106c3fb27SDimitry Andric       continue;
73206c3fb27SDimitry Andric     }
73306c3fb27SDimitry Andric 
73406c3fb27SDimitry Andric     if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
73506c3fb27SDimitry Andric       if (TransferInst->isVolatile())
73606c3fb27SDimitry Andric         return RejectUser(Inst, "mem transfer inst is volatile");
73706c3fb27SDimitry Andric 
73806c3fb27SDimitry Andric       ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
73906c3fb27SDimitry Andric       if (!Len || (Len->getZExtValue() % ElementSize))
74006c3fb27SDimitry Andric         return RejectUser(Inst, "mem transfer inst length is non-constant or "
74106c3fb27SDimitry Andric                                 "not a multiple of the vector element size");
74206c3fb27SDimitry Andric 
74306c3fb27SDimitry Andric       if (!TransferInfo.count(TransferInst)) {
74406c3fb27SDimitry Andric         DeferredInsts.push_back(Inst);
74506c3fb27SDimitry Andric         WorkList.push_back(Inst);
74606c3fb27SDimitry Andric         TransferInfo[TransferInst] = MemTransferInfo();
74706c3fb27SDimitry Andric       }
74806c3fb27SDimitry Andric 
74906c3fb27SDimitry Andric       auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
75006c3fb27SDimitry Andric         GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
75106c3fb27SDimitry Andric         if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
75206c3fb27SDimitry Andric           return nullptr;
75306c3fb27SDimitry Andric 
75406c3fb27SDimitry Andric         return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
75506c3fb27SDimitry Andric       };
75606c3fb27SDimitry Andric 
75706c3fb27SDimitry Andric       unsigned OpNum = U->getOperandNo();
75806c3fb27SDimitry Andric       MemTransferInfo *TI = &TransferInfo[TransferInst];
75906c3fb27SDimitry Andric       if (OpNum == 0) {
76006c3fb27SDimitry Andric         Value *Dest = TransferInst->getDest();
76106c3fb27SDimitry Andric         ConstantInt *Index = getPointerIndexOfAlloca(Dest);
76206c3fb27SDimitry Andric         if (!Index)
76306c3fb27SDimitry Andric           return RejectUser(Inst, "could not calculate constant dest index");
76406c3fb27SDimitry Andric         TI->DestIndex = Index;
76506c3fb27SDimitry Andric       } else {
76606c3fb27SDimitry Andric         assert(OpNum == 1);
76706c3fb27SDimitry Andric         Value *Src = TransferInst->getSource();
76806c3fb27SDimitry Andric         ConstantInt *Index = getPointerIndexOfAlloca(Src);
76906c3fb27SDimitry Andric         if (!Index)
77006c3fb27SDimitry Andric           return RejectUser(Inst, "could not calculate constant src index");
77106c3fb27SDimitry Andric         TI->SrcIndex = Index;
77206c3fb27SDimitry Andric       }
77306c3fb27SDimitry Andric       continue;
77406c3fb27SDimitry Andric     }
77506c3fb27SDimitry Andric 
77606c3fb27SDimitry Andric     // Ignore assume-like intrinsics and comparisons used in assumes.
77706c3fb27SDimitry Andric     if (isAssumeLikeIntrinsic(Inst)) {
77806c3fb27SDimitry Andric       UsersToRemove.push_back(Inst);
77906c3fb27SDimitry Andric       continue;
78006c3fb27SDimitry Andric     }
78106c3fb27SDimitry Andric 
78206c3fb27SDimitry Andric     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
78306c3fb27SDimitry Andric           return isAssumeLikeIntrinsic(cast<Instruction>(U));
78406c3fb27SDimitry Andric         })) {
78506c3fb27SDimitry Andric       UsersToRemove.push_back(Inst);
78606c3fb27SDimitry Andric       continue;
78706c3fb27SDimitry Andric     }
78806c3fb27SDimitry Andric 
78906c3fb27SDimitry Andric     return RejectUser(Inst, "unhandled alloca user");
79006c3fb27SDimitry Andric   }
79106c3fb27SDimitry Andric 
79206c3fb27SDimitry Andric   while (!DeferredInsts.empty()) {
79306c3fb27SDimitry Andric     Instruction *Inst = DeferredInsts.pop_back_val();
79406c3fb27SDimitry Andric     MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
79506c3fb27SDimitry Andric     // TODO: Support the case if the pointers are from different alloca or
79606c3fb27SDimitry Andric     // from different address spaces.
79706c3fb27SDimitry Andric     MemTransferInfo &Info = TransferInfo[TransferInst];
79806c3fb27SDimitry Andric     if (!Info.SrcIndex || !Info.DestIndex)
79906c3fb27SDimitry Andric       return RejectUser(
80006c3fb27SDimitry Andric           Inst, "mem transfer inst is missing constant src and/or dst index");
80106c3fb27SDimitry Andric   }
80206c3fb27SDimitry Andric 
80306c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
80406c3fb27SDimitry Andric                     << *VectorTy << '\n');
80506c3fb27SDimitry Andric   const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
80606c3fb27SDimitry Andric 
80706c3fb27SDimitry Andric   // Alloca is uninitialized memory. Imitate that by making the first value
80806c3fb27SDimitry Andric   // undef.
80906c3fb27SDimitry Andric   SSAUpdater Updater;
81006c3fb27SDimitry Andric   Updater.Initialize(VectorTy, "promotealloca");
81106c3fb27SDimitry Andric   Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
81206c3fb27SDimitry Andric 
81306c3fb27SDimitry Andric   // First handle the initial worklist.
81406c3fb27SDimitry Andric   SmallVector<LoadInst *, 4> DeferredLoads;
81506c3fb27SDimitry Andric   forEachWorkListItem(WorkList, [&](Instruction *I) {
81606c3fb27SDimitry Andric     BasicBlock *BB = I->getParent();
81706c3fb27SDimitry Andric     // On the first pass, we only take values that are trivially known, i.e.
81806c3fb27SDimitry Andric     // where AddAvailableValue was already called in this block.
81906c3fb27SDimitry Andric     Value *Result = promoteAllocaUserToVector(
82006c3fb27SDimitry Andric         I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
82106c3fb27SDimitry Andric         Updater.FindValueForBlock(BB), DeferredLoads);
82206c3fb27SDimitry Andric     if (Result)
82306c3fb27SDimitry Andric       Updater.AddAvailableValue(BB, Result);
82406c3fb27SDimitry Andric   });
82506c3fb27SDimitry Andric 
82606c3fb27SDimitry Andric   // Then handle deferred loads.
82706c3fb27SDimitry Andric   forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
82806c3fb27SDimitry Andric     SmallVector<LoadInst *, 0> NewDLs;
82906c3fb27SDimitry Andric     BasicBlock *BB = I->getParent();
83006c3fb27SDimitry Andric     // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
83106c3fb27SDimitry Andric     // get a value, inserting PHIs as needed.
83206c3fb27SDimitry Andric     Value *Result = promoteAllocaUserToVector(
83306c3fb27SDimitry Andric         I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
83406c3fb27SDimitry Andric         Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
83506c3fb27SDimitry Andric     if (Result)
83606c3fb27SDimitry Andric       Updater.AddAvailableValue(BB, Result);
83706c3fb27SDimitry Andric     assert(NewDLs.empty() && "No more deferred loads should be queued!");
83806c3fb27SDimitry Andric   });
83906c3fb27SDimitry Andric 
84006c3fb27SDimitry Andric   // Delete all instructions. On the first pass, new dummy loads may have been
84106c3fb27SDimitry Andric   // added so we need to collect them too.
84206c3fb27SDimitry Andric   DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
84306c3fb27SDimitry Andric   InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end());
84406c3fb27SDimitry Andric   for (Instruction *I : InstsToDelete) {
84506c3fb27SDimitry Andric     assert(I->use_empty());
84606c3fb27SDimitry Andric     I->eraseFromParent();
84706c3fb27SDimitry Andric   }
84806c3fb27SDimitry Andric 
84906c3fb27SDimitry Andric   // Delete all the users that are known to be removable.
85006c3fb27SDimitry Andric   for (Instruction *I : reverse(UsersToRemove)) {
85106c3fb27SDimitry Andric     I->dropDroppableUses();
85206c3fb27SDimitry Andric     assert(I->use_empty());
85306c3fb27SDimitry Andric     I->eraseFromParent();
85406c3fb27SDimitry Andric   }
85506c3fb27SDimitry Andric 
85606c3fb27SDimitry Andric   // Alloca should now be dead too.
85706c3fb27SDimitry Andric   assert(Alloca.use_empty());
85806c3fb27SDimitry Andric   Alloca.eraseFromParent();
85906c3fb27SDimitry Andric   return true;
86006c3fb27SDimitry Andric }
86106c3fb27SDimitry Andric 
8620b57cec5SDimitry Andric std::pair<Value *, Value *>
863e8d8bef9SDimitry Andric AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
864349cc55cSDimitry Andric   Function &F = *Builder.GetInsertBlock()->getParent();
865e8d8bef9SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
8660b57cec5SDimitry Andric 
8670b57cec5SDimitry Andric   if (!IsAMDHSA) {
86806c3fb27SDimitry Andric     Function *LocalSizeYFn =
86906c3fb27SDimitry Andric         Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
87006c3fb27SDimitry Andric     Function *LocalSizeZFn =
87106c3fb27SDimitry Andric         Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
8720b57cec5SDimitry Andric 
8730b57cec5SDimitry Andric     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
8740b57cec5SDimitry Andric     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
8750b57cec5SDimitry Andric 
8760b57cec5SDimitry Andric     ST.makeLIDRangeMetadata(LocalSizeY);
8770b57cec5SDimitry Andric     ST.makeLIDRangeMetadata(LocalSizeZ);
8780b57cec5SDimitry Andric 
879bdd1243dSDimitry Andric     return std::pair(LocalSizeY, LocalSizeZ);
8800b57cec5SDimitry Andric   }
8810b57cec5SDimitry Andric 
8820b57cec5SDimitry Andric   // We must read the size out of the dispatch pointer.
8830b57cec5SDimitry Andric   assert(IsAMDGCN);
8840b57cec5SDimitry Andric 
8850b57cec5SDimitry Andric   // We are indexing into this struct, and want to extract the workgroup_size_*
8860b57cec5SDimitry Andric   // fields.
8870b57cec5SDimitry Andric   //
8880b57cec5SDimitry Andric   //   typedef struct hsa_kernel_dispatch_packet_s {
8890b57cec5SDimitry Andric   //     uint16_t header;
8900b57cec5SDimitry Andric   //     uint16_t setup;
8910b57cec5SDimitry Andric   //     uint16_t workgroup_size_x;
8920b57cec5SDimitry Andric   //     uint16_t workgroup_size_y;
8930b57cec5SDimitry Andric   //     uint16_t workgroup_size_z;
8940b57cec5SDimitry Andric   //     uint16_t reserved0;
8950b57cec5SDimitry Andric   //     uint32_t grid_size_x;
8960b57cec5SDimitry Andric   //     uint32_t grid_size_y;
8970b57cec5SDimitry Andric   //     uint32_t grid_size_z;
8980b57cec5SDimitry Andric   //
8990b57cec5SDimitry Andric   //     uint32_t private_segment_size;
9000b57cec5SDimitry Andric   //     uint32_t group_segment_size;
9010b57cec5SDimitry Andric   //     uint64_t kernel_object;
9020b57cec5SDimitry Andric   //
9030b57cec5SDimitry Andric   // #ifdef HSA_LARGE_MODEL
9040b57cec5SDimitry Andric   //     void *kernarg_address;
9050b57cec5SDimitry Andric   // #elif defined HSA_LITTLE_ENDIAN
9060b57cec5SDimitry Andric   //     void *kernarg_address;
9070b57cec5SDimitry Andric   //     uint32_t reserved1;
9080b57cec5SDimitry Andric   // #else
9090b57cec5SDimitry Andric   //     uint32_t reserved1;
9100b57cec5SDimitry Andric   //     void *kernarg_address;
9110b57cec5SDimitry Andric   // #endif
9120b57cec5SDimitry Andric   //     uint64_t reserved2;
9130b57cec5SDimitry Andric   //     hsa_signal_t completion_signal; // uint64_t wrapper
9140b57cec5SDimitry Andric   //   } hsa_kernel_dispatch_packet_t
9150b57cec5SDimitry Andric   //
91606c3fb27SDimitry Andric   Function *DispatchPtrFn =
91706c3fb27SDimitry Andric       Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
9180b57cec5SDimitry Andric 
9190b57cec5SDimitry Andric   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
920349cc55cSDimitry Andric   DispatchPtr->addRetAttr(Attribute::NoAlias);
921349cc55cSDimitry Andric   DispatchPtr->addRetAttr(Attribute::NonNull);
922349cc55cSDimitry Andric   F.removeFnAttr("amdgpu-no-dispatch-ptr");
9230b57cec5SDimitry Andric 
9240b57cec5SDimitry Andric   // Size of the dispatch packet struct.
925349cc55cSDimitry Andric   DispatchPtr->addDereferenceableRetAttr(64);
9260b57cec5SDimitry Andric 
9270b57cec5SDimitry Andric   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
9280b57cec5SDimitry Andric   Value *CastDispatchPtr = Builder.CreateBitCast(
9290b57cec5SDimitry Andric       DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
9300b57cec5SDimitry Andric 
9310b57cec5SDimitry Andric   // We could do a single 64-bit load here, but it's likely that the basic
9320b57cec5SDimitry Andric   // 32-bit and extract sequence is already present, and it is probably easier
933349cc55cSDimitry Andric   // to CSE this. The loads should be mergeable later anyway.
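  //
  // Given the packet layout above, i32 index 1 covers bytes 4-7
  // (workgroup_size_x in the low half, workgroup_size_y in the high half) and
  // i32 index 2 covers bytes 8-11 (workgroup_size_z in the low half,
  // reserved0 in the high half), which is why the two loads below use the
  // constant GEP indices 1 and 2.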
9340b57cec5SDimitry Andric   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
9355ffd83dbSDimitry Andric   LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
9360b57cec5SDimitry Andric 
9370b57cec5SDimitry Andric   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
9385ffd83dbSDimitry Andric   LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
9390b57cec5SDimitry Andric 
940bdd1243dSDimitry Andric   MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt);
9410b57cec5SDimitry Andric   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
9420b57cec5SDimitry Andric   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
9430b57cec5SDimitry Andric   ST.makeLIDRangeMetadata(LoadZU);
9440b57cec5SDimitry Andric 
9450b57cec5SDimitry Andric   // Extract y component. Upper half of LoadZU should be zero already.
9460b57cec5SDimitry Andric   Value *Y = Builder.CreateLShr(LoadXY, 16);
9470b57cec5SDimitry Andric 
948bdd1243dSDimitry Andric   return std::pair(Y, LoadZU);
9490b57cec5SDimitry Andric }
9500b57cec5SDimitry Andric 
951e8d8bef9SDimitry Andric Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
952e8d8bef9SDimitry Andric                                               unsigned N) {
953349cc55cSDimitry Andric   Function *F = Builder.GetInsertBlock()->getParent();
954349cc55cSDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
955480093f4SDimitry Andric   Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
956349cc55cSDimitry Andric   StringRef AttrName;
9570b57cec5SDimitry Andric 
9580b57cec5SDimitry Andric   switch (N) {
9590b57cec5SDimitry Andric   case 0:
960480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
961480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
962349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-x";
9630b57cec5SDimitry Andric     break;
9640b57cec5SDimitry Andric   case 1:
965480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
966480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
967349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-y";
9680b57cec5SDimitry Andric     break;
9690b57cec5SDimitry Andric 
9700b57cec5SDimitry Andric   case 2:
971480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
972480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
973349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-z";
9740b57cec5SDimitry Andric     break;
9750b57cec5SDimitry Andric   default:
9760b57cec5SDimitry Andric     llvm_unreachable("invalid dimension");
9770b57cec5SDimitry Andric   }
9780b57cec5SDimitry Andric 
9790b57cec5SDimitry Andric   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
9800b57cec5SDimitry Andric   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
9810b57cec5SDimitry Andric   ST.makeLIDRangeMetadata(CI);
982349cc55cSDimitry Andric   F->removeFnAttr(AttrName);
9830b57cec5SDimitry Andric 
9840b57cec5SDimitry Andric   return CI;
9850b57cec5SDimitry Andric }
9860b57cec5SDimitry Andric 
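// Returns true if \p CI is an intrinsic call that the LDS promotion below
// knows how to rewrite (or safely drop) once the alloca has been moved to
// LDS; any other call forces us to give up on the promotion.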
9870b57cec5SDimitry Andric static bool isCallPromotable(CallInst *CI) {
9880b57cec5SDimitry Andric   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
9890b57cec5SDimitry Andric   if (!II)
9900b57cec5SDimitry Andric     return false;
9910b57cec5SDimitry Andric 
9920b57cec5SDimitry Andric   switch (II->getIntrinsicID()) {
9930b57cec5SDimitry Andric   case Intrinsic::memcpy:
9940b57cec5SDimitry Andric   case Intrinsic::memmove:
9950b57cec5SDimitry Andric   case Intrinsic::memset:
9960b57cec5SDimitry Andric   case Intrinsic::lifetime_start:
9970b57cec5SDimitry Andric   case Intrinsic::lifetime_end:
9980b57cec5SDimitry Andric   case Intrinsic::invariant_start:
9990b57cec5SDimitry Andric   case Intrinsic::invariant_end:
10000b57cec5SDimitry Andric   case Intrinsic::launder_invariant_group:
10010b57cec5SDimitry Andric   case Intrinsic::strip_invariant_group:
10020b57cec5SDimitry Andric   case Intrinsic::objectsize:
10030b57cec5SDimitry Andric     return true;
10040b57cec5SDimitry Andric   default:
10050b57cec5SDimitry Andric     return false;
10060b57cec5SDimitry Andric   }
10070b57cec5SDimitry Andric }
10080b57cec5SDimitry Andric 
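// Checks a two-pointer-operand instruction (icmp, select, or a two-input phi)
// that uses \p Val: the promotion is only safe if the other pointer operand
// is either null or derives from the same \p BaseAlloca, since both operands
// must end up in the same address space after the rewrite.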
1009e8d8bef9SDimitry Andric bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1010e8d8bef9SDimitry Andric     Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
10110b57cec5SDimitry Andric     int OpIdx1) const {
10120b57cec5SDimitry Andric   // Figure out which operand is the one we might not be promoting.
10130b57cec5SDimitry Andric   Value *OtherOp = Inst->getOperand(OpIdx0);
10140b57cec5SDimitry Andric   if (Val == OtherOp)
10150b57cec5SDimitry Andric     OtherOp = Inst->getOperand(OpIdx1);
10160b57cec5SDimitry Andric 
10170b57cec5SDimitry Andric   if (isa<ConstantPointerNull>(OtherOp))
10180b57cec5SDimitry Andric     return true;
10190b57cec5SDimitry Andric 
1020e8d8bef9SDimitry Andric   Value *OtherObj = getUnderlyingObject(OtherOp);
10210b57cec5SDimitry Andric   if (!isa<AllocaInst>(OtherObj))
10220b57cec5SDimitry Andric     return false;
10230b57cec5SDimitry Andric 
10240b57cec5SDimitry Andric   // TODO: We should be able to replace undefs with the right pointer type.
10250b57cec5SDimitry Andric 
10260b57cec5SDimitry Andric   // TODO: If we know the other base object is another promotable
10270b57cec5SDimitry Andric   // alloca, not necessarily this alloca, we can do this. The
10280b57cec5SDimitry Andric   // important part is both must have the same address space at
10290b57cec5SDimitry Andric   // the end.
10300b57cec5SDimitry Andric   if (OtherObj != BaseAlloca) {
10310b57cec5SDimitry Andric     LLVM_DEBUG(
10320b57cec5SDimitry Andric         dbgs() << "Found a binary instruction with another alloca object\n");
10330b57cec5SDimitry Andric     return false;
10340b57cec5SDimitry Andric   }
10350b57cec5SDimitry Andric 
10360b57cec5SDimitry Andric   return true;
10370b57cec5SDimitry Andric }
10380b57cec5SDimitry Andric 
1039e8d8bef9SDimitry Andric bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1040e8d8bef9SDimitry Andric     Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
10410b57cec5SDimitry Andric 
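  // Walk the transitive users of Val and collect the ones that must be
  // rewritten to the new address space. We give up on anything that could let
  // the pointer escape or that we cannot retype: ptrtoint, volatile accesses,
  // address space casts whose result may be captured, non-inbounds GEPs,
  // vector/aggregate inserts, and icmp/select/phi whose other pointer operand
  // is not derived from the same alloca.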
10420b57cec5SDimitry Andric   for (User *User : Val->users()) {
10430b57cec5SDimitry Andric     if (is_contained(WorkList, User))
10440b57cec5SDimitry Andric       continue;
10450b57cec5SDimitry Andric 
10460b57cec5SDimitry Andric     if (CallInst *CI = dyn_cast<CallInst>(User)) {
10470b57cec5SDimitry Andric       if (!isCallPromotable(CI))
10480b57cec5SDimitry Andric         return false;
10490b57cec5SDimitry Andric 
10500b57cec5SDimitry Andric       WorkList.push_back(User);
10510b57cec5SDimitry Andric       continue;
10520b57cec5SDimitry Andric     }
10530b57cec5SDimitry Andric 
10540b57cec5SDimitry Andric     Instruction *UseInst = cast<Instruction>(User);
10550b57cec5SDimitry Andric     if (UseInst->getOpcode() == Instruction::PtrToInt)
10560b57cec5SDimitry Andric       return false;
10570b57cec5SDimitry Andric 
10580b57cec5SDimitry Andric     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
10590b57cec5SDimitry Andric       if (LI->isVolatile())
10600b57cec5SDimitry Andric         return false;
10610b57cec5SDimitry Andric 
10620b57cec5SDimitry Andric       continue;
10630b57cec5SDimitry Andric     }
10640b57cec5SDimitry Andric 
10650b57cec5SDimitry Andric     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
10660b57cec5SDimitry Andric       if (SI->isVolatile())
10670b57cec5SDimitry Andric         return false;
10680b57cec5SDimitry Andric 
10690b57cec5SDimitry Andric       // Reject if the pointer is the value being stored rather than the
10700b57cec5SDimitry Andric       // address; the pointer would escape through memory.
10700b57cec5SDimitry Andric       if (SI->getPointerOperand() != Val)
10710b57cec5SDimitry Andric         return false;
10720b57cec5SDimitry Andric     } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
10730b57cec5SDimitry Andric       if (RMW->isVolatile())
10740b57cec5SDimitry Andric         return false;
10750b57cec5SDimitry Andric     } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
10760b57cec5SDimitry Andric       if (CAS->isVolatile())
10770b57cec5SDimitry Andric         return false;
10780b57cec5SDimitry Andric     }
10790b57cec5SDimitry Andric 
10800b57cec5SDimitry Andric     // Only promote an icmp if we know that the other icmp operand
10810b57cec5SDimitry Andric     // is from another pointer that will also be promoted.
10820b57cec5SDimitry Andric     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
10830b57cec5SDimitry Andric       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
10840b57cec5SDimitry Andric         return false;
10850b57cec5SDimitry Andric 
10860b57cec5SDimitry Andric       // May need to rewrite constant operands.
10870b57cec5SDimitry Andric       WorkList.push_back(ICmp);
10880b57cec5SDimitry Andric     }
10890b57cec5SDimitry Andric 
10900b57cec5SDimitry Andric     if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
10910b57cec5SDimitry Andric       // Give up if the pointer may be captured.
10920b57cec5SDimitry Andric       if (PointerMayBeCaptured(UseInst, true, true))
10930b57cec5SDimitry Andric         return false;
10940b57cec5SDimitry Andric       // Don't collect the users of this.
10950b57cec5SDimitry Andric       WorkList.push_back(User);
10960b57cec5SDimitry Andric       continue;
10970b57cec5SDimitry Andric     }
10980b57cec5SDimitry Andric 
1099fe6060f1SDimitry Andric     // Do not promote vector/aggregate type instructions. It is hard to track
1100fe6060f1SDimitry Andric     // their users.
1101fe6060f1SDimitry Andric     if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
1102fe6060f1SDimitry Andric       return false;
1103fe6060f1SDimitry Andric 
11040b57cec5SDimitry Andric     if (!User->getType()->isPointerTy())
11050b57cec5SDimitry Andric       continue;
11060b57cec5SDimitry Andric 
11070b57cec5SDimitry Andric     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
11080b57cec5SDimitry Andric       // Be conservative if an address could be computed outside the bounds of
11090b57cec5SDimitry Andric       // the alloca.
11100b57cec5SDimitry Andric       if (!GEP->isInBounds())
11110b57cec5SDimitry Andric         return false;
11120b57cec5SDimitry Andric     }
11130b57cec5SDimitry Andric 
11140b57cec5SDimitry Andric     // Only promote a select if we know that the other select operand is from
11150b57cec5SDimitry Andric     // another pointer that will also be promoted.
11160b57cec5SDimitry Andric     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
11170b57cec5SDimitry Andric       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
11180b57cec5SDimitry Andric         return false;
11190b57cec5SDimitry Andric     }
11200b57cec5SDimitry Andric 
11210b57cec5SDimitry Andric     // Repeat for phis.
11220b57cec5SDimitry Andric     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
11230b57cec5SDimitry Andric       // TODO: Handle more complex cases. We should be able to replace loops
11240b57cec5SDimitry Andric       // over arrays.
11250b57cec5SDimitry Andric       switch (Phi->getNumIncomingValues()) {
11260b57cec5SDimitry Andric       case 1:
11270b57cec5SDimitry Andric         break;
11280b57cec5SDimitry Andric       case 2:
11290b57cec5SDimitry Andric         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
11300b57cec5SDimitry Andric           return false;
11310b57cec5SDimitry Andric         break;
11320b57cec5SDimitry Andric       default:
11330b57cec5SDimitry Andric         return false;
11340b57cec5SDimitry Andric       }
11350b57cec5SDimitry Andric     }
11360b57cec5SDimitry Andric 
11370b57cec5SDimitry Andric     WorkList.push_back(User);
11380b57cec5SDimitry Andric     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
11390b57cec5SDimitry Andric       return false;
11400b57cec5SDimitry Andric   }
11410b57cec5SDimitry Andric 
11420b57cec5SDimitry Andric   return true;
11430b57cec5SDimitry Andric }
11440b57cec5SDimitry Andric 
1145e8d8bef9SDimitry Andric bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
11460b57cec5SDimitry Andric 
11470b57cec5SDimitry Andric   FunctionType *FTy = F.getFunctionType();
1148e8d8bef9SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
11490b57cec5SDimitry Andric 
11500b57cec5SDimitry Andric   // If the function has any arguments in the local address space, then it's
11510b57cec5SDimitry Andric   // possible these arguments require the entire local memory space, so
11520b57cec5SDimitry Andric   // we cannot use local memory in the pass.
11530b57cec5SDimitry Andric   for (Type *ParamTy : FTy->params()) {
11540b57cec5SDimitry Andric     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
11550b57cec5SDimitry Andric     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
11560b57cec5SDimitry Andric       LocalMemLimit = 0;
11570b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promotion to "
11580b57cec5SDimitry Andric                            "local memory disabled.\n");
11590b57cec5SDimitry Andric       return false;
11600b57cec5SDimitry Andric     }
11610b57cec5SDimitry Andric   }
11620b57cec5SDimitry Andric 
1163bdd1243dSDimitry Andric   LocalMemLimit = ST.getAddressableLocalMemorySize();
11640b57cec5SDimitry Andric   if (LocalMemLimit == 0)
11650b57cec5SDimitry Andric     return false;
11660b57cec5SDimitry Andric 
1167e8d8bef9SDimitry Andric   SmallVector<const Constant *, 16> Stack;
1168e8d8bef9SDimitry Andric   SmallPtrSet<const Constant *, 8> VisitedConstants;
1169e8d8bef9SDimitry Andric   SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
11700b57cec5SDimitry Andric 
1171e8d8bef9SDimitry Andric   auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1172e8d8bef9SDimitry Andric     for (const User *U : Val->users()) {
1173e8d8bef9SDimitry Andric       if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1174e8d8bef9SDimitry Andric         if (Use->getParent()->getParent() == &F)
1175e8d8bef9SDimitry Andric           return true;
1176e8d8bef9SDimitry Andric       } else {
1177e8d8bef9SDimitry Andric         const Constant *C = cast<Constant>(U);
1178e8d8bef9SDimitry Andric         if (VisitedConstants.insert(C).second)
1179e8d8bef9SDimitry Andric           Stack.push_back(C);
1180e8d8bef9SDimitry Andric       }
1181e8d8bef9SDimitry Andric     }
1182e8d8bef9SDimitry Andric 
1183e8d8bef9SDimitry Andric     return false;
1184e8d8bef9SDimitry Andric   };
1185e8d8bef9SDimitry Andric 
11860b57cec5SDimitry Andric   for (GlobalVariable &GV : Mod->globals()) {
1187480093f4SDimitry Andric     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
11880b57cec5SDimitry Andric       continue;
11890b57cec5SDimitry Andric 
1190e8d8bef9SDimitry Andric     if (visitUsers(&GV, &GV)) {
1191e8d8bef9SDimitry Andric       UsedLDS.insert(&GV);
1192e8d8bef9SDimitry Andric       Stack.clear();
11930b57cec5SDimitry Andric       continue;
1194e8d8bef9SDimitry Andric     }
11950b57cec5SDimitry Andric 
1196e8d8bef9SDimitry Andric     // For any ConstantExpr uses, we need to recursively search the users until
1197e8d8bef9SDimitry Andric     // we see a function.
1198e8d8bef9SDimitry Andric     while (!Stack.empty()) {
1199e8d8bef9SDimitry Andric       const Constant *C = Stack.pop_back_val();
1200e8d8bef9SDimitry Andric       if (visitUsers(&GV, C)) {
1201e8d8bef9SDimitry Andric         UsedLDS.insert(&GV);
1202e8d8bef9SDimitry Andric         Stack.clear();
12030b57cec5SDimitry Andric         break;
12040b57cec5SDimitry Andric       }
12050b57cec5SDimitry Andric     }
12060b57cec5SDimitry Andric   }
12070b57cec5SDimitry Andric 
1208e8d8bef9SDimitry Andric   const DataLayout &DL = Mod->getDataLayout();
1209e8d8bef9SDimitry Andric   SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1210e8d8bef9SDimitry Andric   AllocatedSizes.reserve(UsedLDS.size());
1211e8d8bef9SDimitry Andric 
1212e8d8bef9SDimitry Andric   for (const GlobalVariable *GV : UsedLDS) {
1213e8d8bef9SDimitry Andric     Align Alignment =
1214e8d8bef9SDimitry Andric         DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1215e8d8bef9SDimitry Andric     uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
121604eeddc0SDimitry Andric 
121704eeddc0SDimitry Andric     // HIP uses an extern unsized array in local address space for dynamically
121804eeddc0SDimitry Andric     // allocated shared memory.  In that case, we have to disable the promotion.
121904eeddc0SDimitry Andric     if (GV->hasExternalLinkage() && AllocSize == 0) {
122004eeddc0SDimitry Andric       LocalMemLimit = 0;
122104eeddc0SDimitry Andric       LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
122204eeddc0SDimitry Andric                            "local memory. Promotion to local memory "
122304eeddc0SDimitry Andric                            "disabled.\n");
122404eeddc0SDimitry Andric       return false;
122504eeddc0SDimitry Andric     }
122604eeddc0SDimitry Andric 
1227e8d8bef9SDimitry Andric     AllocatedSizes.emplace_back(AllocSize, Alignment);
1228e8d8bef9SDimitry Andric   }
1229e8d8bef9SDimitry Andric 
1230e8d8bef9SDimitry Andric   // Sort to try to estimate the worst case alignment padding
1231e8d8bef9SDimitry Andric   //
1232e8d8bef9SDimitry Andric   // FIXME: We should really do something to fix the addresses to a more optimal
1233e8d8bef9SDimitry Andric   // value instead
123481ad6265SDimitry Andric   llvm::sort(AllocatedSizes, llvm::less_second());
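  // Walking the variables in order of increasing alignment tends to maximize
  // the padding inserted between consecutive variables, so the running total
  // computed below should be a conservative (worst-case) estimate.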
1235e8d8bef9SDimitry Andric 
1236e8d8bef9SDimitry Andric   // Check how much local memory is being used by global objects
1237e8d8bef9SDimitry Andric   CurrentLocalMemUsage = 0;
1238e8d8bef9SDimitry Andric 
1239e8d8bef9SDimitry Andric   // FIXME: Try to account for padding here. The real padding and address is
1240e8d8bef9SDimitry Andric   // currently determined from the inverse order of uses in the function when
1241e8d8bef9SDimitry Andric   // legalizing, which could also potentially change. We try to estimate the
1242e8d8bef9SDimitry Andric   // worst case here, but we probably should fix the addresses earlier.
1243e8d8bef9SDimitry Andric   for (auto Alloc : AllocatedSizes) {
1244e8d8bef9SDimitry Andric     CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1245e8d8bef9SDimitry Andric     CurrentLocalMemUsage += Alloc.first;
1246e8d8bef9SDimitry Andric   }
1247e8d8bef9SDimitry Andric 
124806c3fb27SDimitry Andric   unsigned MaxOccupancy =
124906c3fb27SDimitry Andric       ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
12500b57cec5SDimitry Andric 
12510b57cec5SDimitry Andric   // Restrict local memory usage so that we don't drastically reduce occupancy,
12520b57cec5SDimitry Andric   // unless it is already significantly reduced.
12530b57cec5SDimitry Andric 
12540b57cec5SDimitry Andric   // TODO: Have some sort of hint or other heuristics to guess occupancy based
12550b57cec5SDimitry Andric   // on other factors.
12560b57cec5SDimitry Andric   unsigned OccupancyHint = ST.getWavesPerEU(F).second;
12570b57cec5SDimitry Andric   if (OccupancyHint == 0)
12580b57cec5SDimitry Andric     OccupancyHint = 7;
12590b57cec5SDimitry Andric 
12600b57cec5SDimitry Andric   // Clamp to max value.
12610b57cec5SDimitry Andric   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
12620b57cec5SDimitry Andric 
12630b57cec5SDimitry Andric   // Check the hint but ignore it if it's obviously wrong from the existing LDS
12640b57cec5SDimitry Andric   // usage.
12650b57cec5SDimitry Andric   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
12660b57cec5SDimitry Andric 
12670b57cec5SDimitry Andric   // Round up to the next tier of usage.
126806c3fb27SDimitry Andric   unsigned MaxSizeWithWaveCount =
126906c3fb27SDimitry Andric       ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
12700b57cec5SDimitry Andric 
12710b57cec5SDimitry Andric   // Program is possibly broken by using more local mem than available.
12720b57cec5SDimitry Andric   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
12730b57cec5SDimitry Andric     return false;
12740b57cec5SDimitry Andric 
12750b57cec5SDimitry Andric   LocalMemLimit = MaxSizeWithWaveCount;
12760b57cec5SDimitry Andric 
12770b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
12780b57cec5SDimitry Andric                     << " bytes of LDS\n"
12790b57cec5SDimitry Andric                     << "  Rounding size to " << MaxSizeWithWaveCount
12800b57cec5SDimitry Andric                     << " with a maximum occupancy of " << MaxOccupancy << '\n'
12810b57cec5SDimitry Andric                     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
12820b57cec5SDimitry Andric                     << " available for promotion\n");
12830b57cec5SDimitry Andric 
12840b57cec5SDimitry Andric   return true;
12850b57cec5SDimitry Andric }
12860b57cec5SDimitry Andric 
12870b57cec5SDimitry Andric // FIXME: Should try to pick the most likely to be profitable allocas first.
128806c3fb27SDimitry Andric bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
128906c3fb27SDimitry Andric                                                     bool SufficientLDS) {
129006c3fb27SDimitry Andric   LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
129106c3fb27SDimitry Andric 
129206c3fb27SDimitry Andric   if (DisablePromoteAllocaToLDS) {
129306c3fb27SDimitry Andric     LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
12940b57cec5SDimitry Andric     return false;
129506c3fb27SDimitry Andric   }
12960b57cec5SDimitry Andric 
12975ffd83dbSDimitry Andric   const DataLayout &DL = Mod->getDataLayout();
12980b57cec5SDimitry Andric   IRBuilder<> Builder(&I);
12990b57cec5SDimitry Andric 
13000b57cec5SDimitry Andric   const Function &ContainingFunction = *I.getParent()->getParent();
13010b57cec5SDimitry Andric   CallingConv::ID CC = ContainingFunction.getCallingConv();
13020b57cec5SDimitry Andric 
13030b57cec5SDimitry Andric   // Don't promote the alloca to LDS for shader calling conventions as the work
13040b57cec5SDimitry Andric   // item ID intrinsics are not supported for these calling conventions.
13050b57cec5SDimitry Andric   // Furthermore, not all LDS is available for some of the stages.
13060b57cec5SDimitry Andric   switch (CC) {
13070b57cec5SDimitry Andric   case CallingConv::AMDGPU_KERNEL:
13080b57cec5SDimitry Andric   case CallingConv::SPIR_KERNEL:
13090b57cec5SDimitry Andric     break;
13100b57cec5SDimitry Andric   default:
13110b57cec5SDimitry Andric     LLVM_DEBUG(
13120b57cec5SDimitry Andric         dbgs()
13130b57cec5SDimitry Andric         << " promote alloca to LDS not supported with calling convention.\n");
13140b57cec5SDimitry Andric     return false;
13150b57cec5SDimitry Andric   }
13160b57cec5SDimitry Andric 
13170b57cec5SDimitry Andric   // Not likely to have sufficient local memory for promotion.
13180b57cec5SDimitry Andric   if (!SufficientLDS)
13190b57cec5SDimitry Andric     return false;
13200b57cec5SDimitry Andric 
1321e8d8bef9SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
13220b57cec5SDimitry Andric   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
13230b57cec5SDimitry Andric 
13245ffd83dbSDimitry Andric   Align Alignment =
13255ffd83dbSDimitry Andric       DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
13260b57cec5SDimitry Andric 
13270b57cec5SDimitry Andric   // FIXME: This computed padding is likely wrong since it depends on inverse
13280b57cec5SDimitry Andric   // usage order.
13290b57cec5SDimitry Andric   //
13300b57cec5SDimitry Andric   // FIXME: It is also possible that if we're allowed to use all of the memory
133181ad6265SDimitry Andric   // we could end up using more than the maximum due to alignment padding.
13320b57cec5SDimitry Andric 
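  // The alloca is replicated once per work item, so the LDS cost is the
  // allocation size times the maximum flat workgroup size. For example, a
  // 64-byte alloca in a kernel with a 256 work-item workgroup consumes
  // 256 * 64 = 16384 bytes of LDS before alignment padding.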
13335ffd83dbSDimitry Andric   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
133406c3fb27SDimitry Andric   uint32_t AllocSize =
133506c3fb27SDimitry Andric       WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
13360b57cec5SDimitry Andric   NewSize += AllocSize;
13370b57cec5SDimitry Andric 
13380b57cec5SDimitry Andric   if (NewSize > LocalMemLimit) {
13390b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  " << AllocSize
13400b57cec5SDimitry Andric                       << " bytes of local memory not available to promote\n");
13410b57cec5SDimitry Andric     return false;
13420b57cec5SDimitry Andric   }
13430b57cec5SDimitry Andric 
13440b57cec5SDimitry Andric   CurrentLocalMemUsage = NewSize;
13450b57cec5SDimitry Andric 
13460b57cec5SDimitry Andric   std::vector<Value *> WorkList;
13470b57cec5SDimitry Andric 
13480b57cec5SDimitry Andric   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
13490b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
13500b57cec5SDimitry Andric     return false;
13510b57cec5SDimitry Andric   }
13520b57cec5SDimitry Andric 
13530b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
13540b57cec5SDimitry Andric 
13550b57cec5SDimitry Andric   Function *F = I.getParent()->getParent();
13560b57cec5SDimitry Andric 
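  // Replace the alloca with one slot of a [WorkGroupSize x AllocatedType]
  // array in LDS; each work item addresses its own slot using its linearized
  // work-item ID, computed below.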
13570b57cec5SDimitry Andric   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
13580b57cec5SDimitry Andric   GlobalVariable *GV = new GlobalVariable(
1359bdd1243dSDimitry Andric       *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1360bdd1243dSDimitry Andric       Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
1361bdd1243dSDimitry Andric       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
13620b57cec5SDimitry Andric   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
13630eae32dcSDimitry Andric   GV->setAlignment(I.getAlign());
13640b57cec5SDimitry Andric 
13650b57cec5SDimitry Andric   Value *TCntY, *TCntZ;
13660b57cec5SDimitry Andric 
13670b57cec5SDimitry Andric   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
13680b57cec5SDimitry Andric   Value *TIdX = getWorkitemID(Builder, 0);
13690b57cec5SDimitry Andric   Value *TIdY = getWorkitemID(Builder, 1);
13700b57cec5SDimitry Andric   Value *TIdZ = getWorkitemID(Builder, 2);
13710b57cec5SDimitry Andric 
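  // Linearize the 3D work-item ID into a flat index into the LDS array:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ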
13720b57cec5SDimitry Andric   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
13730b57cec5SDimitry Andric   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
13740b57cec5SDimitry Andric   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
13750b57cec5SDimitry Andric   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
13760b57cec5SDimitry Andric   TID = Builder.CreateAdd(TID, TIdZ);
13770b57cec5SDimitry Andric 
137806c3fb27SDimitry Andric   LLVMContext &Context = Mod->getContext();
137906c3fb27SDimitry Andric   Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
13800b57cec5SDimitry Andric 
13810b57cec5SDimitry Andric   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
13820b57cec5SDimitry Andric   I.mutateType(Offset->getType());
13830b57cec5SDimitry Andric   I.replaceAllUsesWith(Offset);
13840b57cec5SDimitry Andric   I.eraseFromParent();
13850b57cec5SDimitry Andric 
1386fe6060f1SDimitry Andric   SmallVector<IntrinsicInst *> DeferredIntrs;
1387fe6060f1SDimitry Andric 
13880b57cec5SDimitry Andric   for (Value *V : WorkList) {
13890b57cec5SDimitry Andric     CallInst *Call = dyn_cast<CallInst>(V);
13900b57cec5SDimitry Andric     if (!Call) {
13910b57cec5SDimitry Andric       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
139206c3fb27SDimitry Andric         PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
13930b57cec5SDimitry Andric 
13940b57cec5SDimitry Andric         if (isa<ConstantPointerNull>(CI->getOperand(0)))
13950b57cec5SDimitry Andric           CI->setOperand(0, ConstantPointerNull::get(NewTy));
13960b57cec5SDimitry Andric 
13970b57cec5SDimitry Andric         if (isa<ConstantPointerNull>(CI->getOperand(1)))
13980b57cec5SDimitry Andric           CI->setOperand(1, ConstantPointerNull::get(NewTy));
13990b57cec5SDimitry Andric 
14000b57cec5SDimitry Andric         continue;
14010b57cec5SDimitry Andric       }
14020b57cec5SDimitry Andric 
14030b57cec5SDimitry Andric       // The operand's value should be corrected on its own and we don't want to
14040b57cec5SDimitry Andric       // touch the users.
14050b57cec5SDimitry Andric       if (isa<AddrSpaceCastInst>(V))
14060b57cec5SDimitry Andric         continue;
14070b57cec5SDimitry Andric 
140806c3fb27SDimitry Andric       PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
14090b57cec5SDimitry Andric 
14100b57cec5SDimitry Andric       // FIXME: It doesn't really make sense to try to do this for all
14110b57cec5SDimitry Andric       // instructions.
14120b57cec5SDimitry Andric       V->mutateType(NewTy);
14130b57cec5SDimitry Andric 
14140b57cec5SDimitry Andric       // Adjust the types of any constant operands.
14150b57cec5SDimitry Andric       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
14160b57cec5SDimitry Andric         if (isa<ConstantPointerNull>(SI->getOperand(1)))
14170b57cec5SDimitry Andric           SI->setOperand(1, ConstantPointerNull::get(NewTy));
14180b57cec5SDimitry Andric 
14190b57cec5SDimitry Andric         if (isa<ConstantPointerNull>(SI->getOperand(2)))
14200b57cec5SDimitry Andric           SI->setOperand(2, ConstantPointerNull::get(NewTy));
14210b57cec5SDimitry Andric       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
14220b57cec5SDimitry Andric         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
14230b57cec5SDimitry Andric           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
14240b57cec5SDimitry Andric             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
14250b57cec5SDimitry Andric         }
14260b57cec5SDimitry Andric       }
14270b57cec5SDimitry Andric 
14280b57cec5SDimitry Andric       continue;
14290b57cec5SDimitry Andric     }
14300b57cec5SDimitry Andric 
14310b57cec5SDimitry Andric     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
14320b57cec5SDimitry Andric     Builder.SetInsertPoint(Intr);
14330b57cec5SDimitry Andric     switch (Intr->getIntrinsicID()) {
14340b57cec5SDimitry Andric     case Intrinsic::lifetime_start:
14350b57cec5SDimitry Andric     case Intrinsic::lifetime_end:
14360b57cec5SDimitry Andric       // These intrinsics are for address space 0 only
14370b57cec5SDimitry Andric       Intr->eraseFromParent();
14380b57cec5SDimitry Andric       continue;
1439fe6060f1SDimitry Andric     case Intrinsic::memcpy:
1440fe6060f1SDimitry Andric     case Intrinsic::memmove:
1441fe6060f1SDimitry Andric       // These have 2 pointer operands. If the second pointer also needs to be
1442fe6060f1SDimitry Andric       // replaced, we defer processing of these intrinsics until all other
1443fe6060f1SDimitry Andric       // values are processed.
1444fe6060f1SDimitry Andric       DeferredIntrs.push_back(Intr);
14450b57cec5SDimitry Andric       continue;
14460b57cec5SDimitry Andric     case Intrinsic::memset: {
14470b57cec5SDimitry Andric       MemSetInst *MemSet = cast<MemSetInst>(Intr);
1448bdd1243dSDimitry Andric       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1449bdd1243dSDimitry Andric                            MemSet->getLength(), MemSet->getDestAlign(),
1450bdd1243dSDimitry Andric                            MemSet->isVolatile());
14510b57cec5SDimitry Andric       Intr->eraseFromParent();
14520b57cec5SDimitry Andric       continue;
14530b57cec5SDimitry Andric     }
14540b57cec5SDimitry Andric     case Intrinsic::invariant_start:
14550b57cec5SDimitry Andric     case Intrinsic::invariant_end:
14560b57cec5SDimitry Andric     case Intrinsic::launder_invariant_group:
14570b57cec5SDimitry Andric     case Intrinsic::strip_invariant_group:
14580b57cec5SDimitry Andric       Intr->eraseFromParent();
14590b57cec5SDimitry Andric       // FIXME: I think the invariant marker should still theoretically apply,
14600b57cec5SDimitry Andric       // but the intrinsics need to be changed to accept pointers with any
14610b57cec5SDimitry Andric       // address space.
14620b57cec5SDimitry Andric       continue;
14630b57cec5SDimitry Andric     case Intrinsic::objectsize: {
14640b57cec5SDimitry Andric       Value *Src = Intr->getOperand(0);
1465fe6060f1SDimitry Andric       Function *ObjectSize = Intrinsic::getDeclaration(
1466fe6060f1SDimitry Andric           Mod, Intrinsic::objectsize,
1467fe6060f1SDimitry Andric           {Intr->getType(),
146806c3fb27SDimitry Andric            PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});
14690b57cec5SDimitry Andric 
14700b57cec5SDimitry Andric       CallInst *NewCall = Builder.CreateCall(
14710b57cec5SDimitry Andric           ObjectSize,
14720b57cec5SDimitry Andric           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
14730b57cec5SDimitry Andric       Intr->replaceAllUsesWith(NewCall);
14740b57cec5SDimitry Andric       Intr->eraseFromParent();
14750b57cec5SDimitry Andric       continue;
14760b57cec5SDimitry Andric     }
14770b57cec5SDimitry Andric     default:
14780b57cec5SDimitry Andric       Intr->print(errs());
14790b57cec5SDimitry Andric       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
14800b57cec5SDimitry Andric     }
14810b57cec5SDimitry Andric   }
1482fe6060f1SDimitry Andric 
1483fe6060f1SDimitry Andric   for (IntrinsicInst *Intr : DeferredIntrs) {
1484fe6060f1SDimitry Andric     Builder.SetInsertPoint(Intr);
1485fe6060f1SDimitry Andric     Intrinsic::ID ID = Intr->getIntrinsicID();
1486fe6060f1SDimitry Andric     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1487fe6060f1SDimitry Andric 
1488fe6060f1SDimitry Andric     MemTransferInst *MI = cast<MemTransferInst>(Intr);
148906c3fb27SDimitry Andric     auto *B = Builder.CreateMemTransferInst(
149006c3fb27SDimitry Andric         ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
149106c3fb27SDimitry Andric         MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1492fe6060f1SDimitry Andric 
1493349cc55cSDimitry Andric     for (unsigned I = 0; I != 2; ++I) {
1494349cc55cSDimitry Andric       if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1495349cc55cSDimitry Andric         B->addDereferenceableParamAttr(I, Bytes);
1496fe6060f1SDimitry Andric       }
1497fe6060f1SDimitry Andric     }
1498fe6060f1SDimitry Andric 
1499fe6060f1SDimitry Andric     Intr->eraseFromParent();
1500fe6060f1SDimitry Andric   }
1501fe6060f1SDimitry Andric 
15020b57cec5SDimitry Andric   return true;
15030b57cec5SDimitry Andric }
1504