xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (revision 0b57cec536236d46e3dba9bd041533462f33dbb7)
1*0b57cec5SDimitry Andric //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2*0b57cec5SDimitry Andric //
3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0b57cec5SDimitry Andric //
7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
8*0b57cec5SDimitry Andric //
9*0b57cec5SDimitry Andric // This pass eliminates allocas by either converting them into vectors or
10*0b57cec5SDimitry Andric // by migrating them to local address space.
11*0b57cec5SDimitry Andric //
12*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
13*0b57cec5SDimitry Andric 
14*0b57cec5SDimitry Andric #include "AMDGPU.h"
15*0b57cec5SDimitry Andric #include "AMDGPUSubtarget.h"
16*0b57cec5SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
17*0b57cec5SDimitry Andric #include "llvm/ADT/APInt.h"
18*0b57cec5SDimitry Andric #include "llvm/ADT/None.h"
19*0b57cec5SDimitry Andric #include "llvm/ADT/STLExtras.h"
20*0b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
21*0b57cec5SDimitry Andric #include "llvm/ADT/Triple.h"
22*0b57cec5SDimitry Andric #include "llvm/ADT/Twine.h"
23*0b57cec5SDimitry Andric #include "llvm/Analysis/CaptureTracking.h"
24*0b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
25*0b57cec5SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
26*0b57cec5SDimitry Andric #include "llvm/IR/Attributes.h"
27*0b57cec5SDimitry Andric #include "llvm/IR/BasicBlock.h"
28*0b57cec5SDimitry Andric #include "llvm/IR/Constant.h"
29*0b57cec5SDimitry Andric #include "llvm/IR/Constants.h"
30*0b57cec5SDimitry Andric #include "llvm/IR/DataLayout.h"
31*0b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h"
32*0b57cec5SDimitry Andric #include "llvm/IR/Function.h"
33*0b57cec5SDimitry Andric #include "llvm/IR/GlobalValue.h"
34*0b57cec5SDimitry Andric #include "llvm/IR/GlobalVariable.h"
35*0b57cec5SDimitry Andric #include "llvm/IR/IRBuilder.h"
36*0b57cec5SDimitry Andric #include "llvm/IR/Instruction.h"
37*0b57cec5SDimitry Andric #include "llvm/IR/Instructions.h"
38*0b57cec5SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
39*0b57cec5SDimitry Andric #include "llvm/IR/Intrinsics.h"
40*0b57cec5SDimitry Andric #include "llvm/IR/LLVMContext.h"
41*0b57cec5SDimitry Andric #include "llvm/IR/Metadata.h"
42*0b57cec5SDimitry Andric #include "llvm/IR/Module.h"
43*0b57cec5SDimitry Andric #include "llvm/IR/Type.h"
44*0b57cec5SDimitry Andric #include "llvm/IR/User.h"
45*0b57cec5SDimitry Andric #include "llvm/IR/Value.h"
46*0b57cec5SDimitry Andric #include "llvm/Pass.h"
47*0b57cec5SDimitry Andric #include "llvm/Support/Casting.h"
48*0b57cec5SDimitry Andric #include "llvm/Support/Debug.h"
49*0b57cec5SDimitry Andric #include "llvm/Support/ErrorHandling.h"
50*0b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h"
51*0b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h"
52*0b57cec5SDimitry Andric #include "llvm/Target/TargetMachine.h"
53*0b57cec5SDimitry Andric #include <algorithm>
54*0b57cec5SDimitry Andric #include <cassert>
55*0b57cec5SDimitry Andric #include <cstdint>
56*0b57cec5SDimitry Andric #include <map>
57*0b57cec5SDimitry Andric #include <tuple>
58*0b57cec5SDimitry Andric #include <utility>
59*0b57cec5SDimitry Andric #include <vector>
60*0b57cec5SDimitry Andric 
61*0b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-promote-alloca"
62*0b57cec5SDimitry Andric 
63*0b57cec5SDimitry Andric using namespace llvm;
64*0b57cec5SDimitry Andric 
65*0b57cec5SDimitry Andric namespace {
66*0b57cec5SDimitry Andric 
67*0b57cec5SDimitry Andric static cl::opt<bool> DisablePromoteAllocaToVector(
68*0b57cec5SDimitry Andric   "disable-promote-alloca-to-vector",
69*0b57cec5SDimitry Andric   cl::desc("Disable promote alloca to vector"),
70*0b57cec5SDimitry Andric   cl::init(false));
71*0b57cec5SDimitry Andric 
72*0b57cec5SDimitry Andric static cl::opt<bool> DisablePromoteAllocaToLDS(
73*0b57cec5SDimitry Andric   "disable-promote-alloca-to-lds",
74*0b57cec5SDimitry Andric   cl::desc("Disable promote alloca to LDS"),
75*0b57cec5SDimitry Andric   cl::init(false));
76*0b57cec5SDimitry Andric 
77*0b57cec5SDimitry Andric // FIXME: This can create globals so should be a module pass.
78*0b57cec5SDimitry Andric class AMDGPUPromoteAlloca : public FunctionPass {
79*0b57cec5SDimitry Andric private:
80*0b57cec5SDimitry Andric   const TargetMachine *TM;
81*0b57cec5SDimitry Andric   Module *Mod = nullptr;
82*0b57cec5SDimitry Andric   const DataLayout *DL = nullptr;
83*0b57cec5SDimitry Andric 
84*0b57cec5SDimitry Andric   // FIXME: This should be per-kernel.
85*0b57cec5SDimitry Andric   uint32_t LocalMemLimit = 0;
86*0b57cec5SDimitry Andric   uint32_t CurrentLocalMemUsage = 0;
87*0b57cec5SDimitry Andric 
88*0b57cec5SDimitry Andric   bool IsAMDGCN = false;
89*0b57cec5SDimitry Andric   bool IsAMDHSA = false;
90*0b57cec5SDimitry Andric 
91*0b57cec5SDimitry Andric   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
92*0b57cec5SDimitry Andric   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
93*0b57cec5SDimitry Andric 
94*0b57cec5SDimitry Andric   /// BaseAlloca is the alloca root the search started from.
95*0b57cec5SDimitry Andric   /// Val may be that alloca or a recursive user of it.
96*0b57cec5SDimitry Andric   bool collectUsesWithPtrTypes(Value *BaseAlloca,
97*0b57cec5SDimitry Andric                                Value *Val,
98*0b57cec5SDimitry Andric                                std::vector<Value*> &WorkList) const;
99*0b57cec5SDimitry Andric 
100*0b57cec5SDimitry Andric   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
101*0b57cec5SDimitry Andric   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
102*0b57cec5SDimitry Andric   /// Returns true if both operands are derived from the same alloca. Val should
103*0b57cec5SDimitry Andric   /// be the same value as one of the input operands of UseInst.
104*0b57cec5SDimitry Andric   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
105*0b57cec5SDimitry Andric                                        Instruction *UseInst,
106*0b57cec5SDimitry Andric                                        int OpIdx0, int OpIdx1) const;
107*0b57cec5SDimitry Andric 
108*0b57cec5SDimitry Andric   /// Check whether we have enough local memory for promotion.
109*0b57cec5SDimitry Andric   bool hasSufficientLocalMem(const Function &F);
110*0b57cec5SDimitry Andric 
111*0b57cec5SDimitry Andric public:
112*0b57cec5SDimitry Andric   static char ID;
113*0b57cec5SDimitry Andric 
114*0b57cec5SDimitry Andric   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
115*0b57cec5SDimitry Andric 
116*0b57cec5SDimitry Andric   bool doInitialization(Module &M) override;
117*0b57cec5SDimitry Andric   bool runOnFunction(Function &F) override;
118*0b57cec5SDimitry Andric 
119*0b57cec5SDimitry Andric   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
120*0b57cec5SDimitry Andric 
121*0b57cec5SDimitry Andric   bool handleAlloca(AllocaInst &I, bool SufficientLDS);
122*0b57cec5SDimitry Andric 
123*0b57cec5SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
124*0b57cec5SDimitry Andric     AU.setPreservesCFG();
125*0b57cec5SDimitry Andric     FunctionPass::getAnalysisUsage(AU);
126*0b57cec5SDimitry Andric   }
127*0b57cec5SDimitry Andric };
128*0b57cec5SDimitry Andric 
129*0b57cec5SDimitry Andric } // end anonymous namespace
130*0b57cec5SDimitry Andric 
131*0b57cec5SDimitry Andric char AMDGPUPromoteAlloca::ID = 0;
132*0b57cec5SDimitry Andric 
133*0b57cec5SDimitry Andric INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
134*0b57cec5SDimitry Andric                 "AMDGPU promote alloca to vector or LDS", false, false)
135*0b57cec5SDimitry Andric 
136*0b57cec5SDimitry Andric char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
137*0b57cec5SDimitry Andric 
138*0b57cec5SDimitry Andric bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
139*0b57cec5SDimitry Andric   Mod = &M;
140*0b57cec5SDimitry Andric   DL = &Mod->getDataLayout();
141*0b57cec5SDimitry Andric 
142*0b57cec5SDimitry Andric   return false;
143*0b57cec5SDimitry Andric }
144*0b57cec5SDimitry Andric 
145*0b57cec5SDimitry Andric bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
146*0b57cec5SDimitry Andric   if (skipFunction(F))
147*0b57cec5SDimitry Andric     return false;
148*0b57cec5SDimitry Andric 
149*0b57cec5SDimitry Andric   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
150*0b57cec5SDimitry Andric     TM = &TPC->getTM<TargetMachine>();
151*0b57cec5SDimitry Andric   else
152*0b57cec5SDimitry Andric     return false;
153*0b57cec5SDimitry Andric 
154*0b57cec5SDimitry Andric   const Triple &TT = TM->getTargetTriple();
155*0b57cec5SDimitry Andric   IsAMDGCN = TT.getArch() == Triple::amdgcn;
156*0b57cec5SDimitry Andric   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
157*0b57cec5SDimitry Andric 
158*0b57cec5SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
159*0b57cec5SDimitry Andric   if (!ST.isPromoteAllocaEnabled())
160*0b57cec5SDimitry Andric     return false;
161*0b57cec5SDimitry Andric 
162*0b57cec5SDimitry Andric   bool SufficientLDS = hasSufficientLocalMem(F);
163*0b57cec5SDimitry Andric   bool Changed = false;
164*0b57cec5SDimitry Andric   BasicBlock &EntryBB = *F.begin();
165*0b57cec5SDimitry Andric 
166*0b57cec5SDimitry Andric   SmallVector<AllocaInst *, 16> Allocas;
167*0b57cec5SDimitry Andric   for (Instruction &I : EntryBB) {
168*0b57cec5SDimitry Andric     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
169*0b57cec5SDimitry Andric       Allocas.push_back(AI);
170*0b57cec5SDimitry Andric   }
171*0b57cec5SDimitry Andric 
172*0b57cec5SDimitry Andric   for (AllocaInst *AI : Allocas) {
173*0b57cec5SDimitry Andric     if (handleAlloca(*AI, SufficientLDS))
174*0b57cec5SDimitry Andric       Changed = true;
175*0b57cec5SDimitry Andric   }
176*0b57cec5SDimitry Andric 
177*0b57cec5SDimitry Andric   return Changed;
178*0b57cec5SDimitry Andric }
179*0b57cec5SDimitry Andric 
180*0b57cec5SDimitry Andric std::pair<Value *, Value *>
181*0b57cec5SDimitry Andric AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
182*0b57cec5SDimitry Andric   const Function &F = *Builder.GetInsertBlock()->getParent();
183*0b57cec5SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
184*0b57cec5SDimitry Andric 
185*0b57cec5SDimitry Andric   if (!IsAMDHSA) {
186*0b57cec5SDimitry Andric     Function *LocalSizeYFn
187*0b57cec5SDimitry Andric       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
188*0b57cec5SDimitry Andric     Function *LocalSizeZFn
189*0b57cec5SDimitry Andric       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
190*0b57cec5SDimitry Andric 
191*0b57cec5SDimitry Andric     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
192*0b57cec5SDimitry Andric     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
193*0b57cec5SDimitry Andric 
194*0b57cec5SDimitry Andric     ST.makeLIDRangeMetadata(LocalSizeY);
195*0b57cec5SDimitry Andric     ST.makeLIDRangeMetadata(LocalSizeZ);
196*0b57cec5SDimitry Andric 
197*0b57cec5SDimitry Andric     return std::make_pair(LocalSizeY, LocalSizeZ);
198*0b57cec5SDimitry Andric   }
199*0b57cec5SDimitry Andric 
200*0b57cec5SDimitry Andric   // We must read the size out of the dispatch pointer.
201*0b57cec5SDimitry Andric   assert(IsAMDGCN);
202*0b57cec5SDimitry Andric 
203*0b57cec5SDimitry Andric   // We are indexing into this struct, and want to extract the workgroup_size_*
204*0b57cec5SDimitry Andric   // fields.
205*0b57cec5SDimitry Andric   //
206*0b57cec5SDimitry Andric   //   typedef struct hsa_kernel_dispatch_packet_s {
207*0b57cec5SDimitry Andric   //     uint16_t header;
208*0b57cec5SDimitry Andric   //     uint16_t setup;
209*0b57cec5SDimitry Andric   //     uint16_t workgroup_size_x ;
210*0b57cec5SDimitry Andric   //     uint16_t workgroup_size_y;
211*0b57cec5SDimitry Andric   //     uint16_t workgroup_size_z;
212*0b57cec5SDimitry Andric   //     uint16_t reserved0;
213*0b57cec5SDimitry Andric   //     uint32_t grid_size_x ;
214*0b57cec5SDimitry Andric   //     uint32_t grid_size_y ;
215*0b57cec5SDimitry Andric   //     uint32_t grid_size_z;
216*0b57cec5SDimitry Andric   //
217*0b57cec5SDimitry Andric   //     uint32_t private_segment_size;
218*0b57cec5SDimitry Andric   //     uint32_t group_segment_size;
219*0b57cec5SDimitry Andric   //     uint64_t kernel_object;
220*0b57cec5SDimitry Andric   //
221*0b57cec5SDimitry Andric   // #ifdef HSA_LARGE_MODEL
222*0b57cec5SDimitry Andric   //     void *kernarg_address;
223*0b57cec5SDimitry Andric   // #elif defined HSA_LITTLE_ENDIAN
224*0b57cec5SDimitry Andric   //     void *kernarg_address;
225*0b57cec5SDimitry Andric   //     uint32_t reserved1;
226*0b57cec5SDimitry Andric   // #else
227*0b57cec5SDimitry Andric   //     uint32_t reserved1;
228*0b57cec5SDimitry Andric   //     void *kernarg_address;
229*0b57cec5SDimitry Andric   // #endif
230*0b57cec5SDimitry Andric   //     uint64_t reserved2;
231*0b57cec5SDimitry Andric   //     hsa_signal_t completion_signal; // uint64_t wrapper
232*0b57cec5SDimitry Andric   //   } hsa_kernel_dispatch_packet_t
233*0b57cec5SDimitry Andric   //
234*0b57cec5SDimitry Andric   Function *DispatchPtrFn
235*0b57cec5SDimitry Andric     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
236*0b57cec5SDimitry Andric 
237*0b57cec5SDimitry Andric   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
238*0b57cec5SDimitry Andric   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
239*0b57cec5SDimitry Andric   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
240*0b57cec5SDimitry Andric 
241*0b57cec5SDimitry Andric   // Size of the dispatch packet struct.
242*0b57cec5SDimitry Andric   DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
243*0b57cec5SDimitry Andric 
244*0b57cec5SDimitry Andric   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
245*0b57cec5SDimitry Andric   Value *CastDispatchPtr = Builder.CreateBitCast(
246*0b57cec5SDimitry Andric     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
247*0b57cec5SDimitry Andric 
248*0b57cec5SDimitry Andric   // We could do a single 64-bit load here, but it's likely that the basic
249*0b57cec5SDimitry Andric   // 32-bit and extract sequence is already present, and it is probably easier
250*0b57cec5SDimitry Andric   // to CSE this. The loads should be mergable later anyway.
251*0b57cec5SDimitry Andric   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
252*0b57cec5SDimitry Andric   LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
253*0b57cec5SDimitry Andric 
254*0b57cec5SDimitry Andric   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
255*0b57cec5SDimitry Andric   LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
256*0b57cec5SDimitry Andric 
257*0b57cec5SDimitry Andric   MDNode *MD = MDNode::get(Mod->getContext(), None);
258*0b57cec5SDimitry Andric   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
259*0b57cec5SDimitry Andric   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
260*0b57cec5SDimitry Andric   ST.makeLIDRangeMetadata(LoadZU);
261*0b57cec5SDimitry Andric 
262*0b57cec5SDimitry Andric   // Extract y component. Upper half of LoadZU should be zero already.
263*0b57cec5SDimitry Andric   Value *Y = Builder.CreateLShr(LoadXY, 16);
264*0b57cec5SDimitry Andric 
265*0b57cec5SDimitry Andric   return std::make_pair(Y, LoadZU);
266*0b57cec5SDimitry Andric }
267*0b57cec5SDimitry Andric 
268*0b57cec5SDimitry Andric Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
269*0b57cec5SDimitry Andric   const AMDGPUSubtarget &ST =
270*0b57cec5SDimitry Andric       AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
271*0b57cec5SDimitry Andric   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
272*0b57cec5SDimitry Andric 
273*0b57cec5SDimitry Andric   switch (N) {
274*0b57cec5SDimitry Andric   case 0:
275*0b57cec5SDimitry Andric     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
276*0b57cec5SDimitry Andric       : Intrinsic::r600_read_tidig_x;
277*0b57cec5SDimitry Andric     break;
278*0b57cec5SDimitry Andric   case 1:
279*0b57cec5SDimitry Andric     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
280*0b57cec5SDimitry Andric       : Intrinsic::r600_read_tidig_y;
281*0b57cec5SDimitry Andric     break;
282*0b57cec5SDimitry Andric 
283*0b57cec5SDimitry Andric   case 2:
284*0b57cec5SDimitry Andric     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
285*0b57cec5SDimitry Andric       : Intrinsic::r600_read_tidig_z;
286*0b57cec5SDimitry Andric     break;
287*0b57cec5SDimitry Andric   default:
288*0b57cec5SDimitry Andric     llvm_unreachable("invalid dimension");
289*0b57cec5SDimitry Andric   }
290*0b57cec5SDimitry Andric 
291*0b57cec5SDimitry Andric   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
292*0b57cec5SDimitry Andric   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
293*0b57cec5SDimitry Andric   ST.makeLIDRangeMetadata(CI);
294*0b57cec5SDimitry Andric 
295*0b57cec5SDimitry Andric   return CI;
296*0b57cec5SDimitry Andric }
297*0b57cec5SDimitry Andric 
298*0b57cec5SDimitry Andric static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
299*0b57cec5SDimitry Andric   return VectorType::get(ArrayTy->getElementType(),
300*0b57cec5SDimitry Andric                          ArrayTy->getNumElements());
301*0b57cec5SDimitry Andric }
302*0b57cec5SDimitry Andric 
303*0b57cec5SDimitry Andric static Value *
304*0b57cec5SDimitry Andric calculateVectorIndex(Value *Ptr,
305*0b57cec5SDimitry Andric                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
306*0b57cec5SDimitry Andric   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
307*0b57cec5SDimitry Andric 
308*0b57cec5SDimitry Andric   auto I = GEPIdx.find(GEP);
309*0b57cec5SDimitry Andric   return I == GEPIdx.end() ? nullptr : I->second;
310*0b57cec5SDimitry Andric }
311*0b57cec5SDimitry Andric 
312*0b57cec5SDimitry Andric static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
313*0b57cec5SDimitry Andric   // FIXME we only support simple cases
314*0b57cec5SDimitry Andric   if (GEP->getNumOperands() != 3)
315*0b57cec5SDimitry Andric     return nullptr;
316*0b57cec5SDimitry Andric 
317*0b57cec5SDimitry Andric   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
318*0b57cec5SDimitry Andric   if (!I0 || !I0->isZero())
319*0b57cec5SDimitry Andric     return nullptr;
320*0b57cec5SDimitry Andric 
321*0b57cec5SDimitry Andric   return GEP->getOperand(2);
322*0b57cec5SDimitry Andric }
323*0b57cec5SDimitry Andric 
324*0b57cec5SDimitry Andric // Not an instruction handled below to turn into a vector.
325*0b57cec5SDimitry Andric //
326*0b57cec5SDimitry Andric // TODO: Check isTriviallyVectorizable for calls and handle other
327*0b57cec5SDimitry Andric // instructions.
328*0b57cec5SDimitry Andric static bool canVectorizeInst(Instruction *Inst, User *User) {
329*0b57cec5SDimitry Andric   switch (Inst->getOpcode()) {
330*0b57cec5SDimitry Andric   case Instruction::Load: {
331*0b57cec5SDimitry Andric     // Currently only handle the case where the Pointer Operand is a GEP.
332*0b57cec5SDimitry Andric     // Also we could not vectorize volatile or atomic loads.
333*0b57cec5SDimitry Andric     LoadInst *LI = cast<LoadInst>(Inst);
334*0b57cec5SDimitry Andric     if (isa<AllocaInst>(User) &&
335*0b57cec5SDimitry Andric         LI->getPointerOperandType() == User->getType() &&
336*0b57cec5SDimitry Andric         isa<VectorType>(LI->getType()))
337*0b57cec5SDimitry Andric       return true;
338*0b57cec5SDimitry Andric     return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
339*0b57cec5SDimitry Andric   }
340*0b57cec5SDimitry Andric   case Instruction::BitCast:
341*0b57cec5SDimitry Andric     return true;
342*0b57cec5SDimitry Andric   case Instruction::Store: {
343*0b57cec5SDimitry Andric     // Must be the stored pointer operand, not a stored value, plus
344*0b57cec5SDimitry Andric     // since it should be canonical form, the User should be a GEP.
345*0b57cec5SDimitry Andric     // Also we could not vectorize volatile or atomic stores.
346*0b57cec5SDimitry Andric     StoreInst *SI = cast<StoreInst>(Inst);
347*0b57cec5SDimitry Andric     if (isa<AllocaInst>(User) &&
348*0b57cec5SDimitry Andric         SI->getPointerOperandType() == User->getType() &&
349*0b57cec5SDimitry Andric         isa<VectorType>(SI->getValueOperand()->getType()))
350*0b57cec5SDimitry Andric       return true;
351*0b57cec5SDimitry Andric     return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
352*0b57cec5SDimitry Andric   }
353*0b57cec5SDimitry Andric   default:
354*0b57cec5SDimitry Andric     return false;
355*0b57cec5SDimitry Andric   }
356*0b57cec5SDimitry Andric }
357*0b57cec5SDimitry Andric 
358*0b57cec5SDimitry Andric static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
359*0b57cec5SDimitry Andric 
360*0b57cec5SDimitry Andric   if (DisablePromoteAllocaToVector) {
361*0b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
362*0b57cec5SDimitry Andric     return false;
363*0b57cec5SDimitry Andric   }
364*0b57cec5SDimitry Andric 
365*0b57cec5SDimitry Andric   Type *AT = Alloca->getAllocatedType();
366*0b57cec5SDimitry Andric   SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
367*0b57cec5SDimitry Andric 
368*0b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
369*0b57cec5SDimitry Andric 
370*0b57cec5SDimitry Andric   // FIXME: There is no reason why we can't support larger arrays, we
371*0b57cec5SDimitry Andric   // are just being conservative for now.
372*0b57cec5SDimitry Andric   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
373*0b57cec5SDimitry Andric   // could also be promoted but we don't currently handle this case
374*0b57cec5SDimitry Andric   if (!AllocaTy ||
375*0b57cec5SDimitry Andric       AllocaTy->getNumElements() > 16 ||
376*0b57cec5SDimitry Andric       AllocaTy->getNumElements() < 2 ||
377*0b57cec5SDimitry Andric       !VectorType::isValidElementType(AllocaTy->getElementType())) {
378*0b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
379*0b57cec5SDimitry Andric     return false;
380*0b57cec5SDimitry Andric   }
381*0b57cec5SDimitry Andric 
382*0b57cec5SDimitry Andric   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
383*0b57cec5SDimitry Andric   std::vector<Value*> WorkList;
384*0b57cec5SDimitry Andric   for (User *AllocaUser : Alloca->users()) {
385*0b57cec5SDimitry Andric     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
386*0b57cec5SDimitry Andric     if (!GEP) {
387*0b57cec5SDimitry Andric       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
388*0b57cec5SDimitry Andric         return false;
389*0b57cec5SDimitry Andric 
390*0b57cec5SDimitry Andric       WorkList.push_back(AllocaUser);
391*0b57cec5SDimitry Andric       continue;
392*0b57cec5SDimitry Andric     }
393*0b57cec5SDimitry Andric 
394*0b57cec5SDimitry Andric     Value *Index = GEPToVectorIndex(GEP);
395*0b57cec5SDimitry Andric 
396*0b57cec5SDimitry Andric     // If we can't compute a vector index from this GEP, then we can't
397*0b57cec5SDimitry Andric     // promote this alloca to vector.
398*0b57cec5SDimitry Andric     if (!Index) {
399*0b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
400*0b57cec5SDimitry Andric                         << '\n');
401*0b57cec5SDimitry Andric       return false;
402*0b57cec5SDimitry Andric     }
403*0b57cec5SDimitry Andric 
404*0b57cec5SDimitry Andric     GEPVectorIdx[GEP] = Index;
405*0b57cec5SDimitry Andric     for (User *GEPUser : AllocaUser->users()) {
406*0b57cec5SDimitry Andric       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
407*0b57cec5SDimitry Andric         return false;
408*0b57cec5SDimitry Andric 
409*0b57cec5SDimitry Andric       WorkList.push_back(GEPUser);
410*0b57cec5SDimitry Andric     }
411*0b57cec5SDimitry Andric   }
412*0b57cec5SDimitry Andric 
413*0b57cec5SDimitry Andric   VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
414*0b57cec5SDimitry Andric   if (!VectorTy)
415*0b57cec5SDimitry Andric     VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
416*0b57cec5SDimitry Andric 
417*0b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
418*0b57cec5SDimitry Andric                     << *VectorTy << '\n');
419*0b57cec5SDimitry Andric 
420*0b57cec5SDimitry Andric   for (Value *V : WorkList) {
421*0b57cec5SDimitry Andric     Instruction *Inst = cast<Instruction>(V);
422*0b57cec5SDimitry Andric     IRBuilder<> Builder(Inst);
423*0b57cec5SDimitry Andric     switch (Inst->getOpcode()) {
424*0b57cec5SDimitry Andric     case Instruction::Load: {
425*0b57cec5SDimitry Andric       if (Inst->getType() == AT)
426*0b57cec5SDimitry Andric         break;
427*0b57cec5SDimitry Andric 
428*0b57cec5SDimitry Andric       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
429*0b57cec5SDimitry Andric       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
430*0b57cec5SDimitry Andric       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
431*0b57cec5SDimitry Andric 
432*0b57cec5SDimitry Andric       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
433*0b57cec5SDimitry Andric       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
434*0b57cec5SDimitry Andric       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
435*0b57cec5SDimitry Andric       Inst->replaceAllUsesWith(ExtractElement);
436*0b57cec5SDimitry Andric       Inst->eraseFromParent();
437*0b57cec5SDimitry Andric       break;
438*0b57cec5SDimitry Andric     }
439*0b57cec5SDimitry Andric     case Instruction::Store: {
440*0b57cec5SDimitry Andric       StoreInst *SI = cast<StoreInst>(Inst);
441*0b57cec5SDimitry Andric       if (SI->getValueOperand()->getType() == AT)
442*0b57cec5SDimitry Andric         break;
443*0b57cec5SDimitry Andric 
444*0b57cec5SDimitry Andric       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
445*0b57cec5SDimitry Andric       Value *Ptr = SI->getPointerOperand();
446*0b57cec5SDimitry Andric       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
447*0b57cec5SDimitry Andric       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
448*0b57cec5SDimitry Andric       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
449*0b57cec5SDimitry Andric       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
450*0b57cec5SDimitry Andric                                                        SI->getValueOperand(),
451*0b57cec5SDimitry Andric                                                        Index);
452*0b57cec5SDimitry Andric       Builder.CreateStore(NewVecValue, BitCast);
453*0b57cec5SDimitry Andric       Inst->eraseFromParent();
454*0b57cec5SDimitry Andric       break;
455*0b57cec5SDimitry Andric     }
456*0b57cec5SDimitry Andric     case Instruction::BitCast:
457*0b57cec5SDimitry Andric     case Instruction::AddrSpaceCast:
458*0b57cec5SDimitry Andric       break;
459*0b57cec5SDimitry Andric 
460*0b57cec5SDimitry Andric     default:
461*0b57cec5SDimitry Andric       llvm_unreachable("Inconsistency in instructions promotable to vector");
462*0b57cec5SDimitry Andric     }
463*0b57cec5SDimitry Andric   }
464*0b57cec5SDimitry Andric   return true;
465*0b57cec5SDimitry Andric }
466*0b57cec5SDimitry Andric 
467*0b57cec5SDimitry Andric static bool isCallPromotable(CallInst *CI) {
468*0b57cec5SDimitry Andric   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
469*0b57cec5SDimitry Andric   if (!II)
470*0b57cec5SDimitry Andric     return false;
471*0b57cec5SDimitry Andric 
472*0b57cec5SDimitry Andric   switch (II->getIntrinsicID()) {
473*0b57cec5SDimitry Andric   case Intrinsic::memcpy:
474*0b57cec5SDimitry Andric   case Intrinsic::memmove:
475*0b57cec5SDimitry Andric   case Intrinsic::memset:
476*0b57cec5SDimitry Andric   case Intrinsic::lifetime_start:
477*0b57cec5SDimitry Andric   case Intrinsic::lifetime_end:
478*0b57cec5SDimitry Andric   case Intrinsic::invariant_start:
479*0b57cec5SDimitry Andric   case Intrinsic::invariant_end:
480*0b57cec5SDimitry Andric   case Intrinsic::launder_invariant_group:
481*0b57cec5SDimitry Andric   case Intrinsic::strip_invariant_group:
482*0b57cec5SDimitry Andric   case Intrinsic::objectsize:
483*0b57cec5SDimitry Andric     return true;
484*0b57cec5SDimitry Andric   default:
485*0b57cec5SDimitry Andric     return false;
486*0b57cec5SDimitry Andric   }
487*0b57cec5SDimitry Andric }
488*0b57cec5SDimitry Andric 
489*0b57cec5SDimitry Andric bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
490*0b57cec5SDimitry Andric                                                           Value *Val,
491*0b57cec5SDimitry Andric                                                           Instruction *Inst,
492*0b57cec5SDimitry Andric                                                           int OpIdx0,
493*0b57cec5SDimitry Andric                                                           int OpIdx1) const {
494*0b57cec5SDimitry Andric   // Figure out which operand is the one we might not be promoting.
495*0b57cec5SDimitry Andric   Value *OtherOp = Inst->getOperand(OpIdx0);
496*0b57cec5SDimitry Andric   if (Val == OtherOp)
497*0b57cec5SDimitry Andric     OtherOp = Inst->getOperand(OpIdx1);
498*0b57cec5SDimitry Andric 
499*0b57cec5SDimitry Andric   if (isa<ConstantPointerNull>(OtherOp))
500*0b57cec5SDimitry Andric     return true;
501*0b57cec5SDimitry Andric 
502*0b57cec5SDimitry Andric   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
503*0b57cec5SDimitry Andric   if (!isa<AllocaInst>(OtherObj))
504*0b57cec5SDimitry Andric     return false;
505*0b57cec5SDimitry Andric 
506*0b57cec5SDimitry Andric   // TODO: We should be able to replace undefs with the right pointer type.
507*0b57cec5SDimitry Andric 
508*0b57cec5SDimitry Andric   // TODO: If we know the other base object is another promotable
509*0b57cec5SDimitry Andric   // alloca, not necessarily this alloca, we can do this. The
510*0b57cec5SDimitry Andric   // important part is both must have the same address space at
511*0b57cec5SDimitry Andric   // the end.
512*0b57cec5SDimitry Andric   if (OtherObj != BaseAlloca) {
513*0b57cec5SDimitry Andric     LLVM_DEBUG(
514*0b57cec5SDimitry Andric         dbgs() << "Found a binary instruction with another alloca object\n");
515*0b57cec5SDimitry Andric     return false;
516*0b57cec5SDimitry Andric   }
517*0b57cec5SDimitry Andric 
518*0b57cec5SDimitry Andric   return true;
519*0b57cec5SDimitry Andric }
520*0b57cec5SDimitry Andric 
// Recursively collect the transitive users of \p Val that must be rewritten
// when \p BaseAlloca is migrated to the local (LDS) address space. Every user
// that needs fixing up is appended to \p WorkList; returns false as soon as a
// use is found that cannot be handled, in which case promotion of the alloca
// must be abandoned.
bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
  Value *BaseAlloca,
  Value *Val,
  std::vector<Value*> &WorkList) const {

  for (User *User : Val->users()) {
    // A value may be reachable through several paths; don't process a user
    // that is already queued.
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      // Only calls vetted by isCallPromotable (e.g. rewritable intrinsics)
      // are acceptable; any other call could let the pointer escape.
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    // ptrtoint escapes the pointer as an integer; we can no longer track it.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      // A non-volatile load needs no rewriting of its own (only its pointer
      // operand changes), so it is not added to the worklist.
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      // Both compare operands must derive from the same promotable alloca
      // (or be a null pointer constant that we can retype).
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    // Non-pointer-typed users (e.g. the icmp handled above) do not propagate
    // the pointer value any further, so stop descending here.
    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        // Both incoming values must trace back to the same promotable alloca.
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    // This pointer-typed user is itself rewritable: queue it and recurse
    // into its own users.
    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
623*0b57cec5SDimitry Andric 
// Determine whether \p F has LDS headroom for alloca promotion. On success,
// sets LocalMemLimit to the usable budget and CurrentLocalMemUsage to the LDS
// already consumed by globals referenced from F; handleAlloca() later checks
// candidate allocas against these. Returns false when LDS promotion should be
// skipped for this function.
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  const DataLayout &DL = Mod->getDataLayout();

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    // Count the global only if some instruction inside F references it.
    for (const User *U : GV.users()) {
      const Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      if (Use->getParent()->getParent() == &F) {
        // Fall back to the ABI alignment when the global carries none.
        unsigned Align = GV.getAlignment();
        if (Align == 0)
          Align = DL.getABITypeAlignment(GV.getValueType());

        // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
        CurrentLocalMemUsage += AllocSize;
        break; // Each global is counted at most once.
      }
    }
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors..
  // NOTE(review): 7 appears to be the default occupancy target used when the
  // function carries no waves-per-EU hint — confirm against subtarget docs.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);


  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
715*0b57cec5SDimitry Andric 
// FIXME: Should try to pick the most likely to be profitable allocas first.
//
// Attempt to eliminate alloca \p I: first by turning it into a vector, then —
// for kernel calling conventions with LDS headroom (\p SufficientLDS) — by
// replacing it with a per-workitem slice of a new LDS global. Returns true if
// the alloca was promoted (and erased, in the LDS case).
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
  // Size the LDS array by the maximum flat workgroup size so every workitem
  // gets its own element.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  const DataLayout &DL = Mod->getDataLayout();

  // Use the ABI alignment when the alloca doesn't specify one.
  unsigned Align = I.getAlignment();
  if (Align == 0)
    Align = DL.getABITypeAlignment(I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // could could end up using more than the maximum due to alignment padding.

  // Check the budget established by hasSufficientLocalMem().
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  // Collect every transitive use that must be retyped; bail out if any use
  // cannot be handled.
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  // Create one LDS array element per workitem, named "<func>.<alloca>".
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlignment());

  Value *TCntY, *TCntZ;

  // Compute the linearized workitem id:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
  // which indexes this workitem's slot in the LDS array.
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  // Replace the alloca with a pointer to this workitem's LDS element. The
  // alloca's type is mutated in place so existing users stay well-typed.
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  // Fix up every collected use for the new LOCAL_ADDRESS pointer type.
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        // Retype null-pointer constants on either side of the compare.
        Value *Src0 = CI->getOperand(0);
        Type *EltTy = Src0->getType()->getPointerElementType();
        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    // Calls collected by collectUsesWithPtrTypes passed isCallPromotable, so
    // only the intrinsics handled below are expected here.
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Recreate the mem* intrinsics so their signatures pick up the new
      // pointer address space.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
                           MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
                           MemCpy->getLength(), MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
                            MemMove->getRawSource(), MemMove->getSourceAlignment(),
                            MemMove->getLength(), MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      // Re-declare objectsize for the LOCAL_ADDRESS pointer type and replace
      // the old call.
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
      );

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
  return true;
}
938*0b57cec5SDimitry Andric 
939*0b57cec5SDimitry Andric FunctionPass *llvm::createAMDGPUPromoteAlloca() {
940*0b57cec5SDimitry Andric   return new AMDGPUPromoteAlloca();
941*0b57cec5SDimitry Andric }
942