xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass lowers the local data store, LDS, uses in kernel and non-kernel
10 // functions in module to use dynamically allocated global memory.
11 // Packed LDS Layout is emulated in the global memory.
12 // The lowered memory instructions from LDS to global memory are then
13 // instrumented for address sanitizer, to catch addressing errors.
// This pass only works when address sanitizer has been enabled and has
15 // instrumented the IR. It identifies that IR has been instrumented using
16 // "nosanitize_address" module flag.
17 //
18 // Replacement of Kernel LDS accesses:
19 //    For a kernel, LDS access can be static or dynamic which are direct
20 //    (accessed within kernel) and indirect (accessed through non-kernels).
21 //    All these LDS accesses corresponding to kernel will be packed together,
22 //    where all static LDS accesses will be allocated first and then dynamic
23 //    LDS follows. The total size with alignment is calculated. A new LDS global
24 //    will be created for the kernel called "SW LDS" and it will have the
25 //    attribute "amdgpu-lds-size" attached with value of the size calculated.
26 //    All the LDS accesses in the module will be replaced by GEP with offset
27 //    into the "Sw LDS".
28 //    A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29 //    the dynamic LDS. This will be marked used by kernel and will have
//    MD_absolute_symbol metadata set to the total static LDS size, since dynamic
31 //    LDS allocation starts after all static LDS allocation.
32 //
33 //    A device global memory equal to the total LDS size will be allocated.
34 //    At the prologue of the kernel, a single work-item from the
35 //    work-group, does a "malloc" and stores the pointer of the
36 //    allocation in "SW LDS".
37 //
38 //    To store the offsets corresponding to all LDS accesses, another global
39 //    variable is created which will be called "SW LDS metadata" in this pass.
40 //    - SW LDS Global:
41 //        It is LDS global of ptr type with name
42 //        "llvm.amdgcn.sw.lds.<kernel-name>".
43 //    - Metadata Global:
44 //        It is of struct type, with n members. n equals the number of LDS
45 //        globals accessed by the kernel(direct and indirect). Each member of
46 //        struct is another struct of type {i32, i32, i32}. First member
47 //        corresponds to offset, second member corresponds to size of LDS global
48 //        being replaced and third represents the total aligned size. It will
49 //        have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
//        an initializer with static LDS related offsets and sizes initialized.
//        But for dynamic LDS related entries, offsets will be initialized to
52 //        previous static LDS allocation end offset. Sizes for them will be zero
53 //        initially. These dynamic LDS offset and size values will be updated
54 //        within the kernel, since kernel can read the dynamic LDS size
55 //        allocation done at runtime with query to "hidden_dynamic_lds_size"
56 //        hidden kernel argument.
57 //
58 //    At the epilogue of kernel, allocated memory would be made free by the same
59 //    single work-item.
60 //
61 // Replacement of non-kernel LDS accesses:
62 //    Multiple kernels can access the same non-kernel function.
63 //    All the kernels accessing LDS through non-kernels are sorted and
64 //    assigned a kernel-id. All the LDS globals accessed by non-kernels
65 //    are sorted. This information is used to build two tables:
66 //    - Base table:
67 //        Base table will have single row, with elements of the row
68 //        placed as per kernel ID. Each element in the row corresponds
69 //        to ptr of "SW LDS" variable created for that kernel.
70 //    - Offset table:
71 //        Offset table will have multiple rows and columns.
72 //        Rows are assumed to be from 0 to (n-1). n is total number
73 //        of kernels accessing the LDS through non-kernels.
74 //        Each row will have m elements. m is the total number of
75 //        unique LDS globals accessed by all non-kernels.
76 //        Each element in the row correspond to the ptr of
77 //        the replacement of LDS global done by that particular kernel.
78 //    A LDS variable in non-kernel will be replaced based on the information
79 //    from base and offset tables. Based on kernel-id query, ptr of "SW
80 //    LDS" for that corresponding kernel is obtained from base table.
81 //    The Offset into the base "SW LDS" is obtained from
82 //    corresponding element in offset table. With this information, replacement
83 //    value is obtained.
84 //===----------------------------------------------------------------------===//
85 
86 #include "AMDGPU.h"
87 #include "AMDGPUAsanInstrumentation.h"
88 #include "AMDGPUMemoryUtils.h"
89 #include "AMDGPUTargetMachine.h"
90 #include "llvm/ADT/DenseMap.h"
91 #include "llvm/ADT/DenseSet.h"
92 #include "llvm/ADT/SetVector.h"
93 #include "llvm/ADT/StringExtras.h"
94 #include "llvm/ADT/StringRef.h"
95 #include "llvm/Analysis/CallGraph.h"
96 #include "llvm/Analysis/DomTreeUpdater.h"
97 #include "llvm/CodeGen/TargetPassConfig.h"
98 #include "llvm/IR/Constants.h"
99 #include "llvm/IR/DIBuilder.h"
100 #include "llvm/IR/DebugInfo.h"
101 #include "llvm/IR/DebugInfoMetadata.h"
102 #include "llvm/IR/IRBuilder.h"
103 #include "llvm/IR/Instructions.h"
104 #include "llvm/IR/IntrinsicsAMDGPU.h"
105 #include "llvm/IR/MDBuilder.h"
106 #include "llvm/IR/ReplaceConstant.h"
107 #include "llvm/Pass.h"
108 #include "llvm/Support/raw_ostream.h"
109 #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
110 #include "llvm/Transforms/Utils/ModuleUtils.h"
111 
112 #include <algorithm>
113 
114 #define DEBUG_TYPE "amdgpu-sw-lower-lds"
115 #define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116 
117 using namespace llvm;
118 using namespace AMDGPU;
119 
120 namespace {
121 
// Command-line switch controlling whether the LDS accesses that this pass
// lowers to global memory are subsequently instrumented for address
// sanitizer. On by default; hidden from the standard option listing.
cl::opt<bool>
    AsanInstrumentLDS("amdgpu-asan-instrument-lds",
                      cl::desc("Run asan instrumentation on LDS instructions "
                               "lowered to global memory"),
                      cl::init(true), cl::Hidden);
127 
// Callback used to obtain the DominatorTree for a function when its CFG is
// updated during lowering.
using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;

// Buckets the LDS globals of a kernel by whether their size is known at
// compile time (static) or only at dispatch time (dynamic).
struct LDSAccessTypeInfo {
  SetVector<GlobalVariable *> StaticLDSGlobals;
  SetVector<GlobalVariable *> DynamicLDSGlobals;
};
134 
// Struct to hold all the Metadata required for a kernel
// to replace a LDS global uses with corresponding offset
// in to device global memory.
struct KernelLDSParameters {
  // "llvm.amdgcn.sw.lds.<kernel>": LDS-resident pointer holding the base of
  // the kernel's device-global-memory allocation.
  GlobalVariable *SwLDS = nullptr;
  // "llvm.amdgcn.<kernel>.dynlds": present only when the kernel accesses
  // dynamic LDS (directly or indirectly).
  GlobalVariable *SwDynLDS = nullptr;
  // "llvm.amdgcn.sw.lds.<kernel>.md": struct global with one
  // {offset, size, aligned size} entry per LDS global the kernel touches.
  GlobalVariable *SwLDSMetadata = nullptr;
  // LDS globals accessed from the kernel body itself.
  LDSAccessTypeInfo DirectAccess;
  // LDS globals reached through called non-kernel functions.
  LDSAccessTypeInfo IndirectAccess;
  // LDS global -> {0, member-index, 0} GEP indices into SwLDSMetadata.
  DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
      LDSToReplacementIndicesMap;
  // Total size in bytes of the packed global-memory layout, including asan
  // redzones.
  uint32_t MallocSize = 0;
  // Static LDS footprint of the kernel, published via "amdgpu-lds-size".
  uint32_t LDSSize = 0;
  // {offset, size} of the asan redzone following each static LDS entry.
  SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
};
150 
// Struct to store information for creation of offset table
// for all the non-kernel LDS accesses.
struct NonKernelLDSParameters {
  // One ptr per kernel-id: that kernel's "SW LDS" global.
  GlobalVariable *LDSBaseTable = nullptr;
  // Row per kernel-id, column per LDS global: the replacement for that LDS
  // global as laid out by that kernel.
  GlobalVariable *LDSOffsetTable = nullptr;
  // Kernels sorted by name; the index in this vector is the kernel-id.
  SetVector<Function *> OrderedKernels;
  // Name-sorted set of all LDS globals accessed by non-kernels.
  // NOTE(review): identifier is missing a 'd' ("Ordered..."); kept as-is
  // because other code references this member name.
  SetVector<GlobalVariable *> OrdereLDSGlobals;
};
159 
// Parameters and work list for the asan instrumentation of the lowered
// accesses.
struct AsanInstrumentInfo {
  // Asan scale parameter; used when computing redzone sizes.
  int Scale = 0;
  // Asan offset parameter. (Presumably the shadow-memory offset; its use is
  // outside this view — confirm against initAsanInfo.)
  uint32_t Offset = 0;
  // Lowered memory instructions queued for instrumentation.
  SetVector<Instruction *> Instructions;
};
165 
// Module-wide record of which functions access LDS and how.
struct FunctionsAndLDSAccess {
  // Per-kernel lowering state.
  DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
  // Kernels whose LDS is (also) reached through non-kernel functions.
  SetVector<Function *> KernelsWithIndirectLDSAccess;
  // Non-kernel functions that take an addrspace(3) pointer argument.
  SetVector<Function *> NonKernelsWithLDSArgument;
  // Every LDS global referenced from any non-kernel function.
  SetVector<GlobalVariable *> AllNonKernelLDSAccess;
  // Non-kernel function -> the LDS globals it uses.
  FunctionVariableMap NonKernelToLDSAccessMap;
};
173 
// Implements the lowering described in the file header: builds the
// per-kernel SW LDS and metadata globals, rewrites kernel and non-kernel LDS
// accesses to go through malloc'ed global memory, and hands the lowered
// accesses to asan instrumentation.
class AMDGPUSwLowerLDS {
public:
  AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
                   DomTreeCallback Callback)
      : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
  // Entry point; returns true if the module was modified.
  bool run();
  // Discovery: which non-kernels use which LDS globals, and which take
  // LDS pointers as arguments.
  void getUsesOfLDSByNonKernels();
  void getNonKernelsWithLDSArguments(const CallGraph &CG);
  // Deterministic (name-sorted) orderings used to assign kernel ids and
  // table columns.
  SetVector<Function *>
  getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
  SetVector<GlobalVariable *>
  getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
  // Per-kernel globals: SW LDS pointer, dynamic-LDS marker, and the
  // {offset, size, aligned size} metadata struct.
  void buildSwLDSGlobal(Function *Func);
  void buildSwDynLDSGlobal(Function *Func);
  void populateSwMetadataGlobal(Function *Func);
  void populateSwLDSAttributeAndMetadata(Function *Func);
  void populateLDSToReplacementIndicesMap(Function *Func);
  // Rewriting of kernel-side LDS accesses into global-memory accesses.
  void getLDSMemoryInstructions(Function *Func,
                                SetVector<Instruction *> &LDSInstructions);
  void replaceKernelLDSAccesses(Function *Func);
  Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
  void translateLDSMemoryOperationsToGlobalMemory(
      Function *Func, Value *LoadMallocPtr,
      SetVector<Instruction *> &LDSInstructions);
  void poisonRedzones(Function *Func, Value *MallocPtr);
  void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
  // Base/offset tables used to resolve LDS accesses made from non-kernels.
  void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
  void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
  Constant *
  getAddressesOfVariablesInKernel(Function *Func,
                                  SetVector<GlobalVariable *> &Variables);
  void lowerNonKernelLDSAccesses(Function *Func,
                                 SetVector<GlobalVariable *> &LDSGlobals,
                                 NonKernelLDSParameters &NKLDSParams);
  // Runtime patch-up of metadata entries for dynamic LDS.
  void
  updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
                                Value *HiddenDynLDSSize,
                                SetVector<GlobalVariable *> &DynamicLDSGlobals);
  void initAsanInfo();

private:
  Module &M;
  const AMDGPUTargetMachine &AMDGPUTM;
  IRBuilder<> IRB;
  DomTreeCallback DTCallback;
  FunctionsAndLDSAccess FuncLDSAccessInfo;
  AsanInstrumentInfo AsanInfo;
};
222 
sortByName(std::vector<T> && V)223 template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224   // Sort the vector of globals or Functions based on their name.
225   // Returns a SetVector of globals/Functions.
226   sort(V, [](const auto *L, const auto *R) {
227     return L->getName() < R->getName();
228   });
229   return {SetVector<T>(llvm::from_range, V)};
230 }
231 
getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable * > & Variables)232 SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233     SetVector<GlobalVariable *> &Variables) {
234   // Sort all the non-kernel LDS accesses based on their name.
235   return sortByName(
236       std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237 }
238 
getOrderedIndirectLDSAccessingKernels(SetVector<Function * > & Kernels)239 SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240     SetVector<Function *> &Kernels) {
241   // Sort the non-kernels accessing LDS based on their name.
242   // Also assign a kernel ID metadata based on the sorted order.
243   LLVMContext &Ctx = M.getContext();
244   if (Kernels.size() > UINT32_MAX) {
245     report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
246   }
247   SetVector<Function *> OrderedKernels =
248       sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
249   for (size_t i = 0; i < Kernels.size(); i++) {
250     Metadata *AttrMDArgs[1] = {
251         ConstantAsMetadata::get(IRB.getInt32(i)),
252     };
253     Function *Func = OrderedKernels[i];
254     Func->setMetadata("llvm.amdgcn.lds.kernel.id",
255                       MDNode::get(Ctx, AttrMDArgs));
256   }
257   return OrderedKernels;
258 }
259 
getNonKernelsWithLDSArguments(const CallGraph & CG)260 void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261   // Among the kernels accessing LDS, get list of
262   // Non-kernels to which a call is made and a ptr
263   // to addrspace(3) is passed as argument.
264   for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265     Function *Func = K.first;
266     const CallGraphNode *CGN = CG[Func];
267     if (!CGN)
268       continue;
269     for (auto &I : *CGN) {
270       CallGraphNode *CallerCGN = I.second;
271       Function *CalledFunc = CallerCGN->getFunction();
272       if (!CalledFunc || CalledFunc->isDeclaration())
273         continue;
274       if (AMDGPU::isKernelLDS(CalledFunc))
275         continue;
276       for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277            AI != E; ++AI) {
278         Type *ArgTy = (*AI).getType();
279         if (!ArgTy->isPointerTy())
280           continue;
281         if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
282           continue;
283         FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
284         // Also add the Calling function to KernelsWithIndirectLDSAccess list
285         // so that base table of LDS is generated.
286         FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
287       }
288     }
289   }
290 }
291 
getUsesOfLDSByNonKernels()292 void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293   for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
294     if (!AMDGPU::isLDSVariableToLower(*GV))
295       continue;
296 
297     for (User *V : GV->users()) {
298       if (auto *I = dyn_cast<Instruction>(V)) {
299         Function *F = I->getFunction();
300         if (!isKernelLDS(F) && !F->isDeclaration())
301           FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
302       }
303     }
304   }
305 }
306 
recordLDSAbsoluteAddress(Module & M,GlobalVariable * GV,uint32_t Address)307 static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308                                      uint32_t Address) {
309   // Write the specified address into metadata where it can be retrieved by
310   // the assembler. Format is a half open range, [Address Address+1)
311   LLVMContext &Ctx = M.getContext();
312   auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
313   MDBuilder MDB(Ctx);
314   MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
315                                          ConstantInt::get(IntTy, Address + 1));
316   GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
317 }
318 
addLDSSizeAttribute(Function * Func,uint32_t Offset,bool IsDynLDS)319 static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320                                 bool IsDynLDS) {
321   if (Offset != 0) {
322     std::string Buffer;
323     raw_string_ostream SS{Buffer};
324     SS << Offset;
325     if (IsDynLDS)
326       SS << "," << Offset;
327     Func->addFnAttr("amdgpu-lds-size", Buffer);
328   }
329 }
330 
// Keep SGV alive by emitting a no-op use of it at the top of Func's entry
// block: a call to llvm.donothing carrying the global in an "ExplicitUse"
// operand bundle. This prevents the otherwise-unreferenced global from being
// dropped before lowering completes.
static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
  BasicBlock *Entry = &Func->getEntryBlock();
  IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());

  Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
                                                     Intrinsic::donothing, {});

  // A zero-offset GEP yields a value that uses SGV itself.
  Value *UseInstance[1] = {
      Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};

  Builder.CreateCall(Decl, {},
                     {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
}
344 
buildSwLDSGlobal(Function * Func)345 void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346   // Create new LDS global required for each kernel to store
347   // device global memory pointer.
348   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349   // Create new global pointer variable
350   LDSParams.SwLDS = new GlobalVariable(
351       M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352       PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
353       nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
354   GlobalValue::SanitizerMetadata MD;
355   MD.NoAddress = true;
356   LDSParams.SwLDS->setSanitizerMetadata(MD);
357 }
358 
buildSwDynLDSGlobal(Function * Func)359 void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360   // Create new Dyn LDS global if kernel accesses dyn LDS.
361   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362   if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363       LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364     return;
365   // Create new global pointer variable
366   auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
367   LDSParams.SwDynLDS = new GlobalVariable(
368       M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369       "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
370       GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
371   markUsedByKernel(Func, LDSParams.SwDynLDS);
372   GlobalValue::SanitizerMetadata MD;
373   MD.NoAddress = true;
374   LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375 }
376 
populateSwLDSAttributeAndMetadata(Function * Func)377 void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379   bool IsDynLDSUsed = LDSParams.SwDynLDS;
380   uint32_t Offset = LDSParams.LDSSize;
381   recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
382   addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
383   if (LDSParams.SwDynLDS)
384     recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
385 }
386 
void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
  // Create new metadata global for every kernel and initialize the
  // start offsets and sizes corresponding to each LDS accesses.
  // The global "llvm.amdgcn.sw.lds.<kernel>.md" is a struct with one
  // {Offset, Size, AlignedSize} member (all i32) per LDS global the kernel
  // touches. LDSParams.MallocSize accumulates the packed layout size as
  // entries are emitted.
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  auto &Ctx = M.getContext();
  auto &DL = M.getDataLayout();
  std::vector<Type *> Items;
  Type *Int32Ty = IRB.getInt32Ty();
  std::vector<Constant *> Initializers;
  Align MaxAlignment(1);
  // The packed layout is aligned to the maximum alignment over all LDS
  // globals (direct/indirect, static/dynamic) used by this kernel.
  auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
    Align GVAlign = AMDGPU::getAlign(DL, GV);
    MaxAlignment = std::max(MaxAlignment, GVAlign);
  };

  for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
    UpdateMaxAlignment(GV);

  // {StartOffset, SizeInBytes, AlignedSizeInBytes}
  SmallString<128> MDItemStr;
  raw_svector_ostream MDItemOS(MDItemStr);
  MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";

  StructType *LDSItemTy =
      StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
  uint32_t &MallocSize = LDSParams.MallocSize;
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  int AsanScale = AsanInfo.Scale;
  // Appends one metadata member (and its initializer) per not-yet-seen LDS
  // global, advancing MallocSize past the global plus its asan redzone.
  auto buildInitializerForSwLDSMD =
      [&](SetVector<GlobalVariable *> &LDSGlobals) {
        for (auto &GV : LDSGlobals) {
          if (is_contained(UniqueLDSGlobals, GV))
            continue;
          UniqueLDSGlobals.insert(GV);

          Type *Ty = GV->getValueType();
          const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
          Items.push_back(LDSItemTy);
          Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
          Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
          // Get the redzone size corresponding to this global's size.
          const uint64_t RightRedzoneSize =
              AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
          // Update MallocSize with current size and redzone size.
          MallocSize += SizeInBytes;
          // Only static LDS redzones are recorded for later poisoning; a
          // dynamic LDS size is unknown at this point.
          if (!AMDGPU::isDynamicLDS(*GV))
            LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
                                                              RightRedzoneSize);
          MallocSize += RightRedzoneSize;
          // Align current size plus redzone.
          uint64_t AlignedSize =
              alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
          Constant *AlignedSizeInBytesConst =
              ConstantInt::get(Int32Ty, AlignedSize);
          // Align MallocSize
          MallocSize = alignTo(MallocSize, MaxAlignment);
          Constant *InitItem =
              ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
                                              AlignedSizeInBytesConst});
          Initializers.push_back(InitItem);
        }
      };
  // Entry 0 describes the SW LDS pointer itself; statics precede dynamics,
  // matching the index order of populateLDSToReplacementIndicesMap.
  SetVector<GlobalVariable *> SwLDSVector;
  SwLDSVector.insert(LDSParams.SwLDS);
  buildInitializerForSwLDSMD(SwLDSVector);
  buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);

  // Update the LDS size used by the kernel.
  // The kernel's static LDS footprint is just the (aligned) SW LDS pointer.
  Type *Ty = LDSParams.SwLDS->getValueType();
  const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
  uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
  LDSParams.LDSSize = AlignedSize;
  SmallString<128> MDTypeStr;
  raw_svector_ostream MDTypeOS(MDTypeStr);
  MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
  StructType *MetadataStructType =
      StructType::create(Ctx, Items, MDTypeOS.str());
  SmallString<128> MDStr;
  raw_svector_ostream MDOS(MDStr);
  MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
  LDSParams.SwLDSMetadata = new GlobalVariable(
      M, MetadataStructType, false, GlobalValue::InternalLinkage,
      PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
      GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
  Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
  LDSParams.SwLDSMetadata->setInitializer(data);
  assert(LDSParams.SwLDS);
  // Set the alignment to MaxAlignment for SwLDS.
  LDSParams.SwLDS->setAlignment(MaxAlignment);
  if (LDSParams.SwDynLDS)
    LDSParams.SwDynLDS->setAlignment(MaxAlignment);
  // The metadata global must not itself be asan-instrumented.
  GlobalValue::SanitizerMetadata MD;
  MD.NoAddress = true;
  LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
}
494 
populateLDSToReplacementIndicesMap(Function * Func)495 void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496   // Fill the corresponding LDS replacement indices for each LDS access
497   // related to this kernel.
498   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499   SetVector<GlobalVariable *> UniqueLDSGlobals;
500   auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501                              uint32_t &Idx) {
502     for (auto &GV : LDSGlobals) {
503       if (is_contained(UniqueLDSGlobals, GV))
504         continue;
505       UniqueLDSGlobals.insert(GV);
506       LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507       ++Idx;
508     }
509   };
510   uint32_t Idx = 0;
511   SetVector<GlobalVariable *> SwLDSVector;
512   SwLDSVector.insert(LDSParams.SwLDS);
513   PopulateIndices(SwLDSVector, Idx);
514   PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515   PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516   PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517   PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518 }
519 
replacesUsesOfGlobalInFunction(Function * Func,GlobalVariable * GV,Value * Replacement)520 static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521                                            Value *Replacement) {
522   // Replace all uses of LDS global in this Function with a Replacement.
523   auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524     auto *V = U.getUser();
525     if (auto *Inst = dyn_cast<Instruction>(V)) {
526       auto *Func1 = Inst->getParent()->getParent();
527       if (Func == Func1)
528         return true;
529     }
530     return false;
531   };
532   GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
533 }
534 
// Rewrite every LDS-global use inside this kernel to "SwLDS base + offset",
// where the offset is loaded from the kernel's metadata-global entry for
// that LDS variable (field 0 of the matching {offset, size, aligned size}
// member).
void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  GlobalVariable *SwLDS = LDSParams.SwLDS;
  assert(SwLDS);
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDSMetadata);
  StructType *SwLDSMetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  Type *Int32Ty = IRB.getInt32Ty();
  auto &IndirectAccess = LDSParams.IndirectAccess;
  auto &DirectAccess = LDSParams.DirectAccess;
  // Replace all uses of LDS global in this Function with a Replacement.
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
    for (auto &GV : LDSGlobals) {
      // Do not generate instructions if LDS access is in non-kernel
      // i.e indirect-access.
      if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
           IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
          (!DirectAccess.StaticLDSGlobals.contains(GV) &&
           !DirectAccess.DynamicLDSGlobals.contains(GV)))
        continue;
      if (is_contained(UniqueLDSGlobals, GV))
        continue;
      UniqueLDSGlobals.insert(GV);
      // Indices are {0, member, 0}, set up by
      // populateLDSToReplacementIndicesMap; they select the member's Offset
      // field in the metadata struct.
      auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
      assert(Indices.size() == 3);
      Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
                            ConstantInt::get(Int32Ty, Indices[1]),
                            ConstantInt::get(Int32Ty, Indices[2])};
      Constant *GEP = ConstantExpr::getGetElementPtr(
          SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
      Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
      Value *BasePlusOffset =
          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
      LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
                                    false));
      replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
    }
  };
  ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
  ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
  ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
  ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
}
580 
// Emit IR that patches the kernel's metadata entries for dynamic LDS globals
// at run time: offset := current malloc size, size := the runtime dynamic
// LDS size (read via HiddenDynLDSSize), aligned size := that size rounded up
// to the layout alignment. *CurrMallocSize is advanced (as IR) by each
// aligned size so subsequent allocation code sees the full total.
void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
    Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
    SetVector<GlobalVariable *> &DynamicLDSGlobals) {
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  Type *Int32Ty = IRB.getInt32Ty();

  GlobalVariable *SwLDS = LDSParams.SwLDS;
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDS && SwLDSMetadata);
  StructType *MetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  // Alignment of the packed layout, as recorded on the SW LDS global.
  unsigned MaxAlignment = SwLDS->getAlignment();
  Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
  Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);

  for (GlobalVariable *DynGV : DynamicLDSGlobals) {
    auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
    // Update the Offset metadata.
    Constant *Index0 = ConstantInt::get(Int32Ty, 0);
    Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);

    Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
    auto *GEPForOffset = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});

    IRB.CreateStore(*CurrMallocSize, GEPForOffset);
    // Update the size and Aligned Size metadata.
    Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
    auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                                             {Index0, Index1, Index2Size});

    Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
    IRB.CreateStore(CurrDynLDSSize, GEPForSize);
    Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
    auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});

    // AlignedSize = ((Size + Align - 1) / Align) * Align, computed in IR.
    Value *AlignedDynLDSSize =
        IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
    AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
    AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
    IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);

    // Update the Current Malloc Size
    *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
  }
}
628 
getOrCreateDebugLoc(const Instruction * InsertBefore,DISubprogram * SP)629 static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630                                     DISubprogram *SP) {
631   assert(InsertBefore);
632   if (InsertBefore->getDebugLoc())
633     return InsertBefore->getDebugLoc();
634   if (SP)
635     return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
636   return DebugLoc();
637 }
638 
getLDSMemoryInstructions(Function * Func,SetVector<Instruction * > & LDSInstructions)639 void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640     Function *Func, SetVector<Instruction *> &LDSInstructions) {
641   for (BasicBlock &BB : *Func) {
642     for (Instruction &Inst : BB) {
643       if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
644         if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645           LDSInstructions.insert(&Inst);
646       } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
647         if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648           LDSInstructions.insert(&Inst);
649       } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
650         if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651           LDSInstructions.insert(&Inst);
652       } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
653         if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654           LDSInstructions.insert(&Inst);
655       } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
656         if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657             ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658           LDSInstructions.insert(&Inst);
659       } else
660         continue;
661     }
662   }
663 }
664 
getTranslatedGlobalMemoryPtrOfLDS(Value * LoadMallocPtr,Value * LDSPtr)665 Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
666                                                            Value *LDSPtr) {
667   assert(LDSPtr && "Invalid LDS pointer operand");
668   Type *LDSPtrType = LDSPtr->getType();
669   LLVMContext &Ctx = M.getContext();
670   const DataLayout &DL = M.getDataLayout();
671   Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
672   if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
673     // Handle vector of pointers
674     ElementCount NumElements = VecPtrTy->getElementCount();
675     IntTy = VectorType::get(IntTy, NumElements);
676   }
677   Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
678   return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
679 }
680 
// Rewrite every instruction in \p LDSInstructions to operate on the
// malloc'ed global-memory buffer instead of LDS. For each instruction the
// LDS pointer operand is translated via getTranslatedGlobalMemoryPtrOfLDS
// (byte offset off \p LoadMallocPtr), a new instruction preserving
// alignment, volatility, atomic ordering and sync scope is created and
// recorded for later ASan instrumentation, then the original is RAUW'd and
// erased. Unknown instruction kinds are a fatal error, since
// getLDSMemoryInstructions only collects the five kinds handled here.
void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
    Function *Func, Value *LoadMallocPtr,
    SetVector<Instruction *> &LDSInstructions) {
  LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
                    << Func->getName());
  for (Instruction *Inst : LDSInstructions) {
    // New instructions are inserted right before the one they replace.
    IRB.SetInsertPoint(Inst);
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
      Value *LIOperand = LI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
      LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
                                              LI->getAlign(), LI->isVolatile());
      // Carry over atomic ordering/scope from the original load.
      NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewLI);
      LI->replaceAllUsesWith(NewLI);
      LI->eraseFromParent();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      Value *SIOperand = SI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
      StoreInst *NewSI = IRB.CreateAlignedStore(
          SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
      NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewSI);
      SI->replaceAllUsesWith(NewSI);
      SI->eraseFromParent();
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
      Value *RMWPtrOperand = RMW->getPointerOperand();
      Value *RMWValOperand = RMW->getValOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
      AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
          RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
          RMW->getOrdering(), RMW->getSyncScopeID());
      NewRMW->setVolatile(RMW->isVolatile());
      AsanInfo.Instructions.insert(NewRMW);
      RMW->replaceAllUsesWith(NewRMW);
      RMW->eraseFromParent();
    } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
      Value *XCHGPtrOperand = XCHG->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
      AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
          Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
          XCHG->getAlign(), XCHG->getSuccessOrdering(),
          XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
      NewXCHG->setVolatile(XCHG->isVolatile());
      AsanInfo.Instructions.insert(NewXCHG);
      XCHG->replaceAllUsesWith(NewXCHG);
      XCHG->eraseFromParent();
    } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
      Value *AIOperand = ASC->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
      Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
      // Note: No need to add the instruction to AsanInfo instructions to be
      // instrumented list. FLAT_ADDRESS ptr would have been already
      // instrumented by asan pass prior to this pass.
      ASC->replaceAllUsesWith(NewAI);
      ASC->eraseFromParent();
    } else
      report_fatal_error("Unimplemented LDS lowering instruction");
  }
}
746 
poisonRedzones(Function * Func,Value * MallocPtr)747 void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
748   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
749   Type *Int64Ty = IRB.getInt64Ty();
750   Type *VoidTy = IRB.getVoidTy();
751   FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
752       "__asan_poison_region",
753       FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
754 
755   auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756   size_t VecSize = RedzonesVec.size();
757   for (unsigned i = 0; i < VecSize; i++) {
758     auto &RedzonePair = RedzonesVec[i];
759     uint64_t RedzoneOffset = RedzonePair.first;
760     uint64_t RedzoneSize = RedzonePair.second;
761     Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762         IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
763     Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
764     IRB.CreateCall(AsanPoisonRegion,
765                    {RedzoneAddress, IRB.getInt64(RedzoneSize)});
766   }
767 }
768 
// Lower all LDS accesses in kernel \p Func onto a dynamically allocated
// global-memory buffer:
//  * A new "WId" entry block selects the {0,0,0} work item, which alone
//    runs the "Malloc" block: it computes the total (static + dynamic) LDS
//    size from the SW LDS metadata, calls __asan_malloc_impl, stores the
//    resulting pointer into the SW LDS global and poisons the redzones.
//  * All work items then rendezvous at an s_barrier at the head of the old
//    entry block, load the malloc'ed base pointer, and have their LDS uses
//    and memory operations rewritten to global memory.
//  * Every return is redirected through "CondFree"/"Free"/"End" blocks so
//    that, after a second barrier, only the {0,0,0} work item frees the
//    buffer via __asan_free_impl.
// \p DTU receives the newly introduced CFG edges.
void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
                                              DomTreeUpdater &DTU) {
  LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  auto &Ctx = M.getContext();
  auto *PrevEntryBlock = &Func->getEntryBlock();
  // Collect LDS memory operations before the CFG is rearranged.
  SetVector<Instruction *> LDSInstructions;
  getLDSMemoryInstructions(Func, LDSInstructions);

  // Create malloc block.
  auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);

  // Create WIdBlock block which has instructions related to selection of
  // {0,0,0} index work item in the work group.
  auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
  IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
  // Reuse (or synthesize) a debug location so the inserted prologue carries
  // line info.
  DebugLoc FirstDL =
      getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
  IRB.SetCurrentDebugLocation(FirstDL);
  Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
  Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
  Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
  // (x | y | z) == 0 identifies the {0,0,0} work item.
  Value *XYOr = IRB.CreateOr(WIdx, WIdy);
  Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
  Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));

  // All work items will branch to PrevEntryBlock except {0,0,0} index
  // work item which will branch to malloc block.
  IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);

  // Malloc block
  IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());

  // If Dynamic LDS globals are accessed by the kernel,
  // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
  // Update the corresponding metadata global entries for this dyn lds global.
  GlobalVariable *SwLDS = LDSParams.SwLDS;
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDS && SwLDSMetadata);
  StructType *MetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  uint32_t MallocSize = 0;
  Value *CurrMallocSize;
  Type *Int32Ty = IRB.getInt32Ty();
  Type *Int64Ty = IRB.getInt64Ty();

  // Collect the unique static LDS globals (direct + indirect) to locate the
  // last static entry in the metadata table.
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
    for (auto &GV : LDSGlobals) {
      if (is_contained(UniqueLDSGlobals, GV))
        continue;
      UniqueLDSGlobals.insert(GV);
    }
  };

  GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
  GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
  // NOTE(review): the +1 counts one metadata entry beyond the unique static
  // globals — presumably an implicit first entry created by
  // populateSwMetadataGlobal; confirm there. It also makes NumStaticLDS
  // always non-zero, so the branch below is always taken.
  unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
  UniqueLDSGlobals.clear();

  if (NumStaticLDS) {
    // Total static size = offset of the last metadata entry (field 0) plus
    // its aligned size (field 2; see updateMallocSizeForDynamicLDS, which
    // stores the aligned size at field index 2).
    auto *GEPForEndStaticLDSOffset =
        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                              {ConstantInt::get(Int32Ty, 0),
                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
                               ConstantInt::get(Int32Ty, 0)});

    auto *GEPForEndStaticLDSSize =
        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                              {ConstantInt::get(Int32Ty, 0),
                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
                               ConstantInt::get(Int32Ty, 2)});

    Value *EndStaticLDSOffset =
        IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
    Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
    CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
  } else
    CurrMallocSize = IRB.getInt32(MallocSize);

  if (LDSParams.SwDynLDS) {
    // The hidden dyn_lds_size kernel argument only exists from code object
    // v5 onwards.
    if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
      report_fatal_error(
          "Dynamic LDS size query is only supported for CO V5 and later.");
    // Get size from hidden dyn_lds_size argument of kernel
    Value *ImplicitArg =
        IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
    Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
        ImplicitArg->getType(), ImplicitArg,
        {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
    UniqueLDSGlobals.clear();
    GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
    GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
    updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
                                  UniqueLDSGlobals);
  }

  // __asan_malloc_impl takes a 64-bit size.
  CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);

  // Create a call to malloc function which does device global memory allocation
  // with size equals to all LDS global accesses size in this kernel.
  Value *ReturnAddress =
      IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)});
  FunctionCallee MallocFunc = M.getOrInsertFunction(
      StringRef("__asan_malloc_impl"),
      FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
  Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
  Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});

  Value *MallocPtr =
      IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS));

  // Create store of malloc to new global
  IRB.CreateStore(MallocPtr, SwLDS);

  // Create calls to __asan_poison_region to poison redzones.
  poisonRedzones(Func, MallocPtr);

  // Create branch to PrevEntryBlock
  IRB.CreateBr(PrevEntryBlock);

  // Create wave-group barrier at the starting of Previous entry block
  Type *Int1Ty = IRB.getInt1Ty();
  IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
  // The phi records whether this work item is the allocating {0,0,0} item;
  // it is reused in CondFree to decide who frees the buffer.
  auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
  XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
  XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);

  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});

  // Load malloc pointer from Sw LDS.
  Value *LoadMallocPtr =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS);

  // Replace All uses of LDS globals with new LDS pointers.
  replaceKernelLDSAccesses(Func);

  // Replace Memory Operations on LDS with corresponding
  // global memory pointers.
  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
                                             LDSInstructions);

  // Route every return through CondFree -> (Free) -> End so the buffer is
  // released before the kernel exits.
  auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
  auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
  auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
  for (BasicBlock &BB : *Func) {
    if (!BB.empty()) {
      if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
        RI->eraseFromParent();
        IRB.SetInsertPoint(&BB, BB.end());
        IRB.CreateBr(CondFreeBlock);
      }
    }
  }

  // Cond Free Block
  IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
  // Barrier makes sure all work items are done with the buffer before the
  // {0,0,0} work item frees it.
  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
  IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);

  // Free Block
  IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());

  // Free the previously allocate device global memory.
  FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
      StringRef("__asan_free_impl"),
      FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
  Value *ReturnAddr =
      IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0));
  Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
  Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
  IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});

  IRB.CreateBr(EndBlock);

  // End Block
  IRB.SetInsertPoint(EndBlock, EndBlock->begin());
  IRB.CreateRetVoid();
  // Update the DomTree with corresponding links to basic blocks.
  DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
                    {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
                    {DominatorTree::Insert, CondFreeBlock, FreeBlock},
                    {DominatorTree::Insert, FreeBlock, EndBlock}});
}
953 
getAddressesOfVariablesInKernel(Function * Func,SetVector<GlobalVariable * > & Variables)954 Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
955     Function *Func, SetVector<GlobalVariable *> &Variables) {
956   Type *Int32Ty = IRB.getInt32Ty();
957   auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
958 
959   GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
960   assert(SwLDSMetadata);
961   auto *SwLDSMetadataStructType =
962       cast<StructType>(SwLDSMetadata->getValueType());
963   ArrayType *KernelOffsetsType =
964       ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size());
965 
966   SmallVector<Constant *> Elements;
967   for (auto *GV : Variables) {
968     auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
969     if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
970       Elements.push_back(
971           PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)));
972       continue;
973     }
974     auto &Indices = It->second;
975     Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
976                           ConstantInt::get(Int32Ty, Indices[1]),
977                           ConstantInt::get(Int32Ty, Indices[2])};
978     Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
979                                                    SwLDSMetadata, GEPIdx, true);
980     Elements.push_back(GEP);
981   }
982   return ConstantArray::get(KernelOffsetsType, Elements);
983 }
984 
buildNonKernelLDSBaseTable(NonKernelLDSParameters & NKLDSParams)985 void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986     NonKernelLDSParameters &NKLDSParams) {
987   // Base table will have single row, with elements of the row
988   // placed as per kernel ID. Each element in the row corresponds
989   // to addresss of "SW LDS" global of the kernel.
990   auto &Kernels = NKLDSParams.OrderedKernels;
991   if (Kernels.empty())
992     return;
993   Type *Int32Ty = IRB.getInt32Ty();
994   const size_t NumberKernels = Kernels.size();
995   ArrayType *AllKernelsOffsetsType =
996       ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
997   std::vector<Constant *> OverallConstantExprElts(NumberKernels);
998   for (size_t i = 0; i < NumberKernels; i++) {
999     Function *Func = Kernels[i];
1000     auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1001     GlobalVariable *SwLDS = LDSParams.SwLDS;
1002     assert(SwLDS);
1003     Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
1004     Constant *GEP =
1005         ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
1006     OverallConstantExprElts[i] = GEP;
1007   }
1008   Constant *init =
1009       ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
1010   NKLDSParams.LDSBaseTable = new GlobalVariable(
1011       M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1012       "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1013       AMDGPUAS::GLOBAL_ADDRESS);
1014   GlobalValue::SanitizerMetadata MD;
1015   MD.NoAddress = true;
1016   NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1017 }
1018 
buildNonKernelLDSOffsetTable(NonKernelLDSParameters & NKLDSParams)1019 void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1020     NonKernelLDSParameters &NKLDSParams) {
1021   // Offset table will have multiple rows and columns.
1022   // Rows are assumed to be from 0 to (n-1). n is total number
1023   // of kernels accessing the LDS through non-kernels.
1024   // Each row will have m elements. m is the total number of
1025   // unique LDS globals accessed by non-kernels.
1026   // Each element in the row correspond to the address of
1027   // the replacement of LDS global done by that particular kernel.
1028   auto &Variables = NKLDSParams.OrdereLDSGlobals;
1029   auto &Kernels = NKLDSParams.OrderedKernels;
1030   if (Variables.empty() || Kernels.empty())
1031     return;
1032   const size_t NumberVariables = Variables.size();
1033   const size_t NumberKernels = Kernels.size();
1034 
1035   ArrayType *KernelOffsetsType =
1036       ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1037 
1038   ArrayType *AllKernelsOffsetsType =
1039       ArrayType::get(KernelOffsetsType, NumberKernels);
1040   std::vector<Constant *> overallConstantExprElts(NumberKernels);
1041   for (size_t i = 0; i < NumberKernels; i++) {
1042     Function *Func = Kernels[i];
1043     overallConstantExprElts[i] =
1044         getAddressesOfVariablesInKernel(Func, Variables);
1045   }
1046   Constant *Init =
1047       ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1048   NKLDSParams.LDSOffsetTable = new GlobalVariable(
1049       M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1050       "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1051       AMDGPUAS::GLOBAL_ADDRESS);
1052   GlobalValue::SanitizerMetadata MD;
1053   MD.NoAddress = true;
1054   NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1055 }
1056 
// Lower LDS accesses in non-kernel function \p Func. A non-kernel cannot
// know statically which kernel invoked it, so at run time
// amdgcn.lds.kernel.id indexes:
//  * the base table  -> the calling kernel's "SW LDS" global, from which
//    the malloc'ed global-memory base pointer is loaded, and
//  * the offset table -> per-global pointers into that kernel's metadata,
//    from which each LDS global's i32 byte offset is loaded.
// Each LDS global use is replaced by base + offset, then the remaining LDS
// memory operations are translated onto the global-memory buffer.
void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
    Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
    NonKernelLDSParameters &NKLDSParams) {
  // Replace LDS access in non-kernel with replacement queried from
  // Base table and offset from offset table.
  LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
                    << Func->getName());
  auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
  IRB.SetInsertPoint(InsertAt);

  // Get LDS memory instructions.
  SetVector<Instruction *> LDSInstructions;
  getLDSMemoryInstructions(Func, LDSInstructions);

  // The calling kernel's ID selects this function's row in both tables.
  auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
  GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
  GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
  auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
  // Base table entry -> kernel's SW LDS global -> malloc'ed base pointer.
  Value *BaseGEP = IRB.CreateInBoundsGEP(
      LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
  Value *BaseLoad =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
  Value *LoadMallocPtr =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);

  for (GlobalVariable *GV : LDSGlobals) {
    // GV's position in the ordered global list is its column index in the
    // offset table.
    const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
    assert(GVIt != OrdereLDSGlobals.end());
    uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);

    Value *OffsetGEP = IRB.CreateInBoundsGEP(
        LDSOffsetTable->getValueType(), LDSOffsetTable,
        {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
    // OffsetLoad yields the address of GV's metadata slot; the second load
    // reads the actual i32 byte offset from it.
    Value *OffsetLoad =
        IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
    Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
    Value *BasePlusOffset =
        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
    LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
                      << GV->getName());
    replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
  }
  // Finally translate the remaining LDS memory operations onto the
  // global-memory buffer.
  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
                                             LDSInstructions);
}
1102 
reorderStaticDynamicIndirectLDSSet(KernelLDSParameters & LDSParams)1103 static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1104   // Sort Static, dynamic LDS globals which are either
1105   // direct or indirect access on basis of name.
1106   auto &DirectAccess = LDSParams.DirectAccess;
1107   auto &IndirectAccess = LDSParams.IndirectAccess;
1108   LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1109       std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1110                                     DirectAccess.StaticLDSGlobals.end()));
1111   LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1112       std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1113                                     DirectAccess.DynamicLDSGlobals.end()));
1114   LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1115       std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1116                                     IndirectAccess.StaticLDSGlobals.end()));
1117   LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1118       std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1119                                     IndirectAccess.DynamicLDSGlobals.end()));
1120 }
1121 
initAsanInfo()1122 void AMDGPUSwLowerLDS::initAsanInfo() {
1123   // Get Shadow mapping scale and offset.
1124   unsigned LongSize =
1125       M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
1126   uint64_t Offset;
1127   int Scale;
1128   bool OrShadowOffset;
1129   llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false,
1130                                   &Offset, &Scale, &OrShadowOffset);
1131   AsanInfo.Scale = Scale;
1132   AsanInfo.Offset = Offset;
1133 }
1134 
hasFnWithSanitizeAddressAttr(FunctionVariableMap & LDSAccesses)1135 static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1136   for (auto &K : LDSAccesses) {
1137     Function *F = K.first;
1138     if (!F)
1139       continue;
1140     if (F->hasFnAttribute(Attribute::SanitizeAddress))
1141       return true;
1142   }
1143   return false;
1144 }
1145 
// Main driver of the software LDS lowering. Returns true when the module
// was modified. The full lowering only runs when at least one function
// touching LDS carries the sanitize_address attribute; otherwise only the
// constant-expression cleanup may have changed the module.
bool AMDGPUSwLowerLDS::run() {
  bool Changed = false;

  CallGraph CG = CallGraph(M);

  // Rewrite constant-expression users of LDS globals into instructions so
  // the later RAUW-based replacement sees every use.
  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

  // Get all the direct and indirect access of LDS for all the kernels.
  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

  // Flag to decide whether to lower all the LDS accesses
  // based on sanitize_address attribute.
  bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) ||
                     hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access);

  if (!LowerAllLDS)
    return Changed;

  // Utility to group LDS access into direct, indirect, static and dynamic.
  auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
                                            bool DirectAccess) {
    for (auto &K : LDSAccesses) {
      Function *F = K.first;
      if (!F || K.second.empty())
        continue;

      assert(isKernelLDS(F));

      // Only inserts if key isn't already in the map.
      FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
          {F, KernelLDSParameters()});

      auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
      if (!DirectAccess)
        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
      for (GlobalVariable *GV : K.second) {
        if (!DirectAccess) {
          if (AMDGPU::isDynamicLDS(*GV))
            LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
          else
            LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
          FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
        } else {
          if (AMDGPU::isDynamicLDS(*GV))
            LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
          else
            LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
        }
      }
    }
  };

  PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
  PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);

  // Get address sanitizer scale.
  initAsanInfo();

  // Lower LDS accesses in every kernel that touches LDS.
  for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
    Function *Func = K.first;
    auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
    if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
        LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
        LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
        LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
      // NOTE(review): this unconditionally resets Changed, so a kernel with
      // no LDS visited after a lowered kernel drops the earlier
      // Changed=true and the cleanup/instrumentation below is skipped —
      // confirm whether this assignment should be a no-op instead.
      Changed = false;
    } else {
      // The lowering introduces workitem-id intrinsics and heap usage;
      // drop the corresponding "no-*" attributes from everything reachable
      // from this kernel.
      removeFnAttrFromReachable(
          CG, Func,
          {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
           "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
      // Indirect accesses additionally require the lds.kernel.id intrinsic
      // in the callees.
      if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
          !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
        removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"});
      reorderStaticDynamicIndirectLDSSet(LDSParams);
      buildSwLDSGlobal(Func);
      buildSwDynLDSGlobal(Func);
      populateSwMetadataGlobal(Func);
      populateSwLDSAttributeAndMetadata(Func);
      populateLDSToReplacementIndicesMap(Func);
      DomTreeUpdater DTU(DTCallback(*Func),
                         DomTreeUpdater::UpdateStrategy::Lazy);
      lowerKernelLDSAccesses(Func, DTU);
      Changed = true;
    }
  }

  // Get the Uses of LDS from non-kernels.
  getUsesOfLDSByNonKernels();

  // Get non-kernels with LDS ptr as argument and called by kernels.
  getNonKernelsWithLDSArguments(CG);

  // Lower LDS accesses in non-kernels.
  if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
      !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
    NonKernelLDSParameters NKLDSParams;
    NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
    NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
        FuncLDSAccessInfo.AllNonKernelLDSAccess);
    buildNonKernelLDSBaseTable(NKLDSParams);
    buildNonKernelLDSOffsetTable(NKLDSParams);
    for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
      Function *Func = K.first;
      DenseSet<GlobalVariable *> &LDSGlobals = K.second;
      // Sort by name for a deterministic replacement order.
      SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
          std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
      lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
    }
    // Non-kernels that only receive LDS pointers as arguments still need
    // their LDS memory operations translated; they have no globals of
    // their own to replace.
    for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
      auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
      if (K.contains(Func))
        continue;
      SetVector<llvm::GlobalVariable *> Vec;
      lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
    }
    Changed = true;
  }

  if (!Changed)
    return Changed;

  // All LDS uses have been replaced; delete the now-unused LDS globals.
  for (auto &GV : make_early_inc_range(M.globals())) {
    if (AMDGPU::isLDSVariableToLower(GV)) {
      // probably want to remove from used lists
      GV.removeDeadConstantUsers();
      if (GV.use_empty())
        GV.eraseFromParent();
    }
  }

  // Instrument the newly created global-memory accesses with ASan checks.
  if (AsanInstrumentLDS) {
    SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
    for (Instruction *Inst : AsanInfo.Instructions) {
      SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
      getInterestingMemoryOperands(M, Inst, InterestingOperands);
      llvm::append_range(OperandsToInstrument, InterestingOperands);
    }
    for (auto &Operand : OperandsToInstrument) {
      Value *Addr = Operand.getPtr();
      instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
                        Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
                        Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
                        AsanInfo.Offset);
      Changed = true;
    }
  }

  return Changed;
}
1297 
// Legacy pass-manager wrapper around the AMDGPUSwLowerLDS implementation.
class AMDGPUSwLowerLDSLegacy : public ModulePass {
public:
  // Target machine; may be null at construction and is then recovered from
  // TargetPassConfig inside runOnModule.
  const AMDGPUTargetMachine *AMDGPUTM;
  static char ID;
  AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
      : ModulePass(ID), AMDGPUTM(TM) {}
  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The implementation updates the dominator tree through DomTreeUpdater,
    // so a precomputed tree can be preserved.
    AU.addPreserved<DominatorTreeWrapperPass>();
  }
};
1309 } // namespace
1310 
char AMDGPUSwLowerLDSLegacy::ID = 0;
// Externally visible pass ID used for pass lookup/registration.
char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;

// Register the legacy pass. TargetPassConfig is declared as a dependency
// because runOnModule uses it to recover the AMDGPUTargetMachine when one
// was not supplied at construction time.
INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
                      "AMDGPU Software lowering of LDS", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
                    "AMDGPU Software lowering of LDS", false, false)
1319 
1320 bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1321   // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1322   // instrumented the IR. Return early if the flag is not present.
1323   if (!M.getModuleFlag("nosanitize_address"))
1324     return false;
1325   DominatorTreeWrapperPass *const DTW =
1326       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1327   auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1328     return DTW ? &DTW->getDomTree() : nullptr;
1329   };
1330   if (!AMDGPUTM) {
1331     auto &TPC = getAnalysis<TargetPassConfig>();
1332     AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1333   }
1334   AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1335   bool IsChanged = SwLowerLDSImpl.run();
1336   return IsChanged;
1337 }
1338 
1339 ModulePass *
createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine * TM)1340 llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) {
1341   return new AMDGPUSwLowerLDSLegacy(TM);
1342 }
1343 
run(Module & M,ModuleAnalysisManager & AM)1344 PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
1345                                             ModuleAnalysisManager &AM) {
1346   // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1347   // instrumented the IR. Return early if the flag is not present.
1348   if (!M.getModuleFlag("nosanitize_address"))
1349     return PreservedAnalyses::all();
1350   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1351   auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1352     return &FAM.getResult<DominatorTreeAnalysis>(F);
1353   };
1354   AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1355   bool IsChanged = SwLowerLDSImpl.run();
1356   if (!IsChanged)
1357     return PreservedAnalyses::all();
1358 
1359   PreservedAnalyses PA;
1360   PA.preserve<DominatorTreeAnalysis>();
1361   return PA;
1362 }
1363