1 //===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass lowers the local data store, LDS, uses in kernel and non-kernel
10 // functions in module to use dynamically allocated global memory.
11 // Packed LDS Layout is emulated in the global memory.
12 // The lowered memory instructions from LDS to global memory are then
13 // instrumented for address sanitizer, to catch addressing errors.
// This pass only works when address sanitizer has been enabled and has
15 // instrumented the IR. It identifies that IR has been instrumented using
16 // "nosanitize_address" module flag.
17 //
18 // Replacement of Kernel LDS accesses:
19 // For a kernel, LDS access can be static or dynamic which are direct
20 // (accessed within kernel) and indirect (accessed through non-kernels).
21 // All these LDS accesses corresponding to kernel will be packed together,
22 // where all static LDS accesses will be allocated first and then dynamic
23 // LDS follows. The total size with alignment is calculated. A new LDS global
24 // will be created for the kernel called "SW LDS" and it will have the
25 // attribute "amdgpu-lds-size" attached with value of the size calculated.
26 // All the LDS accesses in the module will be replaced by GEP with offset
// into the "SW LDS".
28 // A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29 // the dynamic LDS. This will be marked used by kernel and will have
// MD_absolute_symbol metadata set to total static LDS size, since dynamic
31 // LDS allocation starts after all static LDS allocation.
32 //
33 // A device global memory equal to the total LDS size will be allocated.
34 // At the prologue of the kernel, a single work-item from the
35 // work-group, does a "malloc" and stores the pointer of the
36 // allocation in "SW LDS".
37 //
38 // To store the offsets corresponding to all LDS accesses, another global
39 // variable is created which will be called "SW LDS metadata" in this pass.
40 // - SW LDS Global:
41 // It is LDS global of ptr type with name
42 // "llvm.amdgcn.sw.lds.<kernel-name>".
43 // - Metadata Global:
44 // It is of struct type, with n members. n equals the number of LDS
45 // globals accessed by the kernel(direct and indirect). Each member of
46 // struct is another struct of type {i32, i32, i32}. First member
47 // corresponds to offset, second member corresponds to size of LDS global
48 // being replaced and third represents the total aligned size. It will
49 // have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
//     an initializer with static LDS related offsets and sizes initialized.
//     But for dynamic LDS related entries, offsets will be initialized to
52 // previous static LDS allocation end offset. Sizes for them will be zero
53 // initially. These dynamic LDS offset and size values will be updated
54 // within the kernel, since kernel can read the dynamic LDS size
55 // allocation done at runtime with query to "hidden_dynamic_lds_size"
56 // hidden kernel argument.
57 //
58 // At the epilogue of kernel, allocated memory would be made free by the same
59 // single work-item.
60 //
61 // Replacement of non-kernel LDS accesses:
//    Multiple kernels can call the same LDS-accessing non-kernel function.
63 // All the kernels accessing LDS through non-kernels are sorted and
64 // assigned a kernel-id. All the LDS globals accessed by non-kernels
65 // are sorted. This information is used to build two tables:
66 // - Base table:
67 // Base table will have single row, with elements of the row
68 // placed as per kernel ID. Each element in the row corresponds
69 // to ptr of "SW LDS" variable created for that kernel.
70 // - Offset table:
71 // Offset table will have multiple rows and columns.
72 // Rows are assumed to be from 0 to (n-1). n is total number
73 // of kernels accessing the LDS through non-kernels.
74 // Each row will have m elements. m is the total number of
75 // unique LDS globals accessed by all non-kernels.
76 // Each element in the row correspond to the ptr of
77 // the replacement of LDS global done by that particular kernel.
78 // A LDS variable in non-kernel will be replaced based on the information
79 // from base and offset tables. Based on kernel-id query, ptr of "SW
80 // LDS" for that corresponding kernel is obtained from base table.
81 // The Offset into the base "SW LDS" is obtained from
82 // corresponding element in offset table. With this information, replacement
83 // value is obtained.
84 //===----------------------------------------------------------------------===//
85
86 #include "AMDGPU.h"
87 #include "AMDGPUAsanInstrumentation.h"
88 #include "AMDGPUMemoryUtils.h"
89 #include "AMDGPUTargetMachine.h"
90 #include "llvm/ADT/DenseMap.h"
91 #include "llvm/ADT/DenseSet.h"
92 #include "llvm/ADT/SetVector.h"
93 #include "llvm/ADT/StringExtras.h"
94 #include "llvm/ADT/StringRef.h"
95 #include "llvm/Analysis/CallGraph.h"
96 #include "llvm/Analysis/DomTreeUpdater.h"
97 #include "llvm/CodeGen/TargetPassConfig.h"
98 #include "llvm/IR/Constants.h"
99 #include "llvm/IR/DIBuilder.h"
100 #include "llvm/IR/DebugInfo.h"
101 #include "llvm/IR/DebugInfoMetadata.h"
102 #include "llvm/IR/IRBuilder.h"
103 #include "llvm/IR/Instructions.h"
104 #include "llvm/IR/IntrinsicsAMDGPU.h"
105 #include "llvm/IR/MDBuilder.h"
106 #include "llvm/IR/ReplaceConstant.h"
107 #include "llvm/Pass.h"
108 #include "llvm/Support/raw_ostream.h"
109 #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
110 #include "llvm/Transforms/Utils/ModuleUtils.h"
111
112 #include <algorithm>
113
114 #define DEBUG_TYPE "amdgpu-sw-lower-lds"
115 #define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117 using namespace llvm;
118 using namespace AMDGPU;
119
120 namespace {
121
// Command-line switch to disable the extra asan instrumentation of the
// global-memory instructions produced by this pass (enabled by default).
cl::opt<bool>
    AsanInstrumentLDS("amdgpu-asan-instrument-lds",
                      cl::desc("Run asan instrumentation on LDS instructions "
                               "lowered to global memory"),
                      cl::init(true), cl::Hidden);

// Callback used to obtain the dominator tree for a function on demand.
using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
// LDS globals accessed by a kernel, split by whether the global's size is
// known at compile time (static) or provided at dispatch time (dynamic).
struct LDSAccessTypeInfo {
  SetVector<GlobalVariable *> StaticLDSGlobals;
  SetVector<GlobalVariable *> DynamicLDSGlobals;
};
134
// Struct to hold all the Metadata required for a kernel
// to replace a LDS global uses with corresponding offset
// in to device global memory.
struct KernelLDSParameters {
  // LDS global holding the pointer to the malloc'ed device memory.
  GlobalVariable *SwLDS = nullptr;
  // External "llvm.amdgcn.<kernel>.dynlds" marker, set only when the kernel
  // accesses dynamic LDS.
  GlobalVariable *SwDynLDS = nullptr;
  // Global-memory struct holding {offset, size, aligned-size} per LDS global.
  GlobalVariable *SwLDSMetadata = nullptr;
  LDSAccessTypeInfo DirectAccess;
  LDSAccessTypeInfo IndirectAccess;
  // Maps each LDS global to its {0, index, 0} GEP indices into the metadata.
  DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
      LDSToReplacementIndicesMap;
  // Total bytes to malloc for the static part (incl. redzones, aligned).
  uint32_t MallocSize = 0;
  // Aligned size of the SW LDS pointer global (kernel's static LDS size).
  uint32_t LDSSize = 0;
  // {offset, size} pairs of redzones to poison after the malloc.
  SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
};
150
// Struct to store information for creation of offset table
// for all the non-kernel LDS accesses.
struct NonKernelLDSParameters {
  // Single-row table: kernel-id -> ptr of that kernel's "SW LDS" global.
  GlobalVariable *LDSBaseTable = nullptr;
  // Table of [kernel-id][lds-global] -> replacement ptr.
  GlobalVariable *LDSOffsetTable = nullptr;
  SetVector<Function *> OrderedKernels;
  // NOTE(review): name is missing a 'd' ("OrdereLDSGlobals") — likely a typo
  // for "OrderedLDSGlobals"; renaming would touch uses outside this view.
  SetVector<GlobalVariable *> OrdereLDSGlobals;
};
159
// Parameters and work-list for the follow-up asan instrumentation of the
// lowered (global-memory) instructions.
struct AsanInstrumentInfo {
  int Scale = 0;        // Asan shadow-mapping scale.
  uint32_t Offset = 0;  // Asan shadow-mapping offset.
  // Instructions created by this pass that still need asan instrumentation.
  SetVector<Instruction *> Instructions;
};
165
// Module-wide summary of which functions access which LDS globals.
struct FunctionsAndLDSAccess {
  // Per-kernel lowering state.
  DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
  // Kernels that reach LDS through non-kernel callees.
  SetVector<Function *> KernelsWithIndirectLDSAccess;
  // Non-kernels that receive an addrspace(3) pointer argument.
  SetVector<Function *> NonKernelsWithLDSArgument;
  // Every LDS global referenced from a non-kernel.
  SetVector<GlobalVariable *> AllNonKernelLDSAccess;
  // Non-kernel function -> LDS globals it uses directly.
  FunctionVariableMap NonKernelToLDSAccessMap;
};
173
// Driver class for the software LDS lowering: rewrites all LDS uses in the
// module to dynamically allocated global memory and queues the new memory
// instructions for asan instrumentation.
class AMDGPUSwLowerLDS {
public:
  AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
                   DomTreeCallback Callback)
      : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
  // Entry point; returns true if the module was changed.
  bool run();
  void getUsesOfLDSByNonKernels();
  void getNonKernelsWithLDSArguments(const CallGraph &CG);
  SetVector<Function *>
  getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
  SetVector<GlobalVariable *>
  getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
  // Per-kernel construction of the SW LDS pointer, dynamic-LDS marker and
  // metadata globals.
  void buildSwLDSGlobal(Function *Func);
  void buildSwDynLDSGlobal(Function *Func);
  void populateSwMetadataGlobal(Function *Func);
  void populateSwLDSAttributeAndMetadata(Function *Func);
  void populateLDSToReplacementIndicesMap(Function *Func);
  // Collection and rewriting of LDS memory operations.
  void getLDSMemoryInstructions(Function *Func,
                                SetVector<Instruction *> &LDSInstructions);
  void replaceKernelLDSAccesses(Function *Func);
  Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
  void translateLDSMemoryOperationsToGlobalMemory(
      Function *Func, Value *LoadMallocPtr,
      SetVector<Instruction *> &LDSInstructions);
  void poisonRedzones(Function *Func, Value *MallocPtr);
  void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
  // Base/offset table construction for non-kernel LDS accesses.
  void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
  void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
  Constant *
  getAddressesOfVariablesInKernel(Function *Func,
                                  SetVector<GlobalVariable *> &Variables);
  void lowerNonKernelLDSAccesses(Function *Func,
                                 SetVector<GlobalVariable *> &LDSGlobals,
                                 NonKernelLDSParameters &NKLDSParams);
  void
  updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
                                Value *HiddenDynLDSSize,
                                SetVector<GlobalVariable *> &DynamicLDSGlobals);
  void initAsanInfo();

private:
  Module &M;
  const AMDGPUTargetMachine &AMDGPUTM;
  IRBuilder<> IRB;
  DomTreeCallback DTCallback;
  FunctionsAndLDSAccess FuncLDSAccessInfo;
  AsanInstrumentInfo AsanInfo;
};
222
sortByName(std::vector<T> && V)223 template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224 // Sort the vector of globals or Functions based on their name.
225 // Returns a SetVector of globals/Functions.
226 sort(V, [](const auto *L, const auto *R) {
227 return L->getName() < R->getName();
228 });
229 return {SetVector<T>(llvm::from_range, V)};
230 }
231
getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable * > & Variables)232 SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233 SetVector<GlobalVariable *> &Variables) {
234 // Sort all the non-kernel LDS accesses based on their name.
235 return sortByName(
236 std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237 }
238
getOrderedIndirectLDSAccessingKernels(SetVector<Function * > & Kernels)239 SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240 SetVector<Function *> &Kernels) {
241 // Sort the non-kernels accessing LDS based on their name.
242 // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext();
244 if (Kernels.size() > UINT32_MAX) {
245 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
246 }
247 SetVector<Function *> OrderedKernels =
248 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (size_t i = 0; i < Kernels.size(); i++) {
250 Metadata *AttrMDArgs[1] = {
251 ConstantAsMetadata::get(IRB.getInt32(i)),
252 };
253 Function *Func = OrderedKernels[i];
254 Func->setMetadata("llvm.amdgcn.lds.kernel.id",
255 MDNode::get(Ctx, AttrMDArgs));
256 }
257 return OrderedKernels;
258 }
259
getNonKernelsWithLDSArguments(const CallGraph & CG)260 void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261 // Among the kernels accessing LDS, get list of
262 // Non-kernels to which a call is made and a ptr
263 // to addrspace(3) is passed as argument.
264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265 Function *Func = K.first;
266 const CallGraphNode *CGN = CG[Func];
267 if (!CGN)
268 continue;
269 for (auto &I : *CGN) {
270 CallGraphNode *CallerCGN = I.second;
271 Function *CalledFunc = CallerCGN->getFunction();
272 if (!CalledFunc || CalledFunc->isDeclaration())
273 continue;
274 if (AMDGPU::isKernelLDS(CalledFunc))
275 continue;
276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277 AI != E; ++AI) {
278 Type *ArgTy = (*AI).getType();
279 if (!ArgTy->isPointerTy())
280 continue;
281 if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
282 continue;
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
284 // Also add the Calling function to KernelsWithIndirectLDSAccess list
285 // so that base table of LDS is generated.
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
287 }
288 }
289 }
290 }
291
getUsesOfLDSByNonKernels()292 void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
294 if (!AMDGPU::isLDSVariableToLower(*GV))
295 continue;
296
297 for (User *V : GV->users()) {
298 if (auto *I = dyn_cast<Instruction>(V)) {
299 Function *F = I->getFunction();
300 if (!isKernelLDS(F) && !F->isDeclaration())
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
302 }
303 }
304 }
305 }
306
recordLDSAbsoluteAddress(Module & M,GlobalVariable * GV,uint32_t Address)307 static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308 uint32_t Address) {
309 // Write the specified address into metadata where it can be retrieved by
310 // the assembler. Format is a half open range, [Address Address+1)
311 LLVMContext &Ctx = M.getContext();
312 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
313 MDBuilder MDB(Ctx);
314 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
315 ConstantInt::get(IntTy, Address + 1));
316 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
317 }
318
addLDSSizeAttribute(Function * Func,uint32_t Offset,bool IsDynLDS)319 static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320 bool IsDynLDS) {
321 if (Offset != 0) {
322 std::string Buffer;
323 raw_string_ostream SS{Buffer};
324 SS << Offset;
325 if (IsDynLDS)
326 SS << "," << Offset;
327 Func->addFnAttr("amdgpu-lds-size", Buffer);
328 }
329 }
330
markUsedByKernel(Function * Func,GlobalVariable * SGV)331 static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
332 BasicBlock *Entry = &Func->getEntryBlock();
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
334
335 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
336 Intrinsic::donothing, {});
337
338 Value *UseInstance[1] = {
339 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
340
341 Builder.CreateCall(Decl, {},
342 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
343 }
344
buildSwLDSGlobal(Function * Func)345 void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346 // Create new LDS global required for each kernel to store
347 // device global memory pointer.
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349 // Create new global pointer variable
350 LDSParams.SwLDS = new GlobalVariable(
351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
353 nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
354 GlobalValue::SanitizerMetadata MD;
355 MD.NoAddress = true;
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
357 }
358
buildSwDynLDSGlobal(Function * Func)359 void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360 // Create new Dyn LDS global if kernel accesses dyn LDS.
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364 return;
365 // Create new global pointer variable
366 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
367 LDSParams.SwDynLDS = new GlobalVariable(
368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
370 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
371 markUsedByKernel(Func, LDSParams.SwDynLDS);
372 GlobalValue::SanitizerMetadata MD;
373 MD.NoAddress = true;
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375 }
376
populateSwLDSAttributeAndMetadata(Function * Func)377 void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
380 uint32_t Offset = LDSParams.LDSSize;
381 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
382 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
385 }
386
void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
  // Create new metadata global for every kernel and initialize the
  // start offsets and sizes corresponding to each LDS accesses.
  // Each entry is a {i32 offset, i32 size, i32 aligned-size} struct; the
  // running MallocSize accumulates offsets (including asan redzones), so the
  // order of the buildInitializerForSwLDSMD calls below is significant and
  // must match populateLDSToReplacementIndicesMap.
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  auto &Ctx = M.getContext();
  auto &DL = M.getDataLayout();
  std::vector<Type *> Items;
  Type *Int32Ty = IRB.getInt32Ty();
  std::vector<Constant *> Initializers;
  Align MaxAlignment(1);
  // Track the maximum alignment over every LDS global the kernel reaches;
  // all entries are padded out to this alignment.
  auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
    Align GVAlign = AMDGPU::getAlign(DL, GV);
    MaxAlignment = std::max(MaxAlignment, GVAlign);
  };

  for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
    UpdateMaxAlignment(GV);

  for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
    UpdateMaxAlignment(GV);

  //{StartOffset, AlignedSizeInBytes}
  SmallString<128> MDItemStr;
  raw_svector_ostream MDItemOS(MDItemStr);
  MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";

  StructType *LDSItemTy =
      StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
  uint32_t &MallocSize = LDSParams.MallocSize;
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  int AsanScale = AsanInfo.Scale;
  // Append one metadata entry per not-yet-seen LDS global, advancing
  // MallocSize past the global's storage, its asan redzone, and alignment
  // padding.
  auto buildInitializerForSwLDSMD =
      [&](SetVector<GlobalVariable *> &LDSGlobals) {
        for (auto &GV : LDSGlobals) {
          if (is_contained(UniqueLDSGlobals, GV))
            continue;
          UniqueLDSGlobals.insert(GV);

          Type *Ty = GV->getValueType();
          const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
          Items.push_back(LDSItemTy);
          Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
          Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
          // Get redzone size corresponding a size.
          const uint64_t RightRedzoneSize =
              AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
          // Update MallocSize with current size and redzone size.
          MallocSize += SizeInBytes;
          // Dynamic LDS redzones cannot be placed here: their size is only
          // known at runtime.
          if (!AMDGPU::isDynamicLDS(*GV))
            LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
                                                              RightRedzoneSize);
          MallocSize += RightRedzoneSize;
          // Align current size plus redzone.
          uint64_t AlignedSize =
              alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
          Constant *AlignedSizeInBytesConst =
              ConstantInt::get(Int32Ty, AlignedSize);
          // Align MallocSize
          MallocSize = alignTo(MallocSize, MaxAlignment);
          Constant *InitItem =
              ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
                                              AlignedSizeInBytesConst});
          Initializers.push_back(InitItem);
        }
      };
  // Entry 0 describes the SW LDS pointer itself; static entries precede
  // dynamic ones so dynamic LDS starts after all static allocation.
  SetVector<GlobalVariable *> SwLDSVector;
  SwLDSVector.insert(LDSParams.SwLDS);
  buildInitializerForSwLDSMD(SwLDSVector);
  buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
  buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);

  // Update the LDS size used by the kernel.
  Type *Ty = LDSParams.SwLDS->getValueType();
  const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
  uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
  LDSParams.LDSSize = AlignedSize;
  SmallString<128> MDTypeStr;
  raw_svector_ostream MDTypeOS(MDTypeStr);
  MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
  StructType *MetadataStructType =
      StructType::create(Ctx, Items, MDTypeOS.str());
  SmallString<128> MDStr;
  raw_svector_ostream MDOS(MDStr);
  MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
  LDSParams.SwLDSMetadata = new GlobalVariable(
      M, MetadataStructType, false, GlobalValue::InternalLinkage,
      PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
      GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false);
  Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
  LDSParams.SwLDSMetadata->setInitializer(data);
  assert(LDSParams.SwLDS);
  // Set the alignment to MaxAlignment for SwLDS.
  LDSParams.SwLDS->setAlignment(MaxAlignment);
  if (LDSParams.SwDynLDS)
    LDSParams.SwDynLDS->setAlignment(MaxAlignment);
  // Tell asan not to instrument accesses to the metadata global.
  GlobalValue::SanitizerMetadata MD;
  MD.NoAddress = true;
  LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
}
494
populateLDSToReplacementIndicesMap(Function * Func)495 void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496 // Fill the corresponding LDS replacement indices for each LDS access
497 // related to this kernel.
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499 SetVector<GlobalVariable *> UniqueLDSGlobals;
500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501 uint32_t &Idx) {
502 for (auto &GV : LDSGlobals) {
503 if (is_contained(UniqueLDSGlobals, GV))
504 continue;
505 UniqueLDSGlobals.insert(GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507 ++Idx;
508 }
509 };
510 uint32_t Idx = 0;
511 SetVector<GlobalVariable *> SwLDSVector;
512 SwLDSVector.insert(LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector, Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518 }
519
replacesUsesOfGlobalInFunction(Function * Func,GlobalVariable * GV,Value * Replacement)520 static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521 Value *Replacement) {
522 // Replace all uses of LDS global in this Function with a Replacement.
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524 auto *V = U.getUser();
525 if (auto *Inst = dyn_cast<Instruction>(V)) {
526 auto *Func1 = Inst->getParent()->getParent();
527 if (Func == Func1)
528 return true;
529 }
530 return false;
531 };
532 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
533 }
534
void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
  // Replace each directly-accessed LDS global in this kernel with
  // SwLDS + offset, where the offset is loaded from the global's entry in the
  // metadata struct. Instructions are emitted at IRB's current insert point.
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  GlobalVariable *SwLDS = LDSParams.SwLDS;
  assert(SwLDS);
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDSMetadata);
  StructType *SwLDSMetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  Type *Int32Ty = IRB.getInt32Ty();
  auto &IndirectAccess = LDSParams.IndirectAccess;
  auto &DirectAccess = LDSParams.DirectAccess;
  // Replace all uses of LDS global in this Function with a Replacement.
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
    for (auto &GV : LDSGlobals) {
      // Do not generate instructions if LDS access is in non-kernel
      // i.e indirect-access.
      if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
           IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
          (!DirectAccess.StaticLDSGlobals.contains(GV) &&
           !DirectAccess.DynamicLDSGlobals.contains(GV)))
        continue;
      if (is_contained(UniqueLDSGlobals, GV))
        continue;
      UniqueLDSGlobals.insert(GV);
      // {0, Idx, 0} indices into the metadata struct, assigned earlier by
      // populateLDSToReplacementIndicesMap.
      auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
      assert(Indices.size() == 3);
      Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
                            ConstantInt::get(Int32Ty, Indices[1]),
                            ConstantInt::get(Int32Ty, Indices[2])};
      Constant *GEP = ConstantExpr::getGetElementPtr(
          SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
      // Load the offset field and form SwLDS + offset as the replacement.
      Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
      Value *BasePlusOffset =
          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
      LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
                                    false));
      replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
    }
  };
  ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
  ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
  ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
  ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
}
580
void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
    Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
    SetVector<GlobalVariable *> &DynamicLDSGlobals) {
  // For each dynamic LDS global, patch its metadata entry at runtime:
  // offset := running malloc size, size := the hidden dynamic-LDS-size kernel
  // argument, aligned size := that size rounded up to SwLDS alignment. The
  // running malloc size (*CurrMallocSize) is advanced by the aligned size.
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  Type *Int32Ty = IRB.getInt32Ty();

  GlobalVariable *SwLDS = LDSParams.SwLDS;
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDS && SwLDSMetadata);
  StructType *MetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  unsigned MaxAlignment = SwLDS->getAlignment();
  Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
  Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);

  for (GlobalVariable *DynGV : DynamicLDSGlobals) {
    auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
    // Update the Offset metadata.
    Constant *Index0 = ConstantInt::get(Int32Ty, 0);
    Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);

    // Field 0 of the metadata entry: start offset.
    Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
    auto *GEPForOffset = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});

    IRB.CreateStore(*CurrMallocSize, GEPForOffset);
    // Update the size and Aligned Size metadata.
    Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
    auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                                             {Index0, Index1, Index2Size});

    Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
    IRB.CreateStore(CurrDynLDSSize, GEPForSize);
    Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
    auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});

    // Round the runtime size up to MaxAlignment:
    // aligned = ((size + align - 1) / align) * align.
    Value *AlignedDynLDSSize =
        IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
    AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
    AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
    IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);

    // Update the Current Malloc Size
    *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
  }
}
628
getOrCreateDebugLoc(const Instruction * InsertBefore,DISubprogram * SP)629 static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630 DISubprogram *SP) {
631 assert(InsertBefore);
632 if (InsertBefore->getDebugLoc())
633 return InsertBefore->getDebugLoc();
634 if (SP)
635 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
636 return DebugLoc();
637 }
638
getLDSMemoryInstructions(Function * Func,SetVector<Instruction * > & LDSInstructions)639 void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640 Function *Func, SetVector<Instruction *> &LDSInstructions) {
641 for (BasicBlock &BB : *Func) {
642 for (Instruction &Inst : BB) {
643 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
644 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645 LDSInstructions.insert(&Inst);
646 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(&Inst);
649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(&Inst);
652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(&Inst);
655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658 LDSInstructions.insert(&Inst);
659 } else
660 continue;
661 }
662 }
663 }
664
getTranslatedGlobalMemoryPtrOfLDS(Value * LoadMallocPtr,Value * LDSPtr)665 Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
666 Value *LDSPtr) {
667 assert(LDSPtr && "Invalid LDS pointer operand");
668 Type *LDSPtrType = LDSPtr->getType();
669 LLVMContext &Ctx = M.getContext();
670 const DataLayout &DL = M.getDataLayout();
671 Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
672 if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
673 // Handle vector of pointers
674 ElementCount NumElements = VecPtrTy->getElementCount();
675 IntTy = VectorType::get(IntTy, NumElements);
676 }
677 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
678 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
679 }
680
void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
    Function *Func, Value *LoadMallocPtr,
    SetVector<Instruction *> &LDSInstructions) {
  // Rewrite each collected LDS instruction to an equivalent global-memory
  // instruction whose pointer is LoadMallocPtr + (LDS ptr as offset). The
  // original instruction is RAUW'ed and erased; the replacements (except
  // addrspace casts) are queued for asan instrumentation.
  LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
                    << Func->getName());
  for (Instruction *Inst : LDSInstructions) {
    IRB.SetInsertPoint(Inst);
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
      Value *LIOperand = LI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
      // Preserve alignment, volatility and atomic ordering of the original.
      LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
                                              LI->getAlign(), LI->isVolatile());
      NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewLI);
      LI->replaceAllUsesWith(NewLI);
      LI->eraseFromParent();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      Value *SIOperand = SI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
      StoreInst *NewSI = IRB.CreateAlignedStore(
          SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
      NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewSI);
      SI->replaceAllUsesWith(NewSI);
      SI->eraseFromParent();
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
      Value *RMWPtrOperand = RMW->getPointerOperand();
      Value *RMWValOperand = RMW->getValOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
      AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
          RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
          RMW->getOrdering(), RMW->getSyncScopeID());
      NewRMW->setVolatile(RMW->isVolatile());
      AsanInfo.Instructions.insert(NewRMW);
      RMW->replaceAllUsesWith(NewRMW);
      RMW->eraseFromParent();
    } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
      Value *XCHGPtrOperand = XCHG->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
      AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
          Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
          XCHG->getAlign(), XCHG->getSuccessOrdering(),
          XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
      NewXCHG->setVolatile(XCHG->isVolatile());
      AsanInfo.Instructions.insert(NewXCHG);
      XCHG->replaceAllUsesWith(NewXCHG);
      XCHG->eraseFromParent();
    } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
      Value *AIOperand = ASC->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
      Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
      // Note: No need to add the instruction to AsanInfo instructions to be
      // instrumented list. FLAT_ADDRESS ptr would have been already
      // instrumented by asan pass prior to this pass.
      ASC->replaceAllUsesWith(NewAI);
      ASC->eraseFromParent();
    } else
      report_fatal_error("Unimplemented LDS lowering instruction");
  }
}
746
poisonRedzones(Function * Func,Value * MallocPtr)747 void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
749 Type *Int64Ty = IRB.getInt64Ty();
750 Type *VoidTy = IRB.getVoidTy();
751 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
752 "__asan_poison_region",
753 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
754
755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
756 size_t VecSize = RedzonesVec.size();
757 for (unsigned i = 0; i < VecSize; i++) {
758 auto &RedzonePair = RedzonesVec[i];
759 uint64_t RedzoneOffset = RedzonePair.first;
760 uint64_t RedzoneSize = RedzonePair.second;
761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
762 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
763 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
764 IRB.CreateCall(AsanPoisonRegion,
765 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
766 }
767 }
768
// Lower all LDS accesses in kernel \p Func to dynamically allocated global
// memory. Restructures the kernel CFG as:
//   WId -> (Malloc | PrevEntry), Malloc -> PrevEntry,
//   ... returns redirected ... -> CondFree -> (Free | End) -> End
// Work-item {0,0,0} mallocs the buffer and poisons redzones; all work items
// rendezvous at a barrier before using the buffer and again before work-item
// {0,0,0} frees it. \p DTU receives the corresponding DomTree updates.
void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
                                              DomTreeUpdater &DTU) {
  LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  auto &Ctx = M.getContext();
  auto *PrevEntryBlock = &Func->getEntryBlock();
  // Collect the LDS memory instructions before the CFG is rewritten.
  SetVector<Instruction *> LDSInstructions;
  getLDSMemoryInstructions(Func, LDSInstructions);

  // Create malloc block.
  auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);

  // Create WIdBlock block which has instructions related to selection of
  // {0,0,0} indiex work item in the work group.
  auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
  IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
  DebugLoc FirstDL =
      getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
  IRB.SetCurrentDebugLocation(FirstDL);
  // (x | y | z) == 0 selects exactly the {0,0,0} work item.
  Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
  Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
  Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
  Value *XYOr = IRB.CreateOr(WIdx, WIdy);
  Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
  Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));

  // All work items will branch to PrevEntryBlock except {0,0,0} index
  // work item which will branch to malloc block.
  IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);

  // Malloc block
  IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());

  // If Dynamic LDS globals are accessed by the kernel,
  // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
  // Update the corresponding metadata global entries for this dyn lds global.
  GlobalVariable *SwLDS = LDSParams.SwLDS;
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDS && SwLDSMetadata);
  StructType *MetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  uint32_t MallocSize = 0;
  Value *CurrMallocSize;
  Type *Int32Ty = IRB.getInt32Ty();
  Type *Int64Ty = IRB.getInt64Ty();

  // Collect the unique static LDS globals (direct and indirect) to find the
  // metadata entry of the last static allocation.
  SetVector<GlobalVariable *> UniqueLDSGlobals;
  auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
    for (auto &GV : LDSGlobals) {
      if (is_contained(UniqueLDSGlobals, GV))
        continue;
      UniqueLDSGlobals.insert(GV);
    }
  };

  GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
  GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
  unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
  UniqueLDSGlobals.clear();

  // NOTE(review): NumStaticLDS is always >= 1 by construction (1 + count),
  // so the else branch below appears unreachable — confirm intent.
  if (NumStaticLDS) {
    // Total static size = offset of the last metadata entry + its size
    // (metadata row layout: {offset, <field 1>, size} — indices 0 and 2).
    auto *GEPForEndStaticLDSOffset =
        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                              {ConstantInt::get(Int32Ty, 0),
                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
                               ConstantInt::get(Int32Ty, 0)});

    auto *GEPForEndStaticLDSSize =
        IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                              {ConstantInt::get(Int32Ty, 0),
                               ConstantInt::get(Int32Ty, NumStaticLDS - 1),
                               ConstantInt::get(Int32Ty, 2)});

    Value *EndStaticLDSOffset =
        IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
    Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
    CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
  } else
    CurrMallocSize = IRB.getInt32(MallocSize);

  if (LDSParams.SwDynLDS) {
    // The hidden dyn_lds_size argument only exists from code object v5 on.
    if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5))
      report_fatal_error(
          "Dynamic LDS size query is only supported for CO V5 and later.");
    // Get size from hidden dyn_lds_size argument of kernel
    Value *ImplicitArg =
        IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
    Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
        ImplicitArg->getType(), ImplicitArg,
        {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
    UniqueLDSGlobals.clear();
    GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
    GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
    updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
                                  UniqueLDSGlobals);
  }

  // Malloc takes a 64-bit size.
  CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);

  // Create a call to malloc function which does device global memory allocation
  // with size equals to all LDS global accesses size in this kernel.
  Value *ReturnAddress =
      IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)});
  FunctionCallee MallocFunc = M.getOrInsertFunction(
      StringRef("__asan_malloc_impl"),
      FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
  Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
  Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});

  Value *MallocPtr =
      IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS));

  // Create store of malloc to new global
  IRB.CreateStore(MallocPtr, SwLDS);

  // Create calls to __asan_poison_region to poison redzones.
  poisonRedzones(Func, MallocPtr);

  // Create branch to PrevEntryBlock
  IRB.CreateBr(PrevEntryBlock);

  // Create wave-group barrier at the starting of Previous entry block
  Type *Int1Ty = IRB.getInt1Ty();
  IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
  // Phi records which work item performed the malloc; it later selects who
  // performs the free.
  auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
  XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
  XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);

  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});

  // Load malloc pointer from Sw LDS.
  Value *LoadMallocPtr =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS);

  // Replace All uses of LDS globals with new LDS pointers.
  replaceKernelLDSAccesses(Func);

  // Replace Memory Operations on LDS with corresponding
  // global memory pointers.
  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
                                             LDSInstructions);

  // Redirect every return through the free path.
  auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
  auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
  auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
  for (BasicBlock &BB : *Func) {
    if (!BB.empty()) {
      if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
        RI->eraseFromParent();
        IRB.SetInsertPoint(&BB, BB.end());
        IRB.CreateBr(CondFreeBlock);
      }
    }
  }

  // Cond Free Block
  IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
  // Barrier ensures all work items are done with the buffer before free.
  IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
  IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);

  // Free Block
  IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());

  // Free the previously allocate device global memory.
  FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
      StringRef("__asan_free_impl"),
      FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
  Value *ReturnAddr =
      IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0));
  Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
  Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
  IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});

  IRB.CreateBr(EndBlock);

  // End Block
  IRB.SetInsertPoint(EndBlock, EndBlock->begin());
  IRB.CreateRetVoid();
  // Update the DomTree with corresponding links to basic blocks.
  DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
                    {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
                    {DominatorTree::Insert, CondFreeBlock, FreeBlock},
                    {DominatorTree::Insert, FreeBlock, EndBlock}});
}
953
getAddressesOfVariablesInKernel(Function * Func,SetVector<GlobalVariable * > & Variables)954 Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
955 Function *Func, SetVector<GlobalVariable *> &Variables) {
956 Type *Int32Ty = IRB.getInt32Ty();
957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
958
959 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
960 assert(SwLDSMetadata);
961 auto *SwLDSMetadataStructType =
962 cast<StructType>(SwLDSMetadata->getValueType());
963 ArrayType *KernelOffsetsType =
964 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size());
965
966 SmallVector<Constant *> Elements;
967 for (auto *GV : Variables) {
968 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
970 Elements.push_back(
971 PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)));
972 continue;
973 }
974 auto &Indices = It->second;
975 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
976 ConstantInt::get(Int32Ty, Indices[1]),
977 ConstantInt::get(Int32Ty, Indices[2])};
978 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
979 SwLDSMetadata, GEPIdx, true);
980 Elements.push_back(GEP);
981 }
982 return ConstantArray::get(KernelOffsetsType, Elements);
983 }
984
buildNonKernelLDSBaseTable(NonKernelLDSParameters & NKLDSParams)985 void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
986 NonKernelLDSParameters &NKLDSParams) {
987 // Base table will have single row, with elements of the row
988 // placed as per kernel ID. Each element in the row corresponds
989 // to addresss of "SW LDS" global of the kernel.
990 auto &Kernels = NKLDSParams.OrderedKernels;
991 if (Kernels.empty())
992 return;
993 Type *Int32Ty = IRB.getInt32Ty();
994 const size_t NumberKernels = Kernels.size();
995 ArrayType *AllKernelsOffsetsType =
996 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
997 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
998 for (size_t i = 0; i < NumberKernels; i++) {
999 Function *Func = Kernels[i];
1000 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1001 GlobalVariable *SwLDS = LDSParams.SwLDS;
1002 assert(SwLDS);
1003 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
1004 Constant *GEP =
1005 ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
1006 OverallConstantExprElts[i] = GEP;
1007 }
1008 Constant *init =
1009 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
1010 NKLDSParams.LDSBaseTable = new GlobalVariable(
1011 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1012 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1013 AMDGPUAS::GLOBAL_ADDRESS);
1014 GlobalValue::SanitizerMetadata MD;
1015 MD.NoAddress = true;
1016 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1017 }
1018
buildNonKernelLDSOffsetTable(NonKernelLDSParameters & NKLDSParams)1019 void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1020 NonKernelLDSParameters &NKLDSParams) {
1021 // Offset table will have multiple rows and columns.
1022 // Rows are assumed to be from 0 to (n-1). n is total number
1023 // of kernels accessing the LDS through non-kernels.
1024 // Each row will have m elements. m is the total number of
1025 // unique LDS globals accessed by non-kernels.
1026 // Each element in the row correspond to the address of
1027 // the replacement of LDS global done by that particular kernel.
1028 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1029 auto &Kernels = NKLDSParams.OrderedKernels;
1030 if (Variables.empty() || Kernels.empty())
1031 return;
1032 const size_t NumberVariables = Variables.size();
1033 const size_t NumberKernels = Kernels.size();
1034
1035 ArrayType *KernelOffsetsType =
1036 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1037
1038 ArrayType *AllKernelsOffsetsType =
1039 ArrayType::get(KernelOffsetsType, NumberKernels);
1040 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1041 for (size_t i = 0; i < NumberKernels; i++) {
1042 Function *Func = Kernels[i];
1043 overallConstantExprElts[i] =
1044 getAddressesOfVariablesInKernel(Func, Variables);
1045 }
1046 Constant *Init =
1047 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1048 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1049 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1050 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1051 AMDGPUAS::GLOBAL_ADDRESS);
1052 GlobalValue::SanitizerMetadata MD;
1053 MD.NoAddress = true;
1054 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1055 }
1056
// Lower LDS accesses in a non-kernel function \p Func. The replacement
// pointer for each LDS global is computed at runtime from two tables:
// the base table (SW LDS of the calling kernel, selected by
// llvm.amdgcn.lds.kernel.id) and the offset table (per-kernel offset of
// each LDS global). All new instructions are emitted at the entry block's
// first safe insertion point.
void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
    Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
    NonKernelLDSParameters &NKLDSParams) {
  // Replace LDS access in non-kernel with replacement queried from
  // Base table and offset from offset table.
  LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
                    << Func->getName());
  auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
  IRB.SetInsertPoint(InsertAt);

  // Get LDS memory instructions.
  SetVector<Instruction *> LDSInstructions;
  getLDSMemoryInstructions(Func, LDSInstructions);

  // Kernel ID of the (indirect) caller indexes both tables.
  auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
  GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
  GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
  auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
  // Load the caller kernel's SW LDS pointer, then the malloc'ed global
  // buffer pointer stored at its start.
  Value *BaseGEP = IRB.CreateInBoundsGEP(
      LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
  Value *BaseLoad =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
  Value *LoadMallocPtr =
      IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);

  for (GlobalVariable *GV : LDSGlobals) {
    // Column index of GV in the offset table is its position in the
    // ordered global list.
    const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
    assert(GVIt != OrdereLDSGlobals.end());
    uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);

    // Double indirection: load the slot address, then the offset held in it.
    Value *OffsetGEP = IRB.CreateInBoundsGEP(
        LDSOffsetTable->getValueType(), LDSOffsetTable,
        {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
    Value *OffsetLoad =
        IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
    Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
    Value *BasePlusOffset =
        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
    LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
                      << GV->getName());
    replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
  }
  // Finally retarget the memory operations themselves at the global buffer.
  translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
                                             LDSInstructions);
}
1102
reorderStaticDynamicIndirectLDSSet(KernelLDSParameters & LDSParams)1103 static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1104 // Sort Static, dynamic LDS globals which are either
1105 // direct or indirect access on basis of name.
1106 auto &DirectAccess = LDSParams.DirectAccess;
1107 auto &IndirectAccess = LDSParams.IndirectAccess;
1108 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1109 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1110 DirectAccess.StaticLDSGlobals.end()));
1111 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1112 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1113 DirectAccess.DynamicLDSGlobals.end()));
1114 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1115 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1116 IndirectAccess.StaticLDSGlobals.end()));
1117 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1118 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1119 IndirectAccess.DynamicLDSGlobals.end()));
1120 }
1121
initAsanInfo()1122 void AMDGPUSwLowerLDS::initAsanInfo() {
1123 // Get Shadow mapping scale and offset.
1124 unsigned LongSize =
1125 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
1126 uint64_t Offset;
1127 int Scale;
1128 bool OrShadowOffset;
1129 llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false,
1130 &Offset, &Scale, &OrShadowOffset);
1131 AsanInfo.Scale = Scale;
1132 AsanInfo.Offset = Offset;
1133 }
1134
hasFnWithSanitizeAddressAttr(FunctionVariableMap & LDSAccesses)1135 static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1136 for (auto &K : LDSAccesses) {
1137 Function *F = K.first;
1138 if (!F)
1139 continue;
1140 if (F->hasFnAttribute(Attribute::SanitizeAddress))
1141 return true;
1142 }
1143 return false;
1144 }
1145
// Driver for the whole lowering: gathers LDS usage, lowers kernels, then
// non-kernels, removes the dead original LDS globals, and finally applies
// ASan instrumentation to the newly created memory operations. Returns
// true if the module was modified.
bool AMDGPUSwLowerLDS::run() {
  bool Changed = false;

  CallGraph CG = CallGraph(M);

  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

  // Get all the direct and indirect access of LDS for all the kernels.
  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

  // Flag to decide whether to lower all the LDS accesses
  // based on sanitize_address attribute.
  bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) ||
                     hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access);

  if (!LowerAllLDS)
    return Changed;

  // Utility to group LDS access into direct, indirect, static and dynamic.
  auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
                                            bool DirectAccess) {
    for (auto &K : LDSAccesses) {
      Function *F = K.first;
      if (!F || K.second.empty())
        continue;

      assert(isKernelLDS(F));

      // Only inserts if key isn't already in the map.
      FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
          {F, KernelLDSParameters()});

      auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
      if (!DirectAccess)
        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
      for (GlobalVariable *GV : K.second) {
        if (!DirectAccess) {
          if (AMDGPU::isDynamicLDS(*GV))
            LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
          else
            LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
          FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
        } else {
          if (AMDGPU::isDynamicLDS(*GV))
            LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
          else
            LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
        }
      }
    }
  };

  PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
  PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);

  // Get address sanitizer scale.
  initAsanInfo();

  // Lower each kernel that accesses any LDS.
  for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
    Function *Func = K.first;
    auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
    if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
        LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
        LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
        LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
      // NOTE(review): this overwrites any earlier true value of Changed;
      // it appears unreachable since map entries are only created for
      // kernels with a non-empty access set — confirm.
      Changed = false;
    } else {
      // The lowering introduces workitem-id and heap-pointer uses in code
      // reachable from this kernel, so drop the corresponding "no-*" hints.
      removeFnAttrFromReachable(
          CG, Func,
          {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
           "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
      if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
          !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
        removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"});
      reorderStaticDynamicIndirectLDSSet(LDSParams);
      buildSwLDSGlobal(Func);
      buildSwDynLDSGlobal(Func);
      populateSwMetadataGlobal(Func);
      populateSwLDSAttributeAndMetadata(Func);
      populateLDSToReplacementIndicesMap(Func);
      DomTreeUpdater DTU(DTCallback(*Func),
                         DomTreeUpdater::UpdateStrategy::Lazy);
      lowerKernelLDSAccesses(Func, DTU);
      Changed = true;
    }
  }

  // Get the Uses of LDS from non-kernels.
  getUsesOfLDSByNonKernels();

  // Get non-kernels with LDS ptr as argument and called by kernels.
  getNonKernelsWithLDSArguments(CG);

  // Lower LDS accesses in non-kernels.
  if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
      !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
    NonKernelLDSParameters NKLDSParams;
    NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
        FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
    NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
        FuncLDSAccessInfo.AllNonKernelLDSAccess);
    buildNonKernelLDSBaseTable(NKLDSParams);
    buildNonKernelLDSOffsetTable(NKLDSParams);
    for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
      Function *Func = K.first;
      DenseSet<GlobalVariable *> &LDSGlobals = K.second;
      // Sort by name for deterministic lowering order.
      SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
          std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
      lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
    }
    // Functions receiving LDS pointers as arguments still need their memory
    // operations translated, even with no direct LDS global accesses.
    for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
      auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
      if (K.contains(Func))
        continue;
      SetVector<llvm::GlobalVariable *> Vec;
      lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
    }
    Changed = true;
  }

  if (!Changed)
    return Changed;

  // Erase the original LDS globals, which are now fully replaced.
  for (auto &GV : make_early_inc_range(M.globals())) {
    if (AMDGPU::isLDSVariableToLower(GV)) {
      // probably want to remove from used lists
      GV.removeDeadConstantUsers();
      if (GV.use_empty())
        GV.eraseFromParent();
    }
  }

  // Instrument the translated memory operations for ASan using the shadow
  // parameters cached by initAsanInfo().
  if (AsanInstrumentLDS) {
    SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
    for (Instruction *Inst : AsanInfo.Instructions) {
      SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
      getInterestingMemoryOperands(M, Inst, InterestingOperands);
      llvm::append_range(OperandsToInstrument, InterestingOperands);
    }
    for (auto &Operand : OperandsToInstrument) {
      Value *Addr = Operand.getPtr();
      instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
                        Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
                        Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
                        AsanInfo.Offset);
      Changed = true;
    }
  }

  return Changed;
}
1297
// Legacy pass-manager wrapper around the AMDGPUSwLowerLDS implementation.
class AMDGPUSwLowerLDSLegacy : public ModulePass {
public:
  // Target machine; may be null at construction, in which case it is
  // resolved from TargetPassConfig inside runOnModule.
  const AMDGPUTargetMachine *AMDGPUTM;
  static char ID;
  AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
      : ModulePass(ID), AMDGPUTM(TM) {}
  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Dominator trees are kept up to date via DomTreeUpdater during
    // lowering, so they can be preserved.
    AU.addPreserved<DominatorTreeWrapperPass>();
  }
};
1309 } // namespace
1310
char AMDGPUSwLowerLDSLegacy::ID = 0;
char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;

// Register the legacy pass. TargetPassConfig is listed as a dependency so
// that runOnModule can recover the AMDGPUTargetMachine when the pass was
// created without one.
INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
                      "AMDGPU Software lowering of LDS", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
                    "AMDGPU Software lowering of LDS", false, false)
1319
1320 bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1321 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1322 // instrumented the IR. Return early if the flag is not present.
1323 if (!M.getModuleFlag("nosanitize_address"))
1324 return false;
1325 DominatorTreeWrapperPass *const DTW =
1326 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1327 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1328 return DTW ? &DTW->getDomTree() : nullptr;
1329 };
1330 if (!AMDGPUTM) {
1331 auto &TPC = getAnalysis<TargetPassConfig>();
1332 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1333 }
1334 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1335 bool IsChanged = SwLowerLDSImpl.run();
1336 return IsChanged;
1337 }
1338
// Factory for the legacy pass. \p TM may be null; in that case the target
// machine is obtained from TargetPassConfig during runOnModule.
ModulePass *
llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) {
  return new AMDGPUSwLowerLDSLegacy(TM);
}
1343
run(Module & M,ModuleAnalysisManager & AM)1344 PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M,
1345 ModuleAnalysisManager &AM) {
1346 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1347 // instrumented the IR. Return early if the flag is not present.
1348 if (!M.getModuleFlag("nosanitize_address"))
1349 return PreservedAnalyses::all();
1350 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1351 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1352 return &FAM.getResult<DominatorTreeAnalysis>(F);
1353 };
1354 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1355 bool IsChanged = SwLowerLDSImpl.run();
1356 if (!IsChanged)
1357 return PreservedAnalyses::all();
1358
1359 PreservedAnalyses PA;
1360 PA.preserve<DominatorTreeAnalysis>();
1361 return PA;
1362 }
1363