//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers the local data store, LDS, uses in kernel and non-kernel
// functions in module to use dynamically allocated global memory.
// Packed LDS Layout is emulated in the global memory.
// The lowered memory instructions from LDS to global memory are then
// instrumented for address sanitizer, to catch addressing errors.
// This pass only works when address sanitizer has been enabled and has
// instrumented the IR. It identifies that IR has been instrumented using
// the "nosanitize_address" module flag.
//
// Replacement of Kernel LDS accesses:
//    For a kernel, LDS access can be static or dynamic which are direct
//    (accessed within kernel) and indirect (accessed through non-kernels).
//    All these LDS accesses corresponding to kernel will be packed together,
//    where all static LDS accesses will be allocated first and then dynamic
//    LDS follows. The total size with alignment is calculated. A new LDS global
//    will be created for the kernel called "SW LDS", and the kernel will have
//    the attribute "amdgpu-lds-size" attached with value of the size
//    calculated. All the LDS accesses in the module will be replaced by GEP
//    with offset into the "SW LDS".
//    A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
//    the dynamic LDS. This will be marked used by kernel and will have
//    MD_absolute_symbol metadata set to total static LDS size, since dynamic
//    LDS allocation starts after all static LDS allocation.
//
//    A device global memory equal to the total LDS size will be allocated.
//    At the prologue of the kernel, a single work-item from the
//    work-group does a "malloc" and stores the pointer of the
//    allocation in "SW LDS".
//
//    To store the offsets corresponding to all LDS accesses, another global
//    variable is created which will be called "SW LDS metadata" in this pass.
//    - SW LDS Global:
//        It is LDS global of ptr type with name
//        "llvm.amdgcn.sw.lds.<kernel-name>".
//    - Metadata Global:
//        It is of struct type, with n members. n equals the number of LDS
//        globals accessed by the kernel (direct and indirect), plus one
//        leading entry for the "SW LDS" global itself. Each member of
//        struct is another struct of type {i32, i32, i32}. First member
//        corresponds to offset, second member corresponds to size of LDS global
//        being replaced and third represents the total aligned size. It will
//        have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
//        an initializer with static LDS related offsets and sizes initialized.
//        But for dynamic LDS related entries, offsets will be initialized to
//        previous static LDS allocation end offset. Sizes for them will be zero
//        initially. These dynamic LDS offset and size values will be updated
//        within the kernel, since kernel can read the dynamic LDS size
//        allocation done at runtime with query to "hidden_dynamic_lds_size"
//        hidden kernel argument.
//
//    At the epilogue of kernel, the allocated memory is freed by the same
//    single work-item.
//
// Replacement of non-kernel LDS accesses:
//    Multiple kernels can access the same non-kernel function.
//    All the kernels accessing LDS through non-kernels are sorted and
//    assigned a kernel-id. All the LDS globals accessed by non-kernels
//    are sorted. This information is used to build two tables:
//    - Base table:
//        Base table will have single row, with elements of the row
//        placed as per kernel ID.
Each element in the row corresponds 69 // to ptr of "SW LDS" variable created for that kernel. 70 // - Offset table: 71 // Offset table will have multiple rows and columns. 72 // Rows are assumed to be from 0 to (n-1). n is total number 73 // of kernels accessing the LDS through non-kernels. 74 // Each row will have m elements. m is the total number of 75 // unique LDS globals accessed by all non-kernels. 76 // Each element in the row correspond to the ptr of 77 // the replacement of LDS global done by that particular kernel. 78 // A LDS variable in non-kernel will be replaced based on the information 79 // from base and offset tables. Based on kernel-id query, ptr of "SW 80 // LDS" for that corresponding kernel is obtained from base table. 81 // The Offset into the base "SW LDS" is obtained from 82 // corresponding element in offset table. With this information, replacement 83 // value is obtained. 84 //===----------------------------------------------------------------------===// 85 86 #include "AMDGPU.h" 87 #include "AMDGPUAsanInstrumentation.h" 88 #include "AMDGPUMemoryUtils.h" 89 #include "AMDGPUTargetMachine.h" 90 #include "llvm/ADT/DenseMap.h" 91 #include "llvm/ADT/DenseSet.h" 92 #include "llvm/ADT/SetVector.h" 93 #include "llvm/ADT/StringExtras.h" 94 #include "llvm/ADT/StringRef.h" 95 #include "llvm/Analysis/CallGraph.h" 96 #include "llvm/Analysis/DomTreeUpdater.h" 97 #include "llvm/CodeGen/TargetPassConfig.h" 98 #include "llvm/IR/Constants.h" 99 #include "llvm/IR/DIBuilder.h" 100 #include "llvm/IR/DebugInfo.h" 101 #include "llvm/IR/DebugInfoMetadata.h" 102 #include "llvm/IR/IRBuilder.h" 103 #include "llvm/IR/Instructions.h" 104 #include "llvm/IR/IntrinsicsAMDGPU.h" 105 #include "llvm/IR/MDBuilder.h" 106 #include "llvm/IR/ReplaceConstant.h" 107 #include "llvm/Pass.h" 108 #include "llvm/Support/raw_ostream.h" 109 #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" 110 #include "llvm/Transforms/Utils/ModuleUtils.h" 111 112 #include <algorithm> 
// Debug category for the LLVM_DEBUG output emitted from this file.
#define DEBUG_TYPE "amdgpu-sw-lower-lds"
// NOTE(review): presumably the position of the hidden dynamic-LDS-size
// argument in the code object V5 implicit argument area; the use of this
// macro is outside the visible part of the file - confirm.
#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15

using namespace llvm;
using namespace AMDGPU;

namespace {

// Hidden command-line switch, enabled by default, that controls whether asan
// instrumentation is run on the LDS instructions lowered to global memory.
cl::opt<bool>
    AsanInstrumentLDS("amdgpu-asan-instrument-lds",
                      cl::desc("Run asan instrumentation on LDS instructions "
                               "lowered to global memory"),
                      cl::init(true), cl::Hidden);

// Callable that takes a Function and returns a pointer to its DominatorTree.
using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;

// LDS globals of one access category, split into static and dynamic LDS.
struct LDSAccessTypeInfo {
  SetVector<GlobalVariable *> StaticLDSGlobals;
  SetVector<GlobalVariable *> DynamicLDSGlobals;
};

// Struct to hold all the Metadata required for a kernel
// to replace a LDS global uses with corresponding offset
// in to device global memory.
struct KernelLDSParameters {
  // "llvm.amdgcn.sw.lds.<kernel>" global created by buildSwLDSGlobal.
  GlobalVariable *SwLDS = nullptr;
  // "llvm.amdgcn.<kernel>.dynlds" global created by buildSwDynLDSGlobal;
  // stays null when the kernel has no dynamic LDS globals.
  GlobalVariable *SwDynLDS = nullptr;
  // "llvm.amdgcn.sw.lds.<kernel>.md" global created by
  // populateSwMetadataGlobal.
  GlobalVariable *SwLDSMetadata = nullptr;
  // LDS globals used inside the kernel itself.
  LDSAccessTypeInfo DirectAccess;
  // LDS globals reached by the kernel through non-kernel functions.
  LDSAccessTypeInfo IndirectAccess;
  // For each LDS global, the three GEP indices {0, item index, 0} that
  // address its offset field inside the metadata global.
  DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>>
      LDSToReplacementIndicesMap;
  // Running size of the packed layout; advanced by populateSwMetadataGlobal
  // as each item and its redzone is placed.
  uint32_t MallocSize = 0;
  // Alloc size of SwLDS's value type aligned to the maximum LDS alignment;
  // set by populateSwMetadataGlobal.
  uint32_t LDSSize = 0;
  // (offset, size) of the redzone that follows each static item; consumed by
  // poisonRedzones.
  SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
};

// Struct to store information for creation of offset table
// for all the non-kernel LDS accesses.
struct NonKernelLDSParameters {
  // Globals holding the base table and the offset table described in the
  // file header; the code that creates them is outside the visible part of
  // the file.
  GlobalVariable *LDSBaseTable = nullptr;
  GlobalVariable *LDSOffsetTable = nullptr;
  // NOTE(review): presumably the name-sorted kernels and LDS globals produced
  // by getOrderedIndirectLDSAccessingKernels and
  // getOrderedNonKernelAllLDSGlobals - confirm against run().
  SetVector<Function *> OrderedKernels;
  SetVector<GlobalVariable *> OrdereLDSGlobals;
};

// State shared with the asan instrumentation step.
struct AsanInstrumentInfo {
  // Passed to AMDGPU::getRedzoneSizeForGlobal when sizing redzones.
  int Scale = 0;
  uint32_t Offset = 0;
  // Global-memory instructions created while translating LDS accesses.
  SetVector<Instruction *> Instructions;
};

// Module-wide bookkeeping of which functions touch which LDS globals.
struct FunctionsAndLDSAccess {
  // Per-kernel lowering state.
  DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
  // Kernels for which an entry in the LDS base table is required.
  SetVector<Function *> KernelsWithIndirectLDSAccess;
  // Defined non-kernel callees that take an LDS pointer parameter.
  SetVector<Function *> NonKernelsWithLDSArgument;
  // LDS globals scanned by getUsesOfLDSByNonKernels.
  SetVector<GlobalVariable *> AllNonKernelLDSAccess;
  // Non-kernel function -> LDS globals used by its instructions.
  FunctionVariableMap NonKernelToLDSAccessMap;
};

// Implementation of the software LDS lowering for one module.
class AMDGPUSwLowerLDS {
public:
  // The IR builder is bound to the module's context; the module, target
  // machine and dominator-tree callback are stored for later use.
  AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
                   DomTreeCallback Callback)
      : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
  bool run();
  // Fills NonKernelToLDSAccessMap from the users of AllNonKernelLDSAccess.
  void getUsesOfLDSByNonKernels();
  // Records non-kernel callees with LDS pointer parameters and their calling
  // kernels.
  void getNonKernelsWithLDSArguments(const CallGraph &CG);
  // Returns the kernels sorted by name and tags each with its kernel id.
  SetVector<Function *>
  getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
  // Returns the LDS globals sorted by name.
  SetVector<GlobalVariable *>
  getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
  void buildSwLDSGlobal(Function *Func);
  void buildSwDynLDSGlobal(Function *Func);
  void populateSwMetadataGlobal(Function *Func);
  void populateSwLDSAttributeAndMetadata(Function *Func);
  void populateLDSToReplacementIndicesMap(Function *Func);
  void getLDSMemoryInstructions(Function *Func,
                                SetVector<Instruction *> &LDSInstructions);
  void replaceKernelLDSAccesses(Function *Func);
  Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
  void translateLDSMemoryOperationsToGlobalMemory(
      Function *Func, Value *LoadMallocPtr,
      SetVector<Instruction *> &LDSInstructions);
  void poisonRedzones(Function *Func, Value *MallocPtr);
  void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
  void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
  void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
  Constant *
  getAddressesOfVariablesInKernel(Function *Func,
                                  SetVector<GlobalVariable *> &Variables);
  void lowerNonKernelLDSAccesses(Function *Func,
                                 SetVector<GlobalVariable *> &LDSGlobals,
                                 NonKernelLDSParameters &NKLDSParams);
  // CurrMallocSize is an in-out parameter: it is read and then overwritten
  // with the updated size value.
  void
  updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
                                Value *HiddenDynLDSSize,
                                SetVector<GlobalVariable *> &DynamicLDSGlobals);
  void initAsanInfo();

private:
  Module &M;
  const AMDGPUTargetMachine &AMDGPUTM;
  // Shared builder; the member functions reposition it as they emit IR.
  IRBuilder<> IRB;
  DomTreeCallback DTCallback;
  FunctionsAndLDSAccess FuncLDSAccessInfo;
  AsanInstrumentInfo AsanInfo;
};

template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
  // Sort the vector of globals or Functions based on their name.
  // Returns a SetVector of globals/Functions.
  sort(V, [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {SetVector<T>(llvm::from_range, V)};
}

SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
    SetVector<GlobalVariable *> &Variables) {
  // Sort all the non-kernel LDS accesses based on their name.
  return sortByName(
      std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
}

SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
    SetVector<Function *> &Kernels) {
  // Sort the kernels indirectly accessing LDS based on their name.
  // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext(); 244 if (Kernels.size() > UINT32_MAX) { 245 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels"); 246 } 247 SetVector<Function *> OrderedKernels = 248 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end())); 249 for (size_t i = 0; i < Kernels.size(); i++) { 250 Metadata *AttrMDArgs[1] = { 251 ConstantAsMetadata::get(IRB.getInt32(i)), 252 }; 253 Function *Func = OrderedKernels[i]; 254 Func->setMetadata("llvm.amdgcn.lds.kernel.id", 255 MDNode::get(Ctx, AttrMDArgs)); 256 } 257 return OrderedKernels; 258 } 259 260 void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { 261 // Among the kernels accessing LDS, get list of 262 // Non-kernels to which a call is made and a ptr 263 // to addrspace(3) is passed as argument. 264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) { 265 Function *Func = K.first; 266 const CallGraphNode *CGN = CG[Func]; 267 if (!CGN) 268 continue; 269 for (auto &I : *CGN) { 270 CallGraphNode *CallerCGN = I.second; 271 Function *CalledFunc = CallerCGN->getFunction(); 272 if (!CalledFunc || CalledFunc->isDeclaration()) 273 continue; 274 if (AMDGPU::isKernelLDS(CalledFunc)) 275 continue; 276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); 277 AI != E; ++AI) { 278 Type *ArgTy = (*AI).getType(); 279 if (!ArgTy->isPointerTy()) 280 continue; 281 if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) 282 continue; 283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc); 284 // Also add the Calling function to KernelsWithIndirectLDSAccess list 285 // so that base table of LDS is generated. 
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func); 287 } 288 } 289 } 290 } 291 292 void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { 293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { 294 if (!AMDGPU::isLDSVariableToLower(*GV)) 295 continue; 296 297 for (User *V : GV->users()) { 298 if (auto *I = dyn_cast<Instruction>(V)) { 299 Function *F = I->getFunction(); 300 if (!isKernelLDS(F) && !F->isDeclaration()) 301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); 302 } 303 } 304 } 305 } 306 307 static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV, 308 uint32_t Address) { 309 // Write the specified address into metadata where it can be retrieved by 310 // the assembler. Format is a half open range, [Address Address+1) 311 LLVMContext &Ctx = M.getContext(); 312 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); 313 MDBuilder MDB(Ctx); 314 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address), 315 ConstantInt::get(IntTy, Address + 1)); 316 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode); 317 } 318 319 static void addLDSSizeAttribute(Function *Func, uint32_t Offset, 320 bool IsDynLDS) { 321 if (Offset != 0) { 322 std::string Buffer; 323 raw_string_ostream SS{Buffer}; 324 SS << Offset; 325 if (IsDynLDS) 326 SS << "," << Offset; 327 Func->addFnAttr("amdgpu-lds-size", Buffer); 328 } 329 } 330 331 static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { 332 BasicBlock *Entry = &Func->getEntryBlock(); 333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); 334 335 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(), 336 Intrinsic::donothing, {}); 337 338 Value *UseInstance[1] = { 339 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; 340 341 Builder.CreateCall(Decl, {}, 342 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}); 343 } 344 345 void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) { 346 // Create 
new LDS global required for each kernel to store 347 // device global memory pointer. 348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 349 // Create new global pointer variable 350 LDSParams.SwLDS = new GlobalVariable( 351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage, 352 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(), 353 nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); 354 GlobalValue::SanitizerMetadata MD; 355 MD.NoAddress = true; 356 LDSParams.SwLDS->setSanitizerMetadata(MD); 357 } 358 359 void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) { 360 // Create new Dyn LDS global if kernel accesses dyn LDS. 361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() && 363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) 364 return; 365 // Create new global pointer variable 366 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0); 367 LDSParams.SwDynLDS = new GlobalVariable( 368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, 369 "llvm.amdgcn." 
+ Func->getName() + ".dynlds", nullptr, 370 GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); 371 markUsedByKernel(Func, LDSParams.SwDynLDS); 372 GlobalValue::SanitizerMetadata MD; 373 MD.NoAddress = true; 374 LDSParams.SwDynLDS->setSanitizerMetadata(MD); 375 } 376 377 void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) { 378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 379 bool IsDynLDSUsed = LDSParams.SwDynLDS; 380 uint32_t Offset = LDSParams.LDSSize; 381 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0); 382 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed); 383 if (LDSParams.SwDynLDS) 384 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset); 385 } 386 387 void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) { 388 // Create new metadata global for every kernel and initialize the 389 // start offsets and sizes corresponding to each LDS accesses. 390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 391 auto &Ctx = M.getContext(); 392 auto &DL = M.getDataLayout(); 393 std::vector<Type *> Items; 394 Type *Int32Ty = IRB.getInt32Ty(); 395 std::vector<Constant *> Initializers; 396 Align MaxAlignment(1); 397 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) { 398 Align GVAlign = AMDGPU::getAlign(DL, GV); 399 MaxAlignment = std::max(MaxAlignment, GVAlign); 400 }; 401 402 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals) 403 UpdateMaxAlignment(GV); 404 405 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals) 406 UpdateMaxAlignment(GV); 407 408 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals) 409 UpdateMaxAlignment(GV); 410 411 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals) 412 UpdateMaxAlignment(GV); 413 414 //{StartOffset, AlignedSizeInBytes} 415 SmallString<128> MDItemStr; 416 raw_svector_ostream MDItemOS(MDItemStr); 417 MDItemOS << "llvm.amdgcn.sw.lds." 
<< Func->getName() << ".md.item"; 418 419 StructType *LDSItemTy = 420 StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str()); 421 uint32_t &MallocSize = LDSParams.MallocSize; 422 SetVector<GlobalVariable *> UniqueLDSGlobals; 423 int AsanScale = AsanInfo.Scale; 424 auto buildInitializerForSwLDSMD = 425 [&](SetVector<GlobalVariable *> &LDSGlobals) { 426 for (auto &GV : LDSGlobals) { 427 if (is_contained(UniqueLDSGlobals, GV)) 428 continue; 429 UniqueLDSGlobals.insert(GV); 430 431 Type *Ty = GV->getValueType(); 432 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); 433 Items.push_back(LDSItemTy); 434 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize); 435 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes); 436 // Get redzone size corresponding a size. 437 const uint64_t RightRedzoneSize = 438 AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes); 439 // Update MallocSize with current size and redzone size. 440 MallocSize += SizeInBytes; 441 if (!AMDGPU::isDynamicLDS(*GV)) 442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize, 443 RightRedzoneSize); 444 MallocSize += RightRedzoneSize; 445 // Align current size plus redzone. 
446 uint64_t AlignedSize = 447 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment); 448 Constant *AlignedSizeInBytesConst = 449 ConstantInt::get(Int32Ty, AlignedSize); 450 // Align MallocSize 451 MallocSize = alignTo(MallocSize, MaxAlignment); 452 Constant *InitItem = 453 ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst, 454 AlignedSizeInBytesConst}); 455 Initializers.push_back(InitItem); 456 } 457 }; 458 SetVector<GlobalVariable *> SwLDSVector; 459 SwLDSVector.insert(LDSParams.SwLDS); 460 buildInitializerForSwLDSMD(SwLDSVector); 461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals); 462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals); 463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals); 464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals); 465 466 // Update the LDS size used by the kernel. 467 Type *Ty = LDSParams.SwLDS->getValueType(); 468 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); 469 uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment); 470 LDSParams.LDSSize = AlignedSize; 471 SmallString<128> MDTypeStr; 472 raw_svector_ostream MDTypeOS(MDTypeStr); 473 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type"; 474 StructType *MetadataStructType = 475 StructType::create(Ctx, Items, MDTypeOS.str()); 476 SmallString<128> MDStr; 477 raw_svector_ostream MDOS(MDStr); 478 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md"; 479 LDSParams.SwLDSMetadata = new GlobalVariable( 480 M, MetadataStructType, false, GlobalValue::InternalLinkage, 481 PoisonValue::get(MetadataStructType), MDOS.str(), nullptr, 482 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false); 483 Constant *data = ConstantStruct::get(MetadataStructType, Initializers); 484 LDSParams.SwLDSMetadata->setInitializer(data); 485 assert(LDSParams.SwLDS); 486 // Set the alignment to MaxAlignment for SwLDS. 
487 LDSParams.SwLDS->setAlignment(MaxAlignment); 488 if (LDSParams.SwDynLDS) 489 LDSParams.SwDynLDS->setAlignment(MaxAlignment); 490 GlobalValue::SanitizerMetadata MD; 491 MD.NoAddress = true; 492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD); 493 } 494 495 void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) { 496 // Fill the corresponding LDS replacement indices for each LDS access 497 // related to this kernel. 498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 499 SetVector<GlobalVariable *> UniqueLDSGlobals; 500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals, 501 uint32_t &Idx) { 502 for (auto &GV : LDSGlobals) { 503 if (is_contained(UniqueLDSGlobals, GV)) 504 continue; 505 UniqueLDSGlobals.insert(GV); 506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0}; 507 ++Idx; 508 } 509 }; 510 uint32_t Idx = 0; 511 SetVector<GlobalVariable *> SwLDSVector; 512 SwLDSVector.insert(LDSParams.SwLDS); 513 PopulateIndices(SwLDSVector, Idx); 514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx); 515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx); 516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx); 517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx); 518 } 519 520 static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV, 521 Value *Replacement) { 522 // Replace all uses of LDS global in this Function with a Replacement. 
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool { 524 auto *V = U.getUser(); 525 if (auto *Inst = dyn_cast<Instruction>(V)) { 526 auto *Func1 = Inst->getParent()->getParent(); 527 if (Func == Func1) 528 return true; 529 } 530 return false; 531 }; 532 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda); 533 } 534 535 void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) { 536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 537 GlobalVariable *SwLDS = LDSParams.SwLDS; 538 assert(SwLDS); 539 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; 540 assert(SwLDSMetadata); 541 StructType *SwLDSMetadataStructType = 542 cast<StructType>(SwLDSMetadata->getValueType()); 543 Type *Int32Ty = IRB.getInt32Ty(); 544 auto &IndirectAccess = LDSParams.IndirectAccess; 545 auto &DirectAccess = LDSParams.DirectAccess; 546 // Replace all uses of LDS global in this Function with a Replacement. 547 SetVector<GlobalVariable *> UniqueLDSGlobals; 548 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) { 549 for (auto &GV : LDSGlobals) { 550 // Do not generate instructions if LDS access is in non-kernel 551 // i.e indirect-access. 
552 if ((IndirectAccess.StaticLDSGlobals.contains(GV) || 553 IndirectAccess.DynamicLDSGlobals.contains(GV)) && 554 (!DirectAccess.StaticLDSGlobals.contains(GV) && 555 !DirectAccess.DynamicLDSGlobals.contains(GV))) 556 continue; 557 if (is_contained(UniqueLDSGlobals, GV)) 558 continue; 559 UniqueLDSGlobals.insert(GV); 560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV]; 561 assert(Indices.size() == 3); 562 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]), 563 ConstantInt::get(Int32Ty, Indices[1]), 564 ConstantInt::get(Int32Ty, Indices[2])}; 565 Constant *GEP = ConstantExpr::getGetElementPtr( 566 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true); 567 Value *Offset = IRB.CreateLoad(Int32Ty, GEP); 568 Value *BasePlusOffset = 569 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset}); 570 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ", 571 false)); 572 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset); 573 } 574 }; 575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals); 576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals); 577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals); 578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals); 579 } 580 581 void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS( 582 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize, 583 SetVector<GlobalVariable *> &DynamicLDSGlobals) { 584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 585 Type *Int32Ty = IRB.getInt32Ty(); 586 587 GlobalVariable *SwLDS = LDSParams.SwLDS; 588 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; 589 assert(SwLDS && SwLDSMetadata); 590 StructType *MetadataStructType = 591 cast<StructType>(SwLDSMetadata->getValueType()); 592 unsigned MaxAlignment = SwLDS->getAlignment(); 593 Value *MaxAlignValue = IRB.getInt32(MaxAlignment); 594 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1); 595 596 for (GlobalVariable *DynGV : DynamicLDSGlobals) { 597 auto 
&Indices = LDSParams.LDSToReplacementIndicesMap[DynGV]; 598 // Update the Offset metadata. 599 Constant *Index0 = ConstantInt::get(Int32Ty, 0); 600 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]); 601 602 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0); 603 auto *GEPForOffset = IRB.CreateInBoundsGEP( 604 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset}); 605 606 IRB.CreateStore(*CurrMallocSize, GEPForOffset); 607 // Update the size and Aligned Size metadata. 608 Constant *Index2Size = ConstantInt::get(Int32Ty, 1); 609 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, 610 {Index0, Index1, Index2Size}); 611 612 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize); 613 IRB.CreateStore(CurrDynLDSSize, GEPForSize); 614 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2); 615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP( 616 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize}); 617 618 Value *AlignedDynLDSSize = 619 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne); 620 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue); 621 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue); 622 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize); 623 624 // Update the Current Malloc Size 625 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize); 626 } 627 } 628 629 static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, 630 DISubprogram *SP) { 631 assert(InsertBefore); 632 if (InsertBefore->getDebugLoc()) 633 return InsertBefore->getDebugLoc(); 634 if (SP) 635 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP); 636 return DebugLoc(); 637 } 638 639 void AMDGPUSwLowerLDS::getLDSMemoryInstructions( 640 Function *Func, SetVector<Instruction *> &LDSInstructions) { 641 for (BasicBlock &BB : *Func) { 642 for (Instruction &Inst : BB) { 643 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) { 644 if 
(LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) 645 LDSInstructions.insert(&Inst); 646 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) { 647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) 648 LDSInstructions.insert(&Inst); 649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) { 650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) 651 LDSInstructions.insert(&Inst); 652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) { 653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) 654 LDSInstructions.insert(&Inst); 655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) { 656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && 657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) 658 LDSInstructions.insert(&Inst); 659 } else 660 continue; 661 } 662 } 663 } 664 665 Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, 666 Value *LDSPtr) { 667 assert(LDSPtr && "Invalid LDS pointer operand"); 668 Type *LDSPtrType = LDSPtr->getType(); 669 LLVMContext &Ctx = M.getContext(); 670 const DataLayout &DL = M.getDataLayout(); 671 Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); 672 if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) { 673 // Handle vector of pointers 674 ElementCount NumElements = VecPtrTy->getElementCount(); 675 IntTy = VectorType::get(IntTy, NumElements); 676 } 677 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); 678 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); 679 } 680 681 void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( 682 Function *Func, Value *LoadMallocPtr, 683 SetVector<Instruction *> &LDSInstructions) { 684 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : " 685 << Func->getName()); 686 for (Instruction *Inst : LDSInstructions) { 687 IRB.SetInsertPoint(Inst); 688 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 689 
Value *LIOperand = LI->getPointerOperand(); 690 Value *Replacement = 691 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); 692 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, 693 LI->getAlign(), LI->isVolatile()); 694 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); 695 AsanInfo.Instructions.insert(NewLI); 696 LI->replaceAllUsesWith(NewLI); 697 LI->eraseFromParent(); 698 } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 699 Value *SIOperand = SI->getPointerOperand(); 700 Value *Replacement = 701 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); 702 StoreInst *NewSI = IRB.CreateAlignedStore( 703 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); 704 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); 705 AsanInfo.Instructions.insert(NewSI); 706 SI->replaceAllUsesWith(NewSI); 707 SI->eraseFromParent(); 708 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { 709 Value *RMWPtrOperand = RMW->getPointerOperand(); 710 Value *RMWValOperand = RMW->getValOperand(); 711 Value *Replacement = 712 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); 713 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( 714 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(), 715 RMW->getOrdering(), RMW->getSyncScopeID()); 716 NewRMW->setVolatile(RMW->isVolatile()); 717 AsanInfo.Instructions.insert(NewRMW); 718 RMW->replaceAllUsesWith(NewRMW); 719 RMW->eraseFromParent(); 720 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) { 721 Value *XCHGPtrOperand = XCHG->getPointerOperand(); 722 Value *Replacement = 723 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); 724 AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( 725 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), 726 XCHG->getAlign(), XCHG->getSuccessOrdering(), 727 XCHG->getFailureOrdering(), XCHG->getSyncScopeID()); 728 NewXCHG->setVolatile(XCHG->isVolatile()); 729 
AsanInfo.Instructions.insert(NewXCHG); 730 XCHG->replaceAllUsesWith(NewXCHG); 731 XCHG->eraseFromParent(); 732 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) { 733 Value *AIOperand = ASC->getPointerOperand(); 734 Value *Replacement = 735 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); 736 Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); 737 // Note: No need to add the instruction to AsanInfo instructions to be 738 // instrumented list. FLAT_ADDRESS ptr would have been already 739 // instrumented by asan pass prior to this pass. 740 ASC->replaceAllUsesWith(NewAI); 741 ASC->eraseFromParent(); 742 } else 743 report_fatal_error("Unimplemented LDS lowering instruction"); 744 } 745 } 746 747 void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) { 748 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 749 Type *Int64Ty = IRB.getInt64Ty(); 750 Type *VoidTy = IRB.getVoidTy(); 751 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction( 752 "__asan_poison_region", 753 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false)); 754 755 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector; 756 size_t VecSize = RedzonesVec.size(); 757 for (unsigned i = 0; i < VecSize; i++) { 758 auto &RedzonePair = RedzonesVec[i]; 759 uint64_t RedzoneOffset = RedzonePair.first; 760 uint64_t RedzoneSize = RedzonePair.second; 761 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP( 762 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)}); 763 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty); 764 IRB.CreateCall(AsanPoisonRegion, 765 {RedzoneAddress, IRB.getInt64(RedzoneSize)}); 766 } 767 } 768 769 void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, 770 DomTreeUpdater &DTU) { 771 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName()); 772 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 773 auto &Ctx = M.getContext(); 774 auto 
*PrevEntryBlock = &Func->getEntryBlock(); 775 SetVector<Instruction *> LDSInstructions; 776 getLDSMemoryInstructions(Func, LDSInstructions); 777 778 // Create malloc block. 779 auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock); 780 781 // Create WIdBlock block which has instructions related to selection of 782 // {0,0,0} indiex work item in the work group. 783 auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock); 784 IRB.SetInsertPoint(WIdBlock, WIdBlock->begin()); 785 DebugLoc FirstDL = 786 getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram()); 787 IRB.SetCurrentDebugLocation(FirstDL); 788 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); 789 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}); 790 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}); 791 Value *XYOr = IRB.CreateOr(WIdx, WIdy); 792 Value *XYZOr = IRB.CreateOr(XYOr, WIdz); 793 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0)); 794 795 // All work items will branch to PrevEntryBlock except {0,0,0} index 796 // work item which will branch to malloc block. 797 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock); 798 799 // Malloc block 800 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin()); 801 802 // If Dynamic LDS globals are accessed by the kernel, 803 // Get the size of dyn lds from hidden dyn_lds_size kernel arg. 804 // Update the corresponding metadata global entries for this dyn lds global. 
805 GlobalVariable *SwLDS = LDSParams.SwLDS; 806 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; 807 assert(SwLDS && SwLDSMetadata); 808 StructType *MetadataStructType = 809 cast<StructType>(SwLDSMetadata->getValueType()); 810 uint32_t MallocSize = 0; 811 Value *CurrMallocSize; 812 Type *Int32Ty = IRB.getInt32Ty(); 813 Type *Int64Ty = IRB.getInt64Ty(); 814 815 SetVector<GlobalVariable *> UniqueLDSGlobals; 816 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) { 817 for (auto &GV : LDSGlobals) { 818 if (is_contained(UniqueLDSGlobals, GV)) 819 continue; 820 UniqueLDSGlobals.insert(GV); 821 } 822 }; 823 824 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals); 825 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals); 826 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size(); 827 UniqueLDSGlobals.clear(); 828 829 if (NumStaticLDS) { 830 auto *GEPForEndStaticLDSOffset = 831 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, 832 {ConstantInt::get(Int32Ty, 0), 833 ConstantInt::get(Int32Ty, NumStaticLDS - 1), 834 ConstantInt::get(Int32Ty, 0)}); 835 836 auto *GEPForEndStaticLDSSize = 837 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, 838 {ConstantInt::get(Int32Ty, 0), 839 ConstantInt::get(Int32Ty, NumStaticLDS - 1), 840 ConstantInt::get(Int32Ty, 2)}); 841 842 Value *EndStaticLDSOffset = 843 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset); 844 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize); 845 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize); 846 } else 847 CurrMallocSize = IRB.getInt32(MallocSize); 848 849 if (LDSParams.SwDynLDS) { 850 if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5)) 851 report_fatal_error( 852 "Dynamic LDS size query is only supported for CO V5 and later."); 853 // Get size from hidden dyn_lds_size argument of kernel 854 Value *ImplicitArg = 855 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}); 856 Value 
*HiddenDynLDSSize = IRB.CreateInBoundsGEP( 857 ImplicitArg->getType(), ImplicitArg, 858 {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)}); 859 UniqueLDSGlobals.clear(); 860 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals); 861 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals); 862 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize, 863 UniqueLDSGlobals); 864 } 865 866 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty); 867 868 // Create a call to malloc function which does device global memory allocation 869 // with size equals to all LDS global accesses size in this kernel. 870 Value *ReturnAddress = 871 IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)}); 872 FunctionCallee MallocFunc = M.getOrInsertFunction( 873 StringRef("__asan_malloc_impl"), 874 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false)); 875 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty); 876 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt}); 877 878 Value *MallocPtr = 879 IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)); 880 881 // Create store of malloc to new global 882 IRB.CreateStore(MallocPtr, SwLDS); 883 884 // Create calls to __asan_poison_region to poison redzones. 885 poisonRedzones(Func, MallocPtr); 886 887 // Create branch to PrevEntryBlock 888 IRB.CreateBr(PrevEntryBlock); 889 890 // Create wave-group barrier at the starting of Previous entry block 891 Type *Int1Ty = IRB.getInt1Ty(); 892 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin()); 893 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond"); 894 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock); 895 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock); 896 897 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); 898 899 // Load malloc pointer from Sw LDS. 
900 Value *LoadMallocPtr = 901 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS); 902 903 // Replace All uses of LDS globals with new LDS pointers. 904 replaceKernelLDSAccesses(Func); 905 906 // Replace Memory Operations on LDS with corresponding 907 // global memory pointers. 908 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr, 909 LDSInstructions); 910 911 auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func); 912 auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func); 913 auto *EndBlock = BasicBlock::Create(Ctx, "End", Func); 914 for (BasicBlock &BB : *Func) { 915 if (!BB.empty()) { 916 if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) { 917 RI->eraseFromParent(); 918 IRB.SetInsertPoint(&BB, BB.end()); 919 IRB.CreateBr(CondFreeBlock); 920 } 921 } 922 } 923 924 // Cond Free Block 925 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin()); 926 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); 927 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock); 928 929 // Free Block 930 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin()); 931 932 // Free the previously allocate device global memory. 933 FunctionCallee AsanFreeFunc = M.getOrInsertFunction( 934 StringRef("__asan_free_impl"), 935 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); 936 Value *ReturnAddr = 937 IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0)); 938 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); 939 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); 940 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); 941 942 IRB.CreateBr(EndBlock); 943 944 // End Block 945 IRB.SetInsertPoint(EndBlock, EndBlock->begin()); 946 IRB.CreateRetVoid(); 947 // Update the DomTree with corresponding links to basic blocks. 
948 DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock}, 949 {DominatorTree::Insert, MallocBlock, PrevEntryBlock}, 950 {DominatorTree::Insert, CondFreeBlock, FreeBlock}, 951 {DominatorTree::Insert, FreeBlock, EndBlock}}); 952 } 953 954 Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel( 955 Function *Func, SetVector<GlobalVariable *> &Variables) { 956 Type *Int32Ty = IRB.getInt32Ty(); 957 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 958 959 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; 960 assert(SwLDSMetadata); 961 auto *SwLDSMetadataStructType = 962 cast<StructType>(SwLDSMetadata->getValueType()); 963 ArrayType *KernelOffsetsType = 964 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size()); 965 966 SmallVector<Constant *> Elements; 967 for (auto *GV : Variables) { 968 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV); 969 if (It == LDSParams.LDSToReplacementIndicesMap.end()) { 970 Elements.push_back( 971 PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS))); 972 continue; 973 } 974 auto &Indices = It->second; 975 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]), 976 ConstantInt::get(Int32Ty, Indices[1]), 977 ConstantInt::get(Int32Ty, Indices[2])}; 978 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType, 979 SwLDSMetadata, GEPIdx, true); 980 Elements.push_back(GEP); 981 } 982 return ConstantArray::get(KernelOffsetsType, Elements); 983 } 984 985 void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable( 986 NonKernelLDSParameters &NKLDSParams) { 987 // Base table will have single row, with elements of the row 988 // placed as per kernel ID. Each element in the row corresponds 989 // to addresss of "SW LDS" global of the kernel. 
990 auto &Kernels = NKLDSParams.OrderedKernels; 991 if (Kernels.empty()) 992 return; 993 Type *Int32Ty = IRB.getInt32Ty(); 994 const size_t NumberKernels = Kernels.size(); 995 ArrayType *AllKernelsOffsetsType = 996 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels); 997 std::vector<Constant *> OverallConstantExprElts(NumberKernels); 998 for (size_t i = 0; i < NumberKernels; i++) { 999 Function *Func = Kernels[i]; 1000 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 1001 GlobalVariable *SwLDS = LDSParams.SwLDS; 1002 assert(SwLDS); 1003 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)}; 1004 Constant *GEP = 1005 ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true); 1006 OverallConstantExprElts[i] = GEP; 1007 } 1008 Constant *init = 1009 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts); 1010 NKLDSParams.LDSBaseTable = new GlobalVariable( 1011 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init, 1012 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal, 1013 AMDGPUAS::GLOBAL_ADDRESS); 1014 GlobalValue::SanitizerMetadata MD; 1015 MD.NoAddress = true; 1016 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD); 1017 } 1018 1019 void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable( 1020 NonKernelLDSParameters &NKLDSParams) { 1021 // Offset table will have multiple rows and columns. 1022 // Rows are assumed to be from 0 to (n-1). n is total number 1023 // of kernels accessing the LDS through non-kernels. 1024 // Each row will have m elements. m is the total number of 1025 // unique LDS globals accessed by non-kernels. 1026 // Each element in the row correspond to the address of 1027 // the replacement of LDS global done by that particular kernel. 
1028 auto &Variables = NKLDSParams.OrdereLDSGlobals; 1029 auto &Kernels = NKLDSParams.OrderedKernels; 1030 if (Variables.empty() || Kernels.empty()) 1031 return; 1032 const size_t NumberVariables = Variables.size(); 1033 const size_t NumberKernels = Kernels.size(); 1034 1035 ArrayType *KernelOffsetsType = 1036 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables); 1037 1038 ArrayType *AllKernelsOffsetsType = 1039 ArrayType::get(KernelOffsetsType, NumberKernels); 1040 std::vector<Constant *> overallConstantExprElts(NumberKernels); 1041 for (size_t i = 0; i < NumberKernels; i++) { 1042 Function *Func = Kernels[i]; 1043 overallConstantExprElts[i] = 1044 getAddressesOfVariablesInKernel(Func, Variables); 1045 } 1046 Constant *Init = 1047 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts); 1048 NKLDSParams.LDSOffsetTable = new GlobalVariable( 1049 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init, 1050 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal, 1051 AMDGPUAS::GLOBAL_ADDRESS); 1052 GlobalValue::SanitizerMetadata MD; 1053 MD.NoAddress = true; 1054 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD); 1055 } 1056 1057 void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( 1058 Function *Func, SetVector<GlobalVariable *> &LDSGlobals, 1059 NonKernelLDSParameters &NKLDSParams) { 1060 // Replace LDS access in non-kernel with replacement queried from 1061 // Base table and offset from offset table. 1062 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : " 1063 << Func->getName()); 1064 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); 1065 IRB.SetInsertPoint(InsertAt); 1066 1067 // Get LDS memory instructions. 
1068 SetVector<Instruction *> LDSInstructions; 1069 getLDSMemoryInstructions(Func, LDSInstructions); 1070 1071 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}); 1072 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; 1073 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; 1074 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals; 1075 Value *BaseGEP = IRB.CreateInBoundsGEP( 1076 LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId}); 1077 Value *BaseLoad = 1078 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP); 1079 Value *LoadMallocPtr = 1080 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad); 1081 1082 for (GlobalVariable *GV : LDSGlobals) { 1083 const auto *GVIt = llvm::find(OrdereLDSGlobals, GV); 1084 assert(GVIt != OrdereLDSGlobals.end()); 1085 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt); 1086 1087 Value *OffsetGEP = IRB.CreateInBoundsGEP( 1088 LDSOffsetTable->getValueType(), LDSOffsetTable, 1089 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)}); 1090 Value *OffsetLoad = 1091 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP); 1092 Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad); 1093 Value *BasePlusOffset = 1094 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset}); 1095 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for " 1096 << GV->getName()); 1097 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset); 1098 } 1099 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr, 1100 LDSInstructions); 1101 } 1102 1103 static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) { 1104 // Sort Static, dynamic LDS globals which are either 1105 // direct or indirect access on basis of name. 
1106 auto &DirectAccess = LDSParams.DirectAccess; 1107 auto &IndirectAccess = LDSParams.IndirectAccess; 1108 LDSParams.DirectAccess.StaticLDSGlobals = sortByName( 1109 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(), 1110 DirectAccess.StaticLDSGlobals.end())); 1111 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName( 1112 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(), 1113 DirectAccess.DynamicLDSGlobals.end())); 1114 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName( 1115 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(), 1116 IndirectAccess.StaticLDSGlobals.end())); 1117 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName( 1118 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(), 1119 IndirectAccess.DynamicLDSGlobals.end())); 1120 } 1121 1122 void AMDGPUSwLowerLDS::initAsanInfo() { 1123 // Get Shadow mapping scale and offset. 1124 unsigned LongSize = 1125 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS); 1126 uint64_t Offset; 1127 int Scale; 1128 bool OrShadowOffset; 1129 llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false, 1130 &Offset, &Scale, &OrShadowOffset); 1131 AsanInfo.Scale = Scale; 1132 AsanInfo.Offset = Offset; 1133 } 1134 1135 static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) { 1136 for (auto &K : LDSAccesses) { 1137 Function *F = K.first; 1138 if (!F) 1139 continue; 1140 if (F->hasFnAttribute(Attribute::SanitizeAddress)) 1141 return true; 1142 } 1143 return false; 1144 } 1145 1146 bool AMDGPUSwLowerLDS::run() { 1147 bool Changed = false; 1148 1149 CallGraph CG = CallGraph(M); 1150 1151 Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); 1152 1153 // Get all the direct and indirect access of LDS for all the kernels. 
1154 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); 1155 1156 // Flag to decide whether to lower all the LDS accesses 1157 // based on sanitize_address attribute. 1158 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) || 1159 hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access); 1160 1161 if (!LowerAllLDS) 1162 return Changed; 1163 1164 // Utility to group LDS access into direct, indirect, static and dynamic. 1165 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses, 1166 bool DirectAccess) { 1167 for (auto &K : LDSAccesses) { 1168 Function *F = K.first; 1169 if (!F || K.second.empty()) 1170 continue; 1171 1172 assert(isKernelLDS(F)); 1173 1174 // Only inserts if key isn't already in the map. 1175 FuncLDSAccessInfo.KernelToLDSParametersMap.insert( 1176 {F, KernelLDSParameters()}); 1177 1178 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F]; 1179 if (!DirectAccess) 1180 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F); 1181 for (GlobalVariable *GV : K.second) { 1182 if (!DirectAccess) { 1183 if (AMDGPU::isDynamicLDS(*GV)) 1184 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV); 1185 else 1186 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV); 1187 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV); 1188 } else { 1189 if (AMDGPU::isDynamicLDS(*GV)) 1190 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV); 1191 else 1192 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV); 1193 } 1194 } 1195 } 1196 }; 1197 1198 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true); 1199 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false); 1200 1201 // Get address sanitizer scale. 
1202 initAsanInfo(); 1203 1204 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) { 1205 Function *Func = K.first; 1206 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; 1207 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() && 1208 LDSParams.DirectAccess.DynamicLDSGlobals.empty() && 1209 LDSParams.IndirectAccess.StaticLDSGlobals.empty() && 1210 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) { 1211 Changed = false; 1212 } else { 1213 removeFnAttrFromReachable( 1214 CG, Func, 1215 {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y", 1216 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"}); 1217 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() || 1218 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) 1219 removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"}); 1220 reorderStaticDynamicIndirectLDSSet(LDSParams); 1221 buildSwLDSGlobal(Func); 1222 buildSwDynLDSGlobal(Func); 1223 populateSwMetadataGlobal(Func); 1224 populateSwLDSAttributeAndMetadata(Func); 1225 populateLDSToReplacementIndicesMap(Func); 1226 DomTreeUpdater DTU(DTCallback(*Func), 1227 DomTreeUpdater::UpdateStrategy::Lazy); 1228 lowerKernelLDSAccesses(Func, DTU); 1229 Changed = true; 1230 } 1231 } 1232 1233 // Get the Uses of LDS from non-kernels. 1234 getUsesOfLDSByNonKernels(); 1235 1236 // Get non-kernels with LDS ptr as argument and called by kernels. 1237 getNonKernelsWithLDSArguments(CG); 1238 1239 // Lower LDS accesses in non-kernels. 
1240 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() || 1241 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) { 1242 NonKernelLDSParameters NKLDSParams; 1243 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels( 1244 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess); 1245 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals( 1246 FuncLDSAccessInfo.AllNonKernelLDSAccess); 1247 buildNonKernelLDSBaseTable(NKLDSParams); 1248 buildNonKernelLDSOffsetTable(NKLDSParams); 1249 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) { 1250 Function *Func = K.first; 1251 DenseSet<GlobalVariable *> &LDSGlobals = K.second; 1252 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName( 1253 std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end())); 1254 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams); 1255 } 1256 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) { 1257 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap; 1258 if (K.contains(Func)) 1259 continue; 1260 SetVector<llvm::GlobalVariable *> Vec; 1261 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams); 1262 } 1263 Changed = true; 1264 } 1265 1266 if (!Changed) 1267 return Changed; 1268 1269 for (auto &GV : make_early_inc_range(M.globals())) { 1270 if (AMDGPU::isLDSVariableToLower(GV)) { 1271 // probably want to remove from used lists 1272 GV.removeDeadConstantUsers(); 1273 if (GV.use_empty()) 1274 GV.eraseFromParent(); 1275 } 1276 } 1277 1278 if (AsanInstrumentLDS) { 1279 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument; 1280 for (Instruction *Inst : AsanInfo.Instructions) { 1281 SmallVector<InterestingMemoryOperand, 1> InterestingOperands; 1282 getInterestingMemoryOperands(M, Inst, InterestingOperands); 1283 llvm::append_range(OperandsToInstrument, InterestingOperands); 1284 } 1285 for (auto &Operand : OperandsToInstrument) { 1286 Value *Addr = Operand.getPtr(); 1287 instrumentAddress(M, IRB, Operand.getInsn(), 
Operand.getInsn(), Addr, 1288 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize, 1289 Operand.IsWrite, nullptr, false, false, AsanInfo.Scale, 1290 AsanInfo.Offset); 1291 Changed = true; 1292 } 1293 } 1294 1295 return Changed; 1296 } 1297 1298 class AMDGPUSwLowerLDSLegacy : public ModulePass { 1299 public: 1300 const AMDGPUTargetMachine *AMDGPUTM; 1301 static char ID; 1302 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM) 1303 : ModulePass(ID), AMDGPUTM(TM) {} 1304 bool runOnModule(Module &M) override; 1305 void getAnalysisUsage(AnalysisUsage &AU) const override { 1306 AU.addPreserved<DominatorTreeWrapperPass>(); 1307 } 1308 }; 1309 } // namespace 1310 1311 char AMDGPUSwLowerLDSLegacy::ID = 0; 1312 char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID; 1313 1314 INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds", 1315 "AMDGPU Software lowering of LDS", false, false) 1316 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 1317 INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds", 1318 "AMDGPU Software lowering of LDS", false, false) 1319 1320 bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) { 1321 // AddressSanitizer pass adds "nosanitize_address" module flag if it has 1322 // instrumented the IR. Return early if the flag is not present. 1323 if (!M.getModuleFlag("nosanitize_address")) 1324 return false; 1325 DominatorTreeWrapperPass *const DTW = 1326 getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 1327 auto DTCallback = [&DTW](Function &F) -> DominatorTree * { 1328 return DTW ? 
&DTW->getDomTree() : nullptr; 1329 }; 1330 if (!AMDGPUTM) { 1331 auto &TPC = getAnalysis<TargetPassConfig>(); 1332 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>(); 1333 } 1334 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback); 1335 bool IsChanged = SwLowerLDSImpl.run(); 1336 return IsChanged; 1337 } 1338 1339 ModulePass * 1340 llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) { 1341 return new AMDGPUSwLowerLDSLegacy(TM); 1342 } 1343 1344 PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M, 1345 ModuleAnalysisManager &AM) { 1346 // AddressSanitizer pass adds "nosanitize_address" module flag if it has 1347 // instrumented the IR. Return early if the flag is not present. 1348 if (!M.getModuleFlag("nosanitize_address")) 1349 return PreservedAnalyses::all(); 1350 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1351 auto DTCallback = [&FAM](Function &F) -> DominatorTree * { 1352 return &FAM.getResult<DominatorTreeAnalysis>(F); 1353 }; 1354 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback); 1355 bool IsChanged = SwLowerLDSImpl.run(); 1356 if (!IsChanged) 1357 return PreservedAnalyses::all(); 1358 1359 PreservedAnalyses PA; 1360 PA.preserve<DominatorTreeAnalysis>(); 1361 return PA; 1362 } 1363