//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};
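
// Each ImplicitArgumentMask bit is paired with an attribute string supplied by
// AMDGPUAttributes.def (not shown here). In AAAMDAttributesFunction below, an
// assumed bit means the corresponding implicit input is believed to be unused,
// so the matching attribute can be attached at manifest time; calling
// removeAssumedBits() records that the input is actually required.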

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}

  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};
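
// AAAMDAttributes tracks, per function, which implicit kernel inputs may still
// be unneeded. It wraps a BitIntegerState with one bit per ImplicitArgumentMask
// entry; see the note above the ImplicitAttrs table for how the bits relate to
// the attribute strings.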
struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;
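
// Deduction for "uniform-work-group-size": a callee can only keep the value
// "true" if every known caller has it, so the boolean state is clamped against
// all call sites. Kernels take their initial value from an existing attribute
// and are treated as fixed points.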
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }
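
  // The update step walks the optimistic call graph provided by AACallEdges:
  // attributes already deduced for callees are merged into this function, and
  // each intrinsic callee is mapped to the implicit input it requires via
  // intrinsicToAttrMask(). Code-object-version specific cases (queue_ptr vs.
  // implicitarg_ptr) are resolved afterwards.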
  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
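  // Returns true if the queue pointer is needed, either because an instruction
  // or constant expression casts out of the private/local address space while
  // no aperture registers are available, or because a non-entry function
  // references an LDS global.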
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }
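
  // The helpers below check whether the function actually reads a particular
  // implicit kernel argument: each one queries AAPointerInfo on calls to the
  // amdgcn_implicitarg_ptr intrinsic and asks whether any access may touch the
  // argument's byte range, using offsets provided by the AMDGPUBaseInfo
  // helpers.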
  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A,
                                      AAPointerInfo::OffsetAndSize OAS) {
    // Check if this is a call to the implicitarg_ptr intrinsic whose result is
    // used to retrieve the implicit kernel argument described by OAS. That
    // argument is unused only if every use of the implicitarg_ptr is a load
    // that clearly does not retrieve any byte within the queried range. We
    // check this by tracing all the uses of the initial call to the
    // implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}
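
// AAAMDFlatWorkGroupSize models the flat work-group size of a function as an
// integer range. Entry points are pinned to their known range; non-entry
// functions merge in the ranges of their known callers, and the result is
// manifested as "amdgpu-flat-work-group-size" unless it matches the subtarget
// default.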
/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}
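
// Legacy pass-manager driver: seeds AAAMDAttributes and AAUniformWorkGroupSize
// for every non-intrinsic function (plus AAAMDFlatWorkGroupSize for non-entry
// functions) and runs the Attributor restricted to the abstract attributes
// listed in Allowed.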
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
         &AAPointerInfo::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)