//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "Threshold controlling whether an indirect call will be specialized"),
    cl::init(3));

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
  }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
                                     getLDSSize(F));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Returns the minimum amount of LDS space used by a workgroup running
  /// function \p F.
  static unsigned getLDSSize(const Function &F) {
    return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                           {0, UINT32_MAX}, true)
        .first;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
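    // The DS-global trap below only applies to non-entry functions, and
    // aperture registers make the casts free, so an entry function on a
    // subtarget with aperture registers can skip the constant walk entirely.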
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
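    // Note the inverted bit semantics: a set bit means the corresponding
    // "amdgpu-no-*" string from AMDGPUAttributes.def may be emitted, while
    // removeAssumedBits records that the implicit input really is needed.
    // For example, a function calling llvm.amdgcn.workitem.id.y loses the
    // WORKITEM_ID_Y bit and thus never receives "amdgpu-no-workitem-id-y".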
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions manually, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
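    // Otherwise fall back to scanning the constant operands below, which
    // checkForAllInstructions does not visit.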
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is unused only if every use of the implicitarg_ptr is
    // a load that clearly does not retrieve any byte of the hostcall
    // pointer. We check this by tracing all the uses of the initial
    // call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
  // not to be set.
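  // The check below scans three sources of private-to-flat addressing:
  // explicit addrspacecast instructions, addrspacecasts folded into constant
  // expressions, and callees (including llvm.amdgcn.addrspacecast.nonnull).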
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrspacecast from PRIVATE_ADDRESS in constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally, check callees.

    // This is called on each callee; false means the callee shouldn't have
    // no-flat-scratch-init.
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee is null for inline asm or an indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If the function has an indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any callee is false (i.e. needs FlatScratchInit),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
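/// Shared by AAAMDFlatWorkGroupSize and AAAMDWavesPerEU: both propagate a
/// [Min, Max] range from callers to callees and only manifest the attribute
/// when the clamped range differs from the subtarget default.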
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |= clampStateAndIndicateChange(this->getState(),
                                            CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only consider an attribute that is not the max range because the
      // frontend always emits the attribute, unfortunately, and sometimes it
      // emits the max range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
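/// Each dimension is an independently decreasing integer state, seeded with
/// the subtarget limit and tightened by taking the minimum across all callers
/// at every call edge.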
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |= clampStateAndIndicateChange(this->getState(),
                                            CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');
      (void)Func;

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

// TODO: Migrate to range merge of amdgpu-agpr-alloc.
// FIXME: Why is this using Attribute::NoUnwind?
struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (MinNumAGPR == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
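    // Walk every call-like instruction: inline asm is scanned for AGPR ('a')
    // register constraints, intrinsics are assumed satisfiable without AGPRs,
    // and plain calls defer to the callee's deduced state.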
    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->isValidState() &&
             CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
  }

  StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
/// with intermediate values during the attributor run. We defer the
/// finalization of waves-per-eu until after the flat-workgroup-size is
/// finalized.
/// TODO: Remove this and move similar logic back into the attributor run once
/// we have a better representation for waves-per-eu.
static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
  bool Changed = false;

  LLVMContext &Ctx = M.getContext();

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

    std::optional<std::pair<unsigned, std::optional<unsigned>>>
        FlatWgrpSizeAttr =
            AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");

    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();

    unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
    unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
    if (FlatWgrpSizeAttr.has_value()) {
      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
    }

    // Start with the "best" range.
    unsigned Min = MinWavesPerEU;
    unsigned Max = MinWavesPerEU;

    // Compute the range from flat workgroup size. `getWavesPerEU` will also
    // account for the 'amdgpu-waves-per-eu' attribute.
    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
        ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});

    // For the lower bound, we have to "tighten" it.
    Min = std::max(Min, MinFromFlatWgrpSize);
    // For the upper bound, we have to "extend" it.
    Max = std::max(Max, MaxFromFlatWgrpSize);

    // Clamp the range to the max range.
    Min = std::max(Min, MinWavesPerEU);
    Max = std::min(Max, MaxWavesPerEU);

    // Update the attribute if it is not the max.
    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
      SmallString<10> Buffer;
      raw_svector_ostream OS(Buffer);
      OS << Min << ',' << Max;
      Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
      Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
      F.addFnAttr(NewAttr);
      Changed |= OldAttr != NewAttr;
    }
  }

  return Changed;
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
       &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
      }
    }
  }

  bool Changed = A.run() == ChangeStatus::CHANGED;

  Changed |= updateWavesPerEU(M, TM);

  return Changed;
}
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}