//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
}

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR :
                                                      QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT :
                                                        QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const {
    return CodeObjectVersion;
  }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD)
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is considered unused only if every use of the
    // implicitarg_ptr is a load that clearly does not retrieve any
    // byte of the hostcall pointer. We check this by tracing all the
    // uses of the initial call to the implicitarg_ptr intrinsic.
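    // The lambda below only accepts accesses whose source instruction is
    // droppable (e.g. assume-like uses), so forallInterferingAccesses can
    // only report success when no real load overlaps Range.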
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
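    // Note that the assumed range's upper bound is exclusive, hence the "- 1"
    // when comparing against and emitting the inclusive Max.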
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
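/// The assumed range is seeded from the subtarget limits combined with the
/// assumed amdgpu-flat-work-group-size, and is then clamped against the
/// effective waves-per-eu of each caller.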
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG(this);
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
         &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
         &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;
    AC.IPOAmendableCB = [](const Function &F) {
      return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
    };

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
          A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }

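// Legacy pass manager registration; CycleInfoWrapperPass is declared as a
// dependency to match the getAnalysisUsage() requirement above.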
INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                    false)