//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
}

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
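  /// The result combines the bits for \p C itself with those of every constant
  /// operand reached transitively; previously computed constants are looked up
  /// in the ConstantStatus map.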
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions, these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
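    //
    // The callback below reports an instruction as harmless if it is not a
    // call to amdgcn_implicitarg_ptr, or if every access interfering with
    // Range on the returned pointer is droppable.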
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
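  /// Here it only caches the TargetMachine obtained from TargetPassConfig.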
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG(this);
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
         &AAPointerInfo::ID, &AAPotentialConstantValues::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                    false)