//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
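// When the subtarget has no aperture registers, the aperture bases are read
// through the queue ptr, so such casts force the "amdgpu-queue-ptr" attribute.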
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
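//
// Map an intrinsic ID to the attribute that records its implicit argument.
// NonKernelOnly is set for the x ids, which only need annotating in non-kernel
// functions; IsQueuePtr is set for intrinsics that require the queue ptr.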
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}
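
// Scan the body of F for intrinsic calls, address space casts, stack objects,
// and direct or indirect calls, and add the corresponding attributes to F.
// Returns true if any attribute was added.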
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function has its address taken, add all attributes corresponding
  // to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of an indirect call.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers if there is an
  // indirect call. In such cases, hasAddressTaken() would be false for kernels
  // and functions making an indirect call (if they are themselves not
  // indirectly called). We must tag all such kernels/functions with all
  // implicit attributes for correctness.
  // e.g.
  // 1. Kernel K1 makes an indirect call to function F1.
  //    Without detecting an indirect call in K1, this pass will not add all
  //    implicit args to K1 (which is incorrect).
  // 2. Kernel K1 makes a direct call to F1, which makes an indirect call to
  //    function F2.
  //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
  //    false), the pass will not add all implicit args to F1 (which is
  //    essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes.
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}