//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
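// On subtargets without aperture registers, lowering such a cast reads the
// private/shared aperture bases from the queue pointer, which is why these
// casts force the "amdgpu-queue-ptr" attribute below.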
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
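//
// Map an intrinsic ID to the attribute recording that the function needs the
// corresponding input. NonKernelOnly marks attributes that are only added to
// non-kernel functions; IsQueuePtr is set when the intrinsic also requires
// the queue pointer.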
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
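  // Attributes in this list are copied verbatim from the callee to the caller.
  // "amdgpu-queue-ptr" is handled separately so that the caller also records
  // that it needs the queue pointer.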
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
      "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // TODO: Do something with indirect calls.
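        // For now an indirect call (no statically known callee) is
        // conservatively treated as a call unless it is inline asm.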
        if (!Callee) {
          if (!CB->isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, from the most referenced to the least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}