//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

// Copy the attribute \p Name from the callee to the caller if the callee has
// it. Returns true if the attribute was propagated.
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",   "amdgpu-dispatch-id",
      "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

// Propagate uniform-work-group-size along every call edge recorded in
// NodeList, from each caller to its callees.
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

// Scan the body of F for intrinsic calls, address space casts, stack objects,
// and calls to other functions, and add the corresponding target attributes.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CB->isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most number of uses to least
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}