//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
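// Casting from LOCAL or PRIVATE to FLAT needs the source segment's aperture
// base, which is read from the queue ptr on targets without aperture
// registers.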
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
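  // The x ids are always enabled for kernels (entry functions), so copying
  // them from a callee into a kernel is redundant, though harmless.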
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for an externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it to false.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
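        // For now an unknown callee (other than inline asm) is conservatively
        // counted as a call, which can force amdgpu-flat-scratch below.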
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most references to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes.
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}