//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

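// As an illustration of what this pass produces: a device function whose body
// contains
//
//   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
//
// is given the "amdgpu-work-item-id-y" attribute (see intrinsicToAttrName),
// and non-intrinsic callees have their attributes copied into their callers
// (see copyFeaturesToFunction) so calling convention lowering can tell which
// implicit arguments must be enabled.
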
// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

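// Walk the collected call graph nodes and push "uniform-work-group-size" from
// callers to callees. Calls to externally defined functions, and callers
// without the attribute, conservatively force the value to "false".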
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it to false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

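// Scan the body of F and add the attributes it requires: intrinsic calls are
// mapped through intrinsicToAttrName, attributes of non-intrinsic callees are
// copied in, and addrspacecasts out of the local or private address space
// (including those hidden in constant expressions) force "amdgpu-queue-ptr"
// unless the subtarget has aperture registers.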
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most to least number of uses.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes.
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}