1 //===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to combine multiple image_load intrinsics with dim=2dmsaa 10 // or dim=2darraymsaa into a single image_msaa_load intrinsic if: 11 // 12 // - they refer to the same vaddr except for sample_id, 13 // - they use a constant sample_id and they fall into the same group, 14 // - they have the same dmask and the number of intrinsics and the number of 15 // vaddr/vdata dword transfers is reduced by the combine. 16 // 17 // Examples for the tradeoff (all are assuming 2DMsaa for vaddr): 18 // 19 // +----------+-----+-----+-------+---------+------------+---------+----------+ 20 // | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? | 21 // | (dmask) | | | | vdata | | vdata | | 22 // +----------+-----+-----+-------+---------+------------+---------+----------+ 23 // | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes | 24 // +----------+-----+-----+-------+---------+------------+---------+----------+ 25 // | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? | 26 // +----------+-----+-----+-------+---------+------------+---------+----------+ 27 // | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes | 28 // +----------+-----+-----+-------+---------+------------+---------+----------+ 29 // | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no | 30 // +----------+-----+-----+-------+---------+------------+---------+----------+ 31 // | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes | 32 // +----------+-----+-----+-------+---------+------------+---------+----------+ 33 // 34 // Some cases are of questionable benefit, like the one marked with "yes?" 
35 // above: fewer intrinsics and fewer vaddr and fewer total transfers between SP 36 // and TX, but higher vdata. We start by erring on the side of converting these 37 // to MSAA_LOAD. 38 // 39 // clang-format off 40 // 41 // This pass will combine intrinsics such as (not necessarily consecutive): 42 // call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) 43 // call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0) 44 // call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0) 45 // call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0) 46 // ==> 47 // call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) 48 // 49 // clang-format on 50 // 51 // Future improvements: 52 // 53 // - We may occasionally not want to do the combine if it increases the maximum 54 // register pressure. 55 // 56 // - Ensure clausing when multiple MSAA_LOAD are generated. 57 // 58 // Note: Even though the image_msaa_load intrinsic already exists on gfx10, this 59 // combine only applies to gfx11, due to a limitation in gfx10: the gfx10 60 // IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and 61 // we don't know the format at compile time. 
62 //===----------------------------------------------------------------------===// 63 64 #include "AMDGPU.h" 65 #include "AMDGPUInstrInfo.h" 66 #include "AMDGPUTargetMachine.h" 67 #include "llvm/IR/Function.h" 68 #include "llvm/IR/IRBuilder.h" 69 #include "llvm/IR/IntrinsicInst.h" 70 #include "llvm/IR/IntrinsicsAMDGPU.h" 71 #include "llvm/Pass.h" 72 #include "llvm/Support/raw_ostream.h" 73 74 using namespace llvm; 75 76 #define DEBUG_TYPE "amdgpu-image-intrinsic-opt" 77 78 namespace { 79 class AMDGPUImageIntrinsicOptimizer : public FunctionPass { 80 const TargetMachine *TM; 81 82 public: 83 static char ID; 84 85 AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr) 86 : FunctionPass(ID), TM(TM) {} 87 88 bool runOnFunction(Function &F) override; 89 90 }; // End of class AMDGPUImageIntrinsicOptimizer 91 } // End anonymous namespace 92 93 INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, 94 "AMDGPU Image Intrinsic Optimizer", false, false) 95 96 char AMDGPUImageIntrinsicOptimizer::ID = 0; 97 98 void addInstToMergeableList( 99 IntrinsicInst *II, 100 SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts, 101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) { 102 for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) { 103 // Check Dim. 104 if (IIList.front()->getIntrinsicID() != II->getIntrinsicID()) 105 continue; 106 107 // Check D16. 108 if (IIList.front()->getType() != II->getType()) 109 continue; 110 111 // Check all arguments (DMask, VAddr, RSrc etc). 112 bool AllEqual = true; 113 assert(IIList.front()->arg_size() == II->arg_size()); 114 for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) { 115 Value *ArgList = IIList.front()->getArgOperand(I); 116 Value *Arg = II->getArgOperand(I); 117 if (I == ImageDimIntr->VAddrEnd - 1) { 118 // Check FragId group. 
119 auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I)); 120 auto FragId = cast<ConstantInt>(II->getArgOperand(I)); 121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4); 122 } else { 123 // Check all arguments except FragId. 124 AllEqual = ArgList == Arg; 125 } 126 } 127 if (!AllEqual) 128 continue; 129 130 // Add to the list. 131 IIList.emplace_back(II); 132 return; 133 } 134 135 // Similar instruction not found, so add a new list. 136 MergeableInsts.emplace_back(1, II); 137 LLVM_DEBUG(dbgs() << "New: " << *II << "\n"); 138 } 139 140 // Collect list of all instructions we know how to merge in a subset of the 141 // block. It returns an iterator to the instruction after the last one analyzed. 142 BasicBlock::iterator collectMergeableInsts( 143 BasicBlock::iterator I, BasicBlock::iterator E, 144 SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) { 145 for (; I != E; ++I) { 146 // Don't combine if there is a store in the middle or if there is a memory 147 // barrier. 148 if (I->mayHaveSideEffects()) { 149 ++I; 150 break; 151 } 152 153 // Ignore non-intrinsics. 154 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 155 Intrinsic::ID IntrinID = II->getIntrinsicID(); 156 157 // Ignore other intrinsics. 158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa && 159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa) 160 continue; 161 162 // Check for constant FragId. 
163 const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID); 164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; 165 if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex))) 166 continue; 167 168 LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n"); 169 addInstToMergeableList(II, MergeableInsts, ImageDimIntr); 170 } 171 } 172 173 return I; 174 } 175 176 bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { 177 bool Modified = false; 178 179 SmallVector<Instruction *, 4> InstrsToErase; 180 for (const auto &IIList : MergeableInsts) { 181 if (IIList.size() <= 1) 182 continue; 183 184 // Assume the arguments are unchanged and later override them, if needed. 185 SmallVector<Value *, 16> Args(IIList.front()->args()); 186 187 // Validate function argument and return types, extracting overloaded 188 // types along the way. 189 SmallVector<Type *, 6> OverloadTys; 190 Function *F = IIList.front()->getCalledFunction(); 191 if (!Intrinsic::getIntrinsicSignature(F, OverloadTys)) 192 continue; 193 194 Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID(); 195 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 196 AMDGPU::getImageDimIntrinsicInfo(IntrinID); 197 198 Type *EltTy = IIList.front()->getType()->getScalarType(); 199 Type *NewTy = FixedVectorType::get(EltTy, 4); 200 OverloadTys[0] = NewTy; 201 bool isD16 = EltTy->isHalfTy(); 202 203 ConstantInt *DMask = cast<ConstantInt>( 204 IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex)); 205 unsigned DMaskVal = DMask->getZExtValue() & 0xf; 206 unsigned NumElts = popcount(DMaskVal); 207 208 // Number of instructions and the number of vaddr/vdata dword transfers 209 // should be reduced. 210 unsigned NumLoads = IIList.size(); 211 unsigned NumMsaas = NumElts; 212 unsigned NumVAddrLoads = 3 * NumLoads; 213 unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads; 214 unsigned NumVAddrMsaas = 3 * NumMsaas; 215 unsigned NumVDataMsaas = divideCeil(4, isD16 ? 
2 : 1) * NumMsaas; 216 217 if (NumLoads < NumMsaas || 218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas)) 219 continue; 220 221 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; 222 auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex)); 223 const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4; 224 225 // Create the new instructions. 226 IRBuilder<> B(IIList.front()); 227 228 // Create the new image_msaa_load intrinsic. 229 SmallVector<Instruction *, 4> NewCalls; 230 while (DMaskVal != 0) { 231 unsigned NewMaskVal = 1 << countr_zero(DMaskVal); 232 233 Intrinsic::ID NewIntrinID; 234 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa) 235 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa; 236 else 237 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; 238 239 Function *NewIntrin = Intrinsic::getDeclaration( 240 IIList.front()->getModule(), NewIntrinID, OverloadTys); 241 Args[ImageDimIntr->DMaskIndex] = 242 ConstantInt::get(DMask->getType(), NewMaskVal); 243 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal); 244 CallInst *NewCall = B.CreateCall(NewIntrin, Args); 245 LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n"); 246 247 NewCalls.push_back(NewCall); 248 DMaskVal -= NewMaskVal; 249 } 250 251 // Create the new extractelement instructions. 252 for (auto &II : IIList) { 253 Value *VecOp = nullptr; 254 auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex)); 255 B.SetCurrentDebugLocation(II->getDebugLoc()); 256 if (NumElts == 1) { 257 VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4)); 258 LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); 259 } else { 260 VecOp = UndefValue::get(II->getType()); 261 for (unsigned I = 0; I < NumElts; ++I) { 262 VecOp = B.CreateInsertElement( 263 VecOp, 264 B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I); 265 LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); 266 } 267 } 268 269 // Replace the old instruction. 
270 II->replaceAllUsesWith(VecOp); 271 VecOp->takeName(II); 272 InstrsToErase.push_back(II); 273 } 274 275 Modified = true; 276 } 277 278 for (auto I : InstrsToErase) 279 I->eraseFromParent(); 280 281 return Modified; 282 } 283 284 static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { 285 if (!TM) 286 return false; 287 288 // This optimization only applies to GFX11 and beyond. 289 const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); 290 if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug()) 291 return false; 292 293 Module *M = F.getParent(); 294 295 // Early test to determine if the intrinsics are used. 296 if (std::none_of(M->begin(), M->end(), [](Function &F) { 297 return !F.users().empty() && 298 (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa || 299 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa); 300 })) 301 return false; 302 303 bool Modified = false; 304 for (auto &BB : F) { 305 BasicBlock::iterator SectionEnd; 306 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; 307 I = SectionEnd) { 308 SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts; 309 310 SectionEnd = collectMergeableInsts(I, E, MergeableInsts); 311 Modified |= optimizeSection(MergeableInsts); 312 } 313 } 314 315 return Modified; 316 } 317 318 bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) { 319 if (skipFunction(F)) 320 return false; 321 322 return imageIntrinsicOptimizerImpl(F, TM); 323 } 324 325 FunctionPass * 326 llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) { 327 return new AMDGPUImageIntrinsicOptimizer(TM); 328 } 329 330 PreservedAnalyses 331 AMDGPUImageIntrinsicOptimizerPass::run(Function &F, 332 FunctionAnalysisManager &AM) { 333 334 bool Changed = imageIntrinsicOptimizerImpl(F, &TM); 335 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); 336 } 337