//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask, and both the number of intrinsics and the number
//   of vaddr/vdata dword transfers are reduced by the combine.
//
// Examples of the tradeoff (all assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
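//
// Spelling out the arithmetic behind the "yes?" row: with a single-channel
// dmask and no a16/d16, each image_load sends 3 vaddr dwords and returns 1
// vdata dword, so two loads cost 6 vaddr / 2 vdata dwords in total. One
// image_msaa_load also sends 3 vaddr dwords but always returns a full group
// of 4 samples, i.e. 4 vdata dwords: fewer instructions and vaddr transfers,
// but more vdata returned than the original loads actually need.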
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
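//
// Each original scalar result is then recovered from the wide result with an
// extractelement on the lane given by its sample_id modulo 4.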
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;

void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
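        // image_msaa_load returns a whole group of four consecutive samples,
        // so only loads whose constant sample_id falls into the same group of
        // four (0-3, 4-7, ...) can be merged into one new intrinsic.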
        auto *FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. Returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
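    // For example (first row of the table in the file header): 4
    // single-channel loads need 12 vaddr and 4 vdata dwords, while a single
    // replacement msaa_load needs 3 vaddr dwords and returns 4 vdata dwords.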
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
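    // The new loads fetch a whole group of four samples, so round the
    // constant sample_id down to the first sample of its group.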
    auto *FragId =
        cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
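    // Emit one single-channel msaa_load per bit set in the original dmask;
    // each new call returns all four samples of that channel.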
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
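    // For each original load, take the lane matching its sample_id (FragId
    // modulo 4) from every per-channel msaa_load result; multi-channel
    // results are reassembled into a vector with insertelement.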
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = PoisonValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto *I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}