1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads have offsets that are too large to fit in the 8-bit
55 // offsets, but are close enough to fit in the 8 bits, we can add to the base
56 // pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66
67 using namespace llvm;
68
69 #define DEBUG_TYPE "si-load-store-opt"
70
71 namespace {
// Classes of memory instructions this pass knows how to merge. Two
// instructions are only paired when they have the same class (see
// getInstClass), and the class selects which merge* routine produces the
// combined instruction.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};
92
// Describes which address operands an opcode carries. Filled in by getRegs()
// and consumed by CombineInfo::setMI() to collect the operand indices that
// must match for two instructions to share a base address.
struct AddressRegs {
  unsigned char NumVAddrs = 0; // Number of vaddr0.. operands (image ops).
  bool SBase = false;          // SMEM base (sbase operand).
  bool SRsrc = false;          // Buffer/image resource descriptor.
  bool SOffset = false;        // Scalar offset operand.
  bool SAddr = false;          // Scalar address (GLOBAL_*_SADDR forms).
  bool VAddr = false;          // Single vector address operand.
  bool Addr = false;           // DS address operand.
  bool SSamp = false;          // Image sampler descriptor.
};
103
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106
// Machine-function pass that merges adjacent DS/SMEM/buffer/FLAT/GLOBAL
// memory operations with nearby offsets into wider operations, and promotes
// constant address offsets into instruction immediates.
class SILoadStoreOptimizer : public MachineFunctionPass {
  // Cached per-instruction state for one merge candidate. Populated from the
  // MachineInstr by setMI().
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize; // Granularity of the offset encoding, in bytes.
    unsigned Offset;  // Immediate offset operand (masked to 16 bits for DS).
    unsigned Width;   // Element count accessed; see getOpcodeWidth().
    unsigned Format;  // TBUFFER format immediate (TBUFFER_* classes only).
    unsigned BaseOff;
    unsigned DMask;   // dmask immediate (MIMG class only).
    InstClassEnum InstClass;
    unsigned CPol = 0; // Cache-policy immediate (non-DS, non-MIMG classes).
    bool IsAGPR;       // Data register class contains AGPRs.
    bool UseST64;
    int AddrIdx[MaxAddressRegs];            // Operand indices of addresses.
    const MachineOperand *AddrReg[MaxAddressRegs]; // Cached address operands.
    unsigned NumAddresses;
    unsigned Order;

    // \returns true if every address operand of \p CI matches the
    // corresponding address operand of this instruction: equal immediates,
    // or the same register and subregister.
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    // Conservative filter: \returns false for instructions whose address can
    // never be shared with another instruction, so they are not worth
    // tracking as merge candidates.
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Ordering of merge candidates: images compare by dmask, everything else
    // by immediate offset.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  // The low and high halves of a 64-bit base address, each possibly taken
  // from a subregister.
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  // A base-register pair plus a constant byte offset.
  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  // Legality checks for pairing two candidates.
  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  // Helpers that wire the merged instruction's data registers to the
  // original destinations/sources.
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  // One merge routine per instruction class; each emits the combined
  // instruction before InsertBefore and returns an iterator to it.
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  // Support for the constant-offset-promotion part of the pass.
  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};
325
/// \returns the element count accessed by \p MI: taken from the MUBUF/MTBUF
/// opcode tables, from the dmask popcount for image instructions, and from an
/// explicit per-opcode list otherwise. Returns 0 for unhandled opcodes.
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    // Each set dmask bit enables one component.
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  // DS widths are counted in 32-bit elements: B32 -> 1, B64 -> 2.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}
405
/// Maps instruction opcode to enum InstClassEnum. Returns UNKNOWN for any
/// opcode this pass does not know how to merge.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // Buffer, image and typed-buffer opcodes are classified via their base
    // opcode rather than listing every encoding variant.
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  // GLOBAL (non-SADDR) and FLAT share the FLAT_* classes; see
  // getCommonInstClass for how a pair of GLOBALs is narrowed back.
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
559
/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
/// For opcode-listed classes the subclass is the narrowest (DWORD) opcode of
/// the family; for buffer/image classes it is the encoding-independent base
/// opcode; for DS the exact opcode (B32 and B64 never merge with each other).
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
637
638 // GLOBAL loads and stores are classified as FLAT initially. If both combined
639 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
640 // If either or both instructions are non segment specific FLAT the resulting
641 // combined operation will be FLAT, potentially promoting one of the GLOBAL
642 // operations to FLAT.
643 // For other instructions return the original unmodified class.
644 InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)645 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
646 const CombineInfo &Paired) {
647 assert(CI.InstClass == Paired.InstClass);
648
649 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
650 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
651 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
652
653 return CI.InstClass;
654 }
655
/// \returns which address operands opcode \p Opc carries (see AddressRegs),
/// so the caller knows which named operands to look up and compare.
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    // Count the vaddr0.. operands as everything between vaddr0 and the
    // resource descriptor; otherwise the encoding has a single vaddr.
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  // SGPR_IMM forms carry a scalar offset in addition to the SMEM base.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  // SADDR forms carry a scalar address in addition to the vector address.
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
764
// Initialize this CombineInfo from \p MI: classify the instruction and cache
// every field the merging logic later compares (element size, offset, width,
// format, cache policy, and the operand indices of all address operands).
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  // Unmergeable instruction: leave the remaining fields untouched.
  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // SMRD offsets are encoded in subtarget-dependent units.
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    // Keep only the 16-bit DS offset field.
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  // Collect the operand indices of every address operand this opcode carries,
  // in a fixed order so hasSameBaseAddress() can compare position by position.
  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
850
851 } // end anonymous namespace.
852
853 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
854 "SI Load Store Optimizer", false, false)
855 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
856 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
857 false, false)
858
859 char SILoadStoreOptimizer::ID = 0;
860
861 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
862
/// Factory used by the legacy pass manager to create this pass.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
866
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)867 static void addDefsUsesToList(const MachineInstr &MI,
868 DenseSet<Register> &RegDefs,
869 DenseSet<Register> &RegUses) {
870 for (const auto &Op : MI.operands()) {
871 if (!Op.isReg())
872 continue;
873 if (Op.isDef())
874 RegDefs.insert(Op.getReg());
875 if (Op.readsReg())
876 RegUses.insert(Op.getReg());
877 }
878 }
879
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const880 bool SILoadStoreOptimizer::canSwapInstructions(
881 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
882 const MachineInstr &A, const MachineInstr &B) const {
883 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
884 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
885 return false;
886 for (const auto &BOp : B.operands()) {
887 if (!BOp.isReg())
888 continue;
889 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
890 return false;
891 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
892 return false;
893 }
894 return true;
895 }
896
897 // Given that \p CI and \p Paired are adjacent memory operations produce a new
898 // MMO for the combined operation with a new access size.
899 MachineMemOperand *
combineKnownAdjacentMMOs(const CombineInfo & CI,const CombineInfo & Paired)900 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
901 const CombineInfo &Paired) {
902 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
903 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
904
905 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
906
907 // A base pointer for the combined operation is the same as the leading
908 // operation's pointer.
909 if (Paired < CI)
910 std::swap(MMOa, MMOb);
911
912 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
913 // If merging FLAT and GLOBAL set address space to FLAT.
914 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
915 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
916
917 MachineFunction *MF = CI.I->getMF();
918 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
919 }
920
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)921 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
922 const SIInstrInfo &TII,
923 const CombineInfo &Paired) {
924 assert(CI.InstClass == MIMG);
925
926 // Ignore instructions with tfe/lwe set.
927 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
928 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
929
930 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
931 return false;
932
933 // Check other optional immediate operands for equality.
934 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
937
938 for (auto op : OperandsToMatch) {
939 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
941 return false;
942 if (Idx != -1 &&
943 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
944 return false;
945 }
946
947 // Check DMask for overlaps.
948 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
950
951 if (!MaxMask)
952 return false;
953
954 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
955 if ((1u << AllowedBitsForMin) <= MinMask)
956 return false;
957
958 return true;
959 }
960
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)961 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962 unsigned ComponentCount,
963 const GCNSubtarget &STI) {
964 if (ComponentCount > 4)
965 return 0;
966
967 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969 if (!OldFormatInfo)
970 return 0;
971
972 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974 ComponentCount,
975 OldFormatInfo->NumFormat, STI);
976
977 if (!NewFormatInfo)
978 return 0;
979
980 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982
983 return NewFormatInfo->Format;
984 }
985
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  // Any bit of Hi strictly below the highest bit where (Lo - 1) and Hi differ
  // can be cleared while the result stays in [Lo, Hi]; clearing all of them
  // maximizes the alignment. Smear the highest set bit of the difference
  // downward to build a mask of those clearable low bits, then drop them.
  uint32_t Diff = (Lo - 1) ^ Hi;
  Diff |= Diff >> 1;
  Diff |= Diff >> 2;
  Diff |= Diff >> 4;
  Diff |= Diff >> 8;
  Diff |= Diff >> 16;
  return Hi & ~(Diff >> 1);
}
995
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)996 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
997 const GCNSubtarget &STI,
998 CombineInfo &Paired,
999 bool Modify) {
1000 assert(CI.InstClass != MIMG);
1001
1002 // XXX - Would the same offset be OK? Is there any reason this would happen or
1003 // be useful?
1004 if (CI.Offset == Paired.Offset)
1005 return false;
1006
1007 // This won't be valid if the offset isn't aligned.
1008 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1009 return false;
1010
1011 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1012
1013 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1014 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1015 if (!Info0)
1016 return false;
1017 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1018 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1019 if (!Info1)
1020 return false;
1021
1022 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1023 Info0->NumFormat != Info1->NumFormat)
1024 return false;
1025
1026 // TODO: Should be possible to support more formats, but if format loads
1027 // are not dword-aligned, the merged load might not be valid.
1028 if (Info0->BitsPerComp != 32)
1029 return false;
1030
1031 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1032 return false;
1033 }
1034
1035 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1036 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1037 CI.UseST64 = false;
1038 CI.BaseOff = 0;
1039
1040 // Handle all non-DS instructions.
1041 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1042 if (EltOffset0 + CI.Width != EltOffset1 &&
1043 EltOffset1 + Paired.Width != EltOffset0)
1044 return false;
1045 if (CI.CPol != Paired.CPol)
1046 return false;
1047 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1048 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1049 // Reject cases like:
1050 // dword + dwordx2 -> dwordx3
1051 // dword + dwordx3 -> dwordx4
1052 // If we tried to combine these cases, we would fail to extract a subreg
1053 // for the result of the second load due to SGPR alignment requirements.
1054 if (CI.Width != Paired.Width &&
1055 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1056 return false;
1057 }
1058 return true;
1059 }
1060
1061 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1062 // the stride 64 versions.
1063 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1064 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1065 if (Modify) {
1066 CI.Offset = EltOffset0 / 64;
1067 Paired.Offset = EltOffset1 / 64;
1068 CI.UseST64 = true;
1069 }
1070 return true;
1071 }
1072
1073 // Check if the new offsets fit in the reduced 8-bit range.
1074 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1075 if (Modify) {
1076 CI.Offset = EltOffset0;
1077 Paired.Offset = EltOffset1;
1078 }
1079 return true;
1080 }
1081
1082 // Try to shift base address to decrease offsets.
1083 uint32_t Min = std::min(EltOffset0, EltOffset1);
1084 uint32_t Max = std::max(EltOffset0, EltOffset1);
1085
1086 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1087 if (((Max - Min) & ~Mask) == 0) {
1088 if (Modify) {
1089 // From the range of values we could use for BaseOff, choose the one that
1090 // is aligned to the highest power of two, to maximise the chance that
1091 // the same offset can be reused for other load/store pairs.
1092 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1093 // Copy the low bits of the offsets, so that when we adjust them by
1094 // subtracting BaseOff they will be multiples of 64.
1095 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1096 CI.BaseOff = BaseOff * CI.EltSize;
1097 CI.Offset = (EltOffset0 - BaseOff) / 64;
1098 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1099 CI.UseST64 = true;
1100 }
1101 return true;
1102 }
1103
1104 if (isUInt<8>(Max - Min)) {
1105 if (Modify) {
1106 // From the range of values we could use for BaseOff, choose the one that
1107 // is aligned to the highest power of two, to maximise the chance that
1108 // the same offset can be reused for other load/store pairs.
1109 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1110 CI.BaseOff = BaseOff * CI.EltSize;
1111 CI.Offset = EltOffset0 - BaseOff;
1112 Paired.Offset = EltOffset1 - BaseOff;
1113 }
1114 return true;
1115 }
1116
1117 return false;
1118 }
1119
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)1120 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121 const CombineInfo &CI,
1122 const CombineInfo &Paired) {
1123 const unsigned Width = (CI.Width + Paired.Width);
1124 switch (CI.InstClass) {
1125 default:
1126 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1127 case S_BUFFER_LOAD_IMM:
1128 case S_BUFFER_LOAD_SGPR_IMM:
1129 case S_LOAD_IMM:
1130 switch (Width) {
1131 default:
1132 return false;
1133 case 2:
1134 case 4:
1135 case 8:
1136 return true;
1137 case 3:
1138 return STM.hasScalarDwordx3Loads();
1139 }
1140 }
1141 }
1142
1143 const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1144 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147 }
1148 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149 return TRI->getRegClassForReg(*MRI, Src->getReg());
1150 }
1151 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152 return TRI->getRegClassForReg(*MRI, Src->getReg());
1153 }
1154 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156 }
1157 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158 return TRI->getRegClassForReg(*MRI, Src->getReg());
1159 }
1160 return nullptr;
1161 }
1162
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // The opcodes must also agree on their finer-grained subclass.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  // Pick the insertion point: hoist Paired up to CI for loads, sink CI down
  // to Paired for stores. Either way, every instruction strictly between the
  // two must be safe to swap past the one being moved.
  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    // Note: pre-decrement skips Paired.I itself and stops before CI.I.
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
1217
1218 // Copy the merged load result from DestReg to the original dest regs of CI and
1219 // Paired.
copyToDestRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName,Register DestReg) const1220 void SILoadStoreOptimizer::copyToDestRegs(
1221 CombineInfo &CI, CombineInfo &Paired,
1222 MachineBasicBlock::iterator InsertBefore, int OpName,
1223 Register DestReg) const {
1224 MachineBasicBlock *MBB = CI.I->getParent();
1225 DebugLoc DL = CI.I->getDebugLoc();
1226
1227 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1228
1229 // Copy to the old destination registers.
1230 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1231 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1233
1234 // The constrained sload instructions in S_LOAD_IMM class will have
1235 // `early-clobber` flag in the dst operand. Remove the flag before using the
1236 // MOs in copies.
1237 Dest0->setIsEarlyClobber(false);
1238 Dest1->setIsEarlyClobber(false);
1239
1240 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1241 .add(*Dest0) // Copy to same destination including flags and sub reg.
1242 .addReg(DestReg, 0, SubRegIdx0);
1243 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1244 .add(*Dest1)
1245 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1246 }
1247
1248 // Return a register for the source of the merged store after copying the
1249 // original source regs of CI and Paired into it.
1250 Register
copyFromSrcRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName) const1251 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252 MachineBasicBlock::iterator InsertBefore,
1253 int OpName) const {
1254 MachineBasicBlock *MBB = CI.I->getParent();
1255 DebugLoc DL = CI.I->getDebugLoc();
1256
1257 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258
1259 // Copy to the new source register.
1260 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262
1263 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265
1266 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267 .add(*Src0)
1268 .addImm(SubRegIdx0)
1269 .add(*Src1)
1270 .addImm(SubRegIdx1);
1271
1272 return SrcReg;
1273 }
1274
read2Opcode(unsigned EltSize) const1275 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1276 if (STM->ldsRequiresM0Init())
1277 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1278 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1279 }
1280
read2ST64Opcode(unsigned EltSize) const1281 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1282 if (STM->ldsRequiresM0Init())
1283 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1284
1285 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1286 : AMDGPU::DS_READ2ST64_B64_gfx9;
1287 }
1288
// Replace a pair of DS reads with one ds_read2/ds_read2st64 writing a
// super-register, then split that register back into the original dests.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  // Offsets were already rewritten to the read2 encoding by
  // offsetsCanBeCombined(Modify=true); canonicalize smaller one first.
  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // offsetsCanBeCombined chose a base adjustment: materialize it and add it
    // to the original address so the 8-bit offsets are relative to it.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
1347
write2Opcode(unsigned EltSize) const1348 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1349 if (STM->ldsRequiresM0Init())
1350 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1351 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1352 : AMDGPU::DS_WRITE2_B64_gfx9;
1353 }
1354
write2ST64Opcode(unsigned EltSize) const1355 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1356 if (STM->ldsRequiresM0Init())
1357 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1358 : AMDGPU::DS_WRITE2ST64_B64;
1359
1360 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1361 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1362 }
1363
mergeWrite2Pair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1365 CombineInfo &CI, CombineInfo &Paired,
1366 MachineBasicBlock::iterator InsertBefore) {
1367 MachineBasicBlock *MBB = CI.I->getParent();
1368
1369 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1370 // sure we preserve the subregister index and any register flags set on them.
1371 const MachineOperand *AddrReg =
1372 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1373 const MachineOperand *Data0 =
1374 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1375 const MachineOperand *Data1 =
1376 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1377
1378 unsigned NewOffset0 = CI.Offset;
1379 unsigned NewOffset1 = Paired.Offset;
1380 unsigned Opc =
1381 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1382
1383 if (NewOffset0 > NewOffset1) {
1384 // Canonicalize the merged instruction so the smaller offset comes first.
1385 std::swap(NewOffset0, NewOffset1);
1386 std::swap(Data0, Data1);
1387 }
1388
1389 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1390 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1391
1392 const MCInstrDesc &Write2Desc = TII->get(Opc);
1393 DebugLoc DL = CI.I->getDebugLoc();
1394
1395 Register BaseReg = AddrReg->getReg();
1396 unsigned BaseSubReg = AddrReg->getSubReg();
1397 unsigned BaseRegFlags = 0;
1398 if (CI.BaseOff) {
1399 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1400 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1401 .addImm(CI.BaseOff);
1402
1403 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1404 BaseRegFlags = RegState::Kill;
1405
1406 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1407 .addReg(ImmReg)
1408 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1409 .addImm(0); // clamp bit
1410 BaseSubReg = 0;
1411 }
1412
1413 MachineInstrBuilder Write2 =
1414 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1415 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1416 .add(*Data0) // data0
1417 .add(*Data1) // data1
1418 .addImm(NewOffset0) // offset0
1419 .addImm(NewOffset1) // offset1
1420 .addImm(0) // gds
1421 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1422
1423 CI.I->eraseFromParent();
1424 Paired.I->eraseFromParent();
1425
1426 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1427 return Write2;
1428 }
1429
// Merge two MIMG loads into one by OR'ing their dmasks; all other operands
// are cloned from CI's instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone CI's operands, skipping the def at index 0 and substituting the
  // combined dmask immediate.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1465
mergeSMemLoadImmPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1466 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1467 CombineInfo &CI, CombineInfo &Paired,
1468 MachineBasicBlock::iterator InsertBefore) {
1469 MachineBasicBlock *MBB = CI.I->getParent();
1470 DebugLoc DL = CI.I->getDebugLoc();
1471 const unsigned Opcode = getNewOpcode(CI, Paired);
1472
1473 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1474
1475 Register DestReg = MRI->createVirtualRegister(SuperRC);
1476 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1477
1478 // It shouldn't be possible to get this far if the two instructions
1479 // don't have a single memoperand, because MachineInstr::mayAlias()
1480 // will return true if this is the case.
1481 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1482
1483 MachineInstrBuilder New =
1484 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1485 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1486 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1487 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1488 New.addImm(MergedOffset);
1489 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1490
1491 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1492
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1495 return New;
1496 }
1497
mergeBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1498 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1499 CombineInfo &CI, CombineInfo &Paired,
1500 MachineBasicBlock::iterator InsertBefore) {
1501 MachineBasicBlock *MBB = CI.I->getParent();
1502 DebugLoc DL = CI.I->getDebugLoc();
1503
1504 const unsigned Opcode = getNewOpcode(CI, Paired);
1505
1506 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1507
1508 // Copy to the new source register.
1509 Register DestReg = MRI->createVirtualRegister(SuperRC);
1510 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1511
1512 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1513
1514 AddressRegs Regs = getRegs(Opcode, *TII);
1515
1516 if (Regs.VAddr)
1517 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1518
1519 // It shouldn't be possible to get this far if the two instructions
1520 // don't have a single memoperand, because MachineInstr::mayAlias()
1521 // will return true if this is the case.
1522 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1523
1524 MachineInstr *New =
1525 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1526 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1527 .addImm(MergedOffset) // offset
1528 .addImm(CI.CPol) // cpol
1529 .addImm(0) // swz
1530 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1531
1532 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1533
1534 CI.I->eraseFromParent();
1535 Paired.I->eraseFromParent();
1536 return New;
1537 }
1538
mergeTBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1539 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1540 CombineInfo &CI, CombineInfo &Paired,
1541 MachineBasicBlock::iterator InsertBefore) {
1542 MachineBasicBlock *MBB = CI.I->getParent();
1543 DebugLoc DL = CI.I->getDebugLoc();
1544
1545 const unsigned Opcode = getNewOpcode(CI, Paired);
1546
1547 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1548
1549 // Copy to the new source register.
1550 Register DestReg = MRI->createVirtualRegister(SuperRC);
1551 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1552
1553 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1554
1555 AddressRegs Regs = getRegs(Opcode, *TII);
1556
1557 if (Regs.VAddr)
1558 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559
1560 unsigned JoinedFormat =
1561 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562
1563 // It shouldn't be possible to get this far if the two instructions
1564 // don't have a single memoperand, because MachineInstr::mayAlias()
1565 // will return true if this is the case.
1566 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567
1568 MachineInstr *New =
1569 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571 .addImm(MergedOffset) // offset
1572 .addImm(JoinedFormat) // format
1573 .addImm(CI.CPol) // cpol
1574 .addImm(0) // swz
1575 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576
1577 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1578
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1581 return New;
1582 }
1583
mergeTBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1584 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1585 CombineInfo &CI, CombineInfo &Paired,
1586 MachineBasicBlock::iterator InsertBefore) {
1587 MachineBasicBlock *MBB = CI.I->getParent();
1588 DebugLoc DL = CI.I->getDebugLoc();
1589
1590 const unsigned Opcode = getNewOpcode(CI, Paired);
1591
1592 Register SrcReg =
1593 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1594
1595 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1596 .addReg(SrcReg, RegState::Kill);
1597
1598 AddressRegs Regs = getRegs(Opcode, *TII);
1599
1600 if (Regs.VAddr)
1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1602
1603 unsigned JoinedFormat =
1604 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1605
1606 // It shouldn't be possible to get this far if the two instructions
1607 // don't have a single memoperand, because MachineInstr::mayAlias()
1608 // will return true if this is the case.
1609 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1610
1611 MachineInstr *New =
1612 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1615 .addImm(JoinedFormat) // format
1616 .addImm(CI.CPol) // cpol
1617 .addImm(0) // swz
1618 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1619
1620 CI.I->eraseFromParent();
1621 Paired.I->eraseFromParent();
1622 return New;
1623 }
1624
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1626 CombineInfo &CI, CombineInfo &Paired,
1627 MachineBasicBlock::iterator InsertBefore) {
1628 MachineBasicBlock *MBB = CI.I->getParent();
1629 DebugLoc DL = CI.I->getDebugLoc();
1630
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1632
1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634 Register DestReg = MRI->createVirtualRegister(SuperRC);
1635
1636 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1637
1638 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1639 MIB.add(*SAddr);
1640
1641 MachineInstr *New =
1642 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1643 .addImm(std::min(CI.Offset, Paired.Offset))
1644 .addImm(CI.CPol)
1645 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1646
1647 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1648
1649 CI.I->eraseFromParent();
1650 Paired.I->eraseFromParent();
1651 return New;
1652 }
1653
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1654 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1655 CombineInfo &CI, CombineInfo &Paired,
1656 MachineBasicBlock::iterator InsertBefore) {
1657 MachineBasicBlock *MBB = CI.I->getParent();
1658 DebugLoc DL = CI.I->getDebugLoc();
1659
1660 const unsigned Opcode = getNewOpcode(CI, Paired);
1661
1662 Register SrcReg =
1663 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1664
1665 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1666 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1667 .addReg(SrcReg, RegState::Kill);
1668
1669 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1670 MIB.add(*SAddr);
1671
1672 MachineInstr *New =
1673 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1674 .addImm(CI.CPol)
1675 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676
1677 CI.I->eraseFromParent();
1678 Paired.I->eraseFromParent();
1679 return New;
1680 }
1681
getNewOpcode(const CombineInfo & CI,const CombineInfo & Paired)1682 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683 const CombineInfo &Paired) {
1684 const unsigned Width = CI.Width + Paired.Width;
1685
1686 switch (getCommonInstClass(CI, Paired)) {
1687 default:
1688 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1689 // FIXME: Handle d16 correctly
1690 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1691 Width);
1692 case TBUFFER_LOAD:
1693 case TBUFFER_STORE:
1694 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1695 Width);
1696
1697 case UNKNOWN:
1698 llvm_unreachable("Unknown instruction class");
1699 case S_BUFFER_LOAD_IMM:
1700 switch (Width) {
1701 default:
1702 return 0;
1703 case 2:
1704 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1705 case 3:
1706 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1707 case 4:
1708 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709 case 8:
1710 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1711 }
1712 case S_BUFFER_LOAD_SGPR_IMM:
1713 switch (Width) {
1714 default:
1715 return 0;
1716 case 2:
1717 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1718 case 3:
1719 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720 case 4:
1721 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722 case 8:
1723 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724 }
1725 case S_LOAD_IMM: {
1726 // If XNACK is enabled, use the constrained opcodes when the first load is
1727 // under-aligned.
1728 const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729 bool NeedsConstrainedOpc =
1730 STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1731 switch (Width) {
1732 default:
1733 return 0;
1734 case 2:
1735 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736 : AMDGPU::S_LOAD_DWORDX2_IMM;
1737 case 3:
1738 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739 : AMDGPU::S_LOAD_DWORDX3_IMM;
1740 case 4:
1741 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742 : AMDGPU::S_LOAD_DWORDX4_IMM;
1743 case 8:
1744 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745 : AMDGPU::S_LOAD_DWORDX8_IMM;
1746 }
1747 }
1748 case GLOBAL_LOAD:
1749 switch (Width) {
1750 default:
1751 return 0;
1752 case 2:
1753 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1754 case 3:
1755 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1756 case 4:
1757 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1758 }
1759 case GLOBAL_LOAD_SADDR:
1760 switch (Width) {
1761 default:
1762 return 0;
1763 case 2:
1764 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1765 case 3:
1766 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1767 case 4:
1768 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1769 }
1770 case GLOBAL_STORE:
1771 switch (Width) {
1772 default:
1773 return 0;
1774 case 2:
1775 return AMDGPU::GLOBAL_STORE_DWORDX2;
1776 case 3:
1777 return AMDGPU::GLOBAL_STORE_DWORDX3;
1778 case 4:
1779 return AMDGPU::GLOBAL_STORE_DWORDX4;
1780 }
1781 case GLOBAL_STORE_SADDR:
1782 switch (Width) {
1783 default:
1784 return 0;
1785 case 2:
1786 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1787 case 3:
1788 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1789 case 4:
1790 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1791 }
1792 case FLAT_LOAD:
1793 switch (Width) {
1794 default:
1795 return 0;
1796 case 2:
1797 return AMDGPU::FLAT_LOAD_DWORDX2;
1798 case 3:
1799 return AMDGPU::FLAT_LOAD_DWORDX3;
1800 case 4:
1801 return AMDGPU::FLAT_LOAD_DWORDX4;
1802 }
1803 case FLAT_STORE:
1804 switch (Width) {
1805 default:
1806 return 0;
1807 case 2:
1808 return AMDGPU::FLAT_STORE_DWORDX2;
1809 case 3:
1810 return AMDGPU::FLAT_STORE_DWORDX3;
1811 case 4:
1812 return AMDGPU::FLAT_STORE_DWORDX4;
1813 }
1814 case MIMG:
1815 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1816 "No overlaps");
1817 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1818 }
1819 }
1820
1821 std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1822 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823 const CombineInfo &Paired) {
1824 assert((CI.InstClass != MIMG ||
1825 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1826 CI.Width + Paired.Width)) &&
1827 "No overlaps");
1828
1829 unsigned Idx0;
1830 unsigned Idx1;
1831
1832 static const unsigned Idxs[5][4] = {
1833 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1834 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1835 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1836 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1837 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1838 };
1839
1840 assert(CI.Width >= 1 && CI.Width <= 4);
1841 assert(Paired.Width >= 1 && Paired.Width <= 4);
1842
1843 if (Paired < CI) {
1844 Idx1 = Idxs[0][Paired.Width - 1];
1845 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1846 } else {
1847 Idx0 = Idxs[0][CI.Width - 1];
1848 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1849 }
1850
1851 return {Idx0, Idx1};
1852 }
1853
1854 const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired) const1855 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856 const CombineInfo &Paired) const {
1857 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859 switch (CI.Width + Paired.Width) {
1860 default:
1861 return nullptr;
1862 case 2:
1863 return &AMDGPU::SReg_64_XEXECRegClass;
1864 case 3:
1865 return &AMDGPU::SGPR_96RegClass;
1866 case 4:
1867 return &AMDGPU::SGPR_128RegClass;
1868 case 8:
1869 return &AMDGPU::SGPR_256RegClass;
1870 case 16:
1871 return &AMDGPU::SGPR_512RegClass;
1872 }
1873 }
1874
1875 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1876 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877 ? TRI->getAGPRClassForBitWidth(BitWidth)
1878 : TRI->getVGPRClassForBitWidth(BitWidth);
1879 }
1880
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1881 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1882 CombineInfo &CI, CombineInfo &Paired,
1883 MachineBasicBlock::iterator InsertBefore) {
1884 MachineBasicBlock *MBB = CI.I->getParent();
1885 DebugLoc DL = CI.I->getDebugLoc();
1886
1887 const unsigned Opcode = getNewOpcode(CI, Paired);
1888
1889 Register SrcReg =
1890 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1891
1892 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1893 .addReg(SrcReg, RegState::Kill);
1894
1895 AddressRegs Regs = getRegs(Opcode, *TII);
1896
1897 if (Regs.VAddr)
1898 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1899
1900
1901 // It shouldn't be possible to get this far if the two instructions
1902 // don't have a single memoperand, because MachineInstr::mayAlias()
1903 // will return true if this is the case.
1904 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1905
1906 MachineInstr *New =
1907 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1908 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1910 .addImm(CI.CPol) // cpol
1911 .addImm(0) // swz
1912 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1913
1914 CI.I->eraseFromParent();
1915 Paired.I->eraseFromParent();
1916 return New;
1917 }
1918
1919 MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const1920 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1921 APInt V(32, Val, true);
1922 if (TII->isInlineConstant(V))
1923 return MachineOperand::CreateImm(Val);
1924
1925 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1926 MachineInstr *Mov =
1927 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1928 TII->get(AMDGPU::S_MOV_B32), Reg)
1929 .addImm(Val);
1930 (void)Mov;
1931 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1932 return MachineOperand::CreateReg(Reg, false);
1933 }
1934
1935 // Compute base address using Addr and return the final register.
computeBase(MachineInstr & MI,const MemAddress & Addr) const1936 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1937 const MemAddress &Addr) const {
1938 MachineBasicBlock *MBB = MI.getParent();
1939 MachineBasicBlock::iterator MBBI = MI.getIterator();
1940 DebugLoc DL = MI.getDebugLoc();
1941
1942 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1943 Addr.Base.LoSubReg) &&
1944 "Expected 32-bit Base-Register-Low!!");
1945
1946 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1947 Addr.Base.HiSubReg) &&
1948 "Expected 32-bit Base-Register-Hi!!");
1949
1950 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1951 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1952 MachineOperand OffsetHi =
1953 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1954
1955 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1956 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1957 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1958
1959 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1960 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1961 MachineInstr *LoHalf =
1962 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1963 .addReg(CarryReg, RegState::Define)
1964 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1965 .add(OffsetLo)
1966 .addImm(0); // clamp bit
1967 (void)LoHalf;
1968 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1969
1970 MachineInstr *HiHalf =
1971 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1972 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1973 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1974 .add(OffsetHi)
1975 .addReg(CarryReg, RegState::Kill)
1976 .addImm(0); // clamp bit
1977 (void)HiHalf;
1978 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1979
1980 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1981 MachineInstr *FullBase =
1982 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1983 .addReg(DestSub0)
1984 .addImm(AMDGPU::sub0)
1985 .addReg(DestSub1)
1986 .addImm(AMDGPU::sub1);
1987 (void)FullBase;
1988 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1989
1990 return FullDestReg;
1991 }
1992
1993 // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const1994 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1995 Register NewBase,
1996 int32_t NewOffset) const {
1997 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998 Base->setReg(NewBase);
1999 Base->setIsKill(false);
2000 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2001 }
2002
2003 std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const2004 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2005 if (Op.isImm())
2006 return Op.getImm();
2007
2008 if (!Op.isReg())
2009 return std::nullopt;
2010
2011 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2012 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013 !Def->getOperand(1).isImm())
2014 return std::nullopt;
2015
2016 return Def->getOperand(1).getImm();
2017 }
2018
2019 // Analyze Base and extracts:
2020 // - 32bit base registers, subregisters
2021 // - 64bit constant offset
2022 // Expecting base computation as:
2023 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
2024 // %LO:vgpr_32, %c:sreg_64_xexec =
2025 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2026 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2027 // %Base:vreg_64 =
2028 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
processBaseWithConstOffset(const MachineOperand & Base,MemAddress & Addr) const2029 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2030 MemAddress &Addr) const {
2031 if (!Base.isReg())
2032 return;
2033
2034 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2035 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2036 || Def->getNumOperands() != 5)
2037 return;
2038
2039 MachineOperand BaseLo = Def->getOperand(1);
2040 MachineOperand BaseHi = Def->getOperand(3);
2041 if (!BaseLo.isReg() || !BaseHi.isReg())
2042 return;
2043
2044 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2045 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2046
2047 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2048 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2049 return;
2050
2051 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2052 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2053
2054 auto Offset0P = extractConstOffset(*Src0);
2055 if (Offset0P)
2056 BaseLo = *Src1;
2057 else {
2058 if (!(Offset0P = extractConstOffset(*Src1)))
2059 return;
2060 BaseLo = *Src0;
2061 }
2062
2063 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2064 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2065
2066 if (Src0->isImm())
2067 std::swap(Src0, Src1);
2068
2069 if (!Src1->isImm() || Src0->isImm())
2070 return;
2071
2072 uint64_t Offset1 = Src1->getImm();
2073 BaseHi = *Src0;
2074
2075 Addr.Base.LoReg = BaseLo.getReg();
2076 Addr.Base.HiReg = BaseHi.getReg();
2077 Addr.Base.LoSubReg = BaseLo.getSubReg();
2078 Addr.Base.HiSubReg = BaseHi.getSubReg();
2079 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2080 }
2081
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const2082 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2083 MachineInstr &MI,
2084 MemInfoMap &Visited,
2085 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2086
2087 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2088 return false;
2089
2090 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091 if (SIInstrInfo::isFLATScratch(MI))
2092 return false;
2093
2094 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095 : AMDGPUAS::FLAT_ADDRESS;
2096
2097 if (AnchorList.count(&MI))
2098 return false;
2099
2100 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2101
2102 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2103 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2104 return false;
2105 }
2106
2107 // Step1: Find the base-registers and a 64bit constant offset.
2108 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2109 MemAddress MAddr;
2110 if (!Visited.contains(&MI)) {
2111 processBaseWithConstOffset(Base, MAddr);
2112 Visited[&MI] = MAddr;
2113 } else
2114 MAddr = Visited[&MI];
2115
2116 if (MAddr.Offset == 0) {
2117 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2118 " constant offsets that can be promoted.\n";);
2119 return false;
2120 }
2121
2122 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2123 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2124
2125 // Step2: Traverse through MI's basic block and find an anchor(that has the
2126 // same base-registers) with the highest 13bit distance from MI's offset.
2127 // E.g. (64bit loads)
2128 // bb:
2129 // addr1 = &a + 4096; load1 = load(addr1, 0)
2130 // addr2 = &a + 6144; load2 = load(addr2, 0)
2131 // addr3 = &a + 8192; load3 = load(addr3, 0)
2132 // addr4 = &a + 10240; load4 = load(addr4, 0)
2133 // addr5 = &a + 12288; load5 = load(addr5, 0)
2134 //
2135 // Starting from the first load, the optimization will try to find a new base
2136 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2137 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2138 // as the new-base(anchor) because of the maximum distance which can
2139 // accommodate more intermediate bases presumably.
2140 //
2141 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2142 // (&a + 8192) for load1, load2, load4.
2143 // addr = &a + 8192
2144 // load1 = load(addr, -4096)
2145 // load2 = load(addr, -2048)
2146 // load3 = load(addr, 0)
2147 // load4 = load(addr, 2048)
2148 // addr5 = &a + 12288; load5 = load(addr5, 0)
2149 //
2150 MachineInstr *AnchorInst = nullptr;
2151 MemAddress AnchorAddr;
2152 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2153 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2154
2155 MachineBasicBlock *MBB = MI.getParent();
2156 MachineBasicBlock::iterator E = MBB->end();
2157 MachineBasicBlock::iterator MBBI = MI.getIterator();
2158 ++MBBI;
2159 const SITargetLowering *TLI =
2160 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2161
2162 for ( ; MBBI != E; ++MBBI) {
2163 MachineInstr &MINext = *MBBI;
2164 // TODO: Support finding an anchor(with same base) from store addresses or
2165 // any other load addresses where the opcodes are different.
2166 if (MINext.getOpcode() != MI.getOpcode() ||
2167 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2168 continue;
2169
2170 const MachineOperand &BaseNext =
2171 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2172 MemAddress MAddrNext;
2173 if (!Visited.contains(&MINext)) {
2174 processBaseWithConstOffset(BaseNext, MAddrNext);
2175 Visited[&MINext] = MAddrNext;
2176 } else
2177 MAddrNext = Visited[&MINext];
2178
2179 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2180 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2181 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2182 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2183 continue;
2184
2185 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2186
2187 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2188 TargetLoweringBase::AddrMode AM;
2189 AM.HasBaseReg = true;
2190 AM.BaseOffs = Dist;
2191 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2192 (uint32_t)std::abs(Dist) > MaxDist) {
2193 MaxDist = std::abs(Dist);
2194
2195 AnchorAddr = MAddrNext;
2196 AnchorInst = &MINext;
2197 }
2198 }
2199
2200 if (AnchorInst) {
2201 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2202 AnchorInst->dump());
2203 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2204 << AnchorAddr.Offset << "\n\n");
2205
2206 // Instead of moving up, just re-compute anchor-instruction's base address.
2207 Register Base = computeBase(MI, AnchorAddr);
2208
2209 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2210 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2211
2212 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2213 TargetLoweringBase::AddrMode AM;
2214 AM.HasBaseReg = true;
2215 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2216
2217 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2219 OtherMI->dump());
2220 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2222 }
2223 }
2224 AnchorList.insert(AnchorInst);
2225 return true;
2226 }
2227
2228 return false;
2229 }
2230
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const2231 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2232 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2233 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234 if (AddrList.front().InstClass == CI.InstClass &&
2235 AddrList.front().IsAGPR == CI.IsAGPR &&
2236 AddrList.front().hasSameBaseAddress(CI)) {
2237 AddrList.emplace_back(CI);
2238 return;
2239 }
2240 }
2241
2242 // Base address not found, so add a new list.
2243 MergeableInsts.emplace_back(1, CI);
2244 }
2245
2246 std::pair<MachineBasicBlock::iterator, bool>
collectMergeableInsts(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList,std::list<std::list<CombineInfo>> & MergeableInsts) const2247 SILoadStoreOptimizer::collectMergeableInsts(
2248 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2249 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2250 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2251 bool Modified = false;
2252
2253 // Sort potential mergeable instructions into lists. One list per base address.
2254 unsigned Order = 0;
2255 MachineBasicBlock::iterator BlockI = Begin;
2256 for (; BlockI != End; ++BlockI) {
2257 MachineInstr &MI = *BlockI;
2258
2259 // We run this before checking if an address is mergeable, because it can produce
2260 // better code even if the instructions aren't mergeable.
2261 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2262 Modified = true;
2263
2264 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2265 // barriers. We can look after this barrier for separate merges.
2266 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2267 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2268
2269 // Search will resume after this instruction in a separate merge list.
2270 ++BlockI;
2271 break;
2272 }
2273
2274 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2275 if (InstClass == UNKNOWN)
2276 continue;
2277
2278 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2279 int Swizzled =
2280 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2281 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2282 continue;
2283
2284 CombineInfo CI;
2285 CI.setMI(MI, *this);
2286 CI.Order = Order++;
2287
2288 if (!CI.hasMergeableAddress(*MRI))
2289 continue;
2290
2291 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2292 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2293 // operands. However we are reporting that ds_write2 shall have
2294 // only VGPR data so that machine copy propagation does not
2295 // create an illegal instruction with a VGPR and AGPR sources.
2296 // Consequenctially if we create such instruction the verifier
2297 // will complain.
2298 continue;
2299 }
2300
2301 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2302
2303 addInstToMergeableList(CI, MergeableInsts);
2304 }
2305
2306 // At this point we have lists of Mergeable instructions.
2307 //
2308 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2309 // list try to find an instruction that can be merged with I. If an instruction
2310 // is found, it is stored in the Paired field. If no instructions are found, then
2311 // the CombineInfo object is deleted from the list.
2312
2313 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2314 E = MergeableInsts.end(); I != E;) {
2315
2316 std::list<CombineInfo> &MergeList = *I;
2317 if (MergeList.size() <= 1) {
2318 // This means we have found only one instruction with a given address
2319 // that can be merged, and we need at least 2 instructions to do a merge,
2320 // so this list can be discarded.
2321 I = MergeableInsts.erase(I);
2322 continue;
2323 }
2324
2325 // Sort the lists by offsets, this way mergeable instructions will be
2326 // adjacent to each other in the list, which will make it easier to find
2327 // matches.
2328 MergeList.sort(
2329 [] (const CombineInfo &A, const CombineInfo &B) {
2330 return A.Offset < B.Offset;
2331 });
2332 ++I;
2333 }
2334
2335 return {BlockI, Modified};
2336 }
2337
2338 // Scan through looking for adjacent LDS operations with constant offsets from
2339 // the same base register. We rely on the scheduler to do the hard work of
2340 // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)2341 bool SILoadStoreOptimizer::optimizeBlock(
2342 std::list<std::list<CombineInfo> > &MergeableInsts) {
2343 bool Modified = false;
2344
2345 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2346 E = MergeableInsts.end(); I != E;) {
2347 std::list<CombineInfo> &MergeList = *I;
2348
2349 bool OptimizeListAgain = false;
2350 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2351 // We weren't able to make any changes, so delete the list so we don't
2352 // process the same instructions the next time we try to optimize this
2353 // block.
2354 I = MergeableInsts.erase(I);
2355 continue;
2356 }
2357
2358 Modified = true;
2359
2360 // We made changes, but also determined that there were no more optimization
2361 // opportunities, so we don't need to reprocess the list
2362 if (!OptimizeListAgain) {
2363 I = MergeableInsts.erase(I);
2364 continue;
2365 }
2366 OptimizeAgain = true;
2367 }
2368 return Modified;
2369 }
2370
2371 bool
optimizeInstsWithSameBaseAddr(std::list<CombineInfo> & MergeList,bool & OptimizeListAgain)2372 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2373 std::list<CombineInfo> &MergeList,
2374 bool &OptimizeListAgain) {
2375 if (MergeList.empty())
2376 return false;
2377
2378 bool Modified = false;
2379
2380 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2381 Next = std::next(I)) {
2382
2383 auto First = I;
2384 auto Second = Next;
2385
2386 if ((*First).Order > (*Second).Order)
2387 std::swap(First, Second);
2388 CombineInfo &CI = *First;
2389 CombineInfo &Paired = *Second;
2390
2391 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2392 if (!Where) {
2393 ++I;
2394 continue;
2395 }
2396
2397 Modified = true;
2398
2399 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2400
2401 MachineBasicBlock::iterator NewMI;
2402 switch (CI.InstClass) {
2403 default:
2404 llvm_unreachable("unknown InstClass");
2405 break;
2406 case DS_READ:
2407 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2408 break;
2409 case DS_WRITE:
2410 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2411 break;
2412 case S_BUFFER_LOAD_IMM:
2413 case S_BUFFER_LOAD_SGPR_IMM:
2414 case S_LOAD_IMM:
2415 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2416 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2417 break;
2418 case BUFFER_LOAD:
2419 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2420 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2421 break;
2422 case BUFFER_STORE:
2423 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2424 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2425 break;
2426 case MIMG:
2427 NewMI = mergeImagePair(CI, Paired, Where->I);
2428 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2429 break;
2430 case TBUFFER_LOAD:
2431 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2432 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2433 break;
2434 case TBUFFER_STORE:
2435 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2436 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2437 break;
2438 case FLAT_LOAD:
2439 case GLOBAL_LOAD:
2440 case GLOBAL_LOAD_SADDR:
2441 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2442 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2443 break;
2444 case FLAT_STORE:
2445 case GLOBAL_STORE:
2446 case GLOBAL_STORE_SADDR:
2447 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2448 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2449 break;
2450 }
2451 CI.setMI(NewMI, *this);
2452 CI.Order = Where->Order;
2453 if (I == Second)
2454 I = Next;
2455
2456 MergeList.erase(Second);
2457 }
2458
2459 return Modified;
2460 }
2461
runOnMachineFunction(MachineFunction & MF)2462 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2463 if (skipFunction(MF.getFunction()))
2464 return false;
2465
2466 STM = &MF.getSubtarget<GCNSubtarget>();
2467 if (!STM->loadStoreOptEnabled())
2468 return false;
2469
2470 TII = STM->getInstrInfo();
2471 TRI = &TII->getRegisterInfo();
2472
2473 MRI = &MF.getRegInfo();
2474 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2475
2476 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2477
2478 bool Modified = false;
2479
2480 // Contains the list of instructions for which constant offsets are being
2481 // promoted to the IMM. This is tracked for an entire block at time.
2482 SmallPtrSet<MachineInstr *, 4> AnchorList;
2483 MemInfoMap Visited;
2484
2485 for (MachineBasicBlock &MBB : MF) {
2486 MachineBasicBlock::iterator SectionEnd;
2487 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2488 I = SectionEnd) {
2489 bool CollectModified;
2490 std::list<std::list<CombineInfo>> MergeableInsts;
2491
2492 // First pass: Collect list of all instructions we know how to merge in a
2493 // subset of the block.
2494 std::tie(SectionEnd, CollectModified) =
2495 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2496
2497 Modified |= CollectModified;
2498
2499 do {
2500 OptimizeAgain = false;
2501 Modified |= optimizeBlock(MergeableInsts);
2502 } while (OptimizeAgain);
2503 }
2504
2505 Visited.clear();
2506 AnchorList.clear();
2507 }
2508
2509 return Modified;
2510 }
2511