1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset into the instruction's
23 // immediate field by adjusting the base. It looks for a base used by nearby
24 // instructions that leaves a 13-bit constant offset, which is then promoted
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently misses stores of constants because the load of the
46 //   constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Recomputing live intervals seems inefficient. The pass currently matches
50 //   one pair, recomputes live intervals, and moves on to the next pair. It
51 //   would be better to first compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields but are close enough to each other, we can add to the base
56 //   pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo; they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145          return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge instructions with other physical reg
165         // addresses too.
166         if (AddrOp->getReg().isPhysical() &&
167             AddrOp->getReg() != AMDGPU::SGPR_NULL)
168           return false;
169 
170         // If an address has only one use then there will be no other
171         // instructions with the same address, so we can't merge this one.
172         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173           return false;
174       }
175       return true;
176     }
177 
178     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179 
180     // Compare by pointer order.
181     bool operator<(const CombineInfo& Other) const {
182       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183     }
184   };
185 
186   struct BaseRegisters {
187     Register LoReg;
188     Register HiReg;
189 
190     unsigned LoSubReg = 0;
191     unsigned HiSubReg = 0;
192   };
193 
194   struct MemAddress {
195     BaseRegisters Base;
196     int64_t Offset = 0;
197   };
198 
199   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200 
201 private:
202   const GCNSubtarget *STM = nullptr;
203   const SIInstrInfo *TII = nullptr;
204   const SIRegisterInfo *TRI = nullptr;
205   MachineRegisterInfo *MRI = nullptr;
206   AliasAnalysis *AA = nullptr;
207   bool OptimizeAgain;
208 
209   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                            const DenseSet<Register> &ARegUses,
211                            const MachineInstr &A, const MachineInstr &B) const;
212   static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                   const SIInstrInfo &TII,
214                                   const CombineInfo &Paired);
215   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                    CombineInfo &Paired, bool Modify = false);
217   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                         const CombineInfo &Paired);
219   unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                      const CombineInfo &Paired);
222   const TargetRegisterClass *
223   getTargetRegisterClass(const CombineInfo &CI,
224                          const CombineInfo &Paired) const;
225   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
226 
227   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
228 
229   void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
230                       MachineBasicBlock::iterator InsertBefore, int OpName,
231                       Register DestReg) const;
232   Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
233                            MachineBasicBlock::iterator InsertBefore,
234                            int OpName) const;
235 
236   unsigned read2Opcode(unsigned EltSize) const;
237   unsigned read2ST64Opcode(unsigned EltSize) const;
238   MachineBasicBlock::iterator
239   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
240                  MachineBasicBlock::iterator InsertBefore);
241 
242   unsigned write2Opcode(unsigned EltSize) const;
243   unsigned write2ST64Opcode(unsigned EltSize) const;
244   MachineBasicBlock::iterator
245   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
246                   MachineBasicBlock::iterator InsertBefore);
247   MachineBasicBlock::iterator
248   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
249                  MachineBasicBlock::iterator InsertBefore);
250   MachineBasicBlock::iterator
251   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
252                        MachineBasicBlock::iterator InsertBefore);
253   MachineBasicBlock::iterator
254   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
255                       MachineBasicBlock::iterator InsertBefore);
256   MachineBasicBlock::iterator
257   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
258                        MachineBasicBlock::iterator InsertBefore);
259   MachineBasicBlock::iterator
260   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
261                        MachineBasicBlock::iterator InsertBefore);
262   MachineBasicBlock::iterator
263   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
264                         MachineBasicBlock::iterator InsertBefore);
265   MachineBasicBlock::iterator
266   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
267                     MachineBasicBlock::iterator InsertBefore);
268   MachineBasicBlock::iterator
269   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
270                      MachineBasicBlock::iterator InsertBefore);
271 
272   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
273                            int32_t NewOffset) const;
274   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
275   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
276   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
277   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
278   /// Promotes a constant offset to the immediate by adjusting the base. It
279   /// tries to reuse a base from nearby instructions so that the access has a
280   /// 13-bit constant offset, which is then promoted to the immediate.
281   bool promoteConstantOffsetToImm(MachineInstr &CI,
282                                   MemInfoMap &Visited,
283                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
284   void addInstToMergeableList(const CombineInfo &CI,
285                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
286 
287   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
288       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
289       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
290       std::list<std::list<CombineInfo>> &MergeableInsts) const;
291 
292   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
293                                                      const CombineInfo &Paired);
294 
295   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
296                                           const CombineInfo &Paired);
297 
298 public:
299   static char ID;
300 
301   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
302     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
303   }
304 
305   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306                                      bool &OptimizeListAgain);
307   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
308 
309   bool runOnMachineFunction(MachineFunction &MF) override;
310 
311   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
312 
313   void getAnalysisUsage(AnalysisUsage &AU) const override {
314     AU.setPreservesCFG();
315     AU.addRequired<AAResultsWrapperPass>();
316 
317     MachineFunctionPass::getAnalysisUsage(AU);
318   }
319 
320   MachineFunctionProperties getRequiredProperties() const override {
321     return MachineFunctionProperties()
322       .set(MachineFunctionProperties::Property::IsSSA);
323   }
324 };
325 
326 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
327   const unsigned Opc = MI.getOpcode();
328 
329   if (TII.isMUBUF(Opc)) {
330     // FIXME: Handle d16 correctly
331     return AMDGPU::getMUBUFElements(Opc);
332   }
333   if (TII.isImage(MI)) {
334     uint64_t DMaskImm =
335         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336     return llvm::popcount(DMaskImm);
337   }
338   if (TII.isMTBUF(Opc)) {
339     return AMDGPU::getMTBUFElements(Opc);
340   }
341 
342   switch (Opc) {
343   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345   case AMDGPU::S_LOAD_DWORD_IMM:
346   case AMDGPU::GLOBAL_LOAD_DWORD:
347   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348   case AMDGPU::GLOBAL_STORE_DWORD:
349   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350   case AMDGPU::FLAT_LOAD_DWORD:
351   case AMDGPU::FLAT_STORE_DWORD:
352     return 1;
353   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355   case AMDGPU::S_LOAD_DWORDX2_IMM:
356   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357   case AMDGPU::GLOBAL_LOAD_DWORDX2:
358   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
359   case AMDGPU::GLOBAL_STORE_DWORDX2:
360   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
361   case AMDGPU::FLAT_LOAD_DWORDX2:
362   case AMDGPU::FLAT_STORE_DWORDX2:
363     return 2;
364   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
366   case AMDGPU::S_LOAD_DWORDX3_IMM:
367   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368   case AMDGPU::GLOBAL_LOAD_DWORDX3:
369   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
370   case AMDGPU::GLOBAL_STORE_DWORDX3:
371   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
372   case AMDGPU::FLAT_LOAD_DWORDX3:
373   case AMDGPU::FLAT_STORE_DWORDX3:
374     return 3;
375   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
377   case AMDGPU::S_LOAD_DWORDX4_IMM:
378   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379   case AMDGPU::GLOBAL_LOAD_DWORDX4:
380   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
381   case AMDGPU::GLOBAL_STORE_DWORDX4:
382   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
383   case AMDGPU::FLAT_LOAD_DWORDX4:
384   case AMDGPU::FLAT_STORE_DWORDX4:
385     return 4;
386   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
388   case AMDGPU::S_LOAD_DWORDX8_IMM:
389   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390     return 8;
391   case AMDGPU::DS_READ_B32:
392   case AMDGPU::DS_READ_B32_gfx9:
393   case AMDGPU::DS_WRITE_B32:
394   case AMDGPU::DS_WRITE_B32_gfx9:
395     return 1;
396   case AMDGPU::DS_READ_B64:
397   case AMDGPU::DS_READ_B64_gfx9:
398   case AMDGPU::DS_WRITE_B64:
399   case AMDGPU::DS_WRITE_B64_gfx9:
400     return 2;
401   default:
402     return 0;
403   }
404 }
405 
406 /// Maps instruction opcode to enum InstClassEnum.
407 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
408   switch (Opc) {
409   default:
410     if (TII.isMUBUF(Opc)) {
411       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
412       default:
413         return UNKNOWN;
414       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
415       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
416       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
417       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
418       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
419       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
420       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
421       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
422       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
423       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
424       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
425       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
426       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
427       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
428       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
429       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
430         return BUFFER_LOAD;
431       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
432       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
433       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
434       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
435       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
436       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
437       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
438       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
439       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
440       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
441       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
442       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
443       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
444       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
445       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
446       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
447         return BUFFER_STORE;
448       }
449     }
450     if (TII.isImage(Opc)) {
451       // Ignore instructions encoded without vaddr.
452       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
453           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
454         return UNKNOWN;
455       // Ignore BVH instructions
456       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
457         return UNKNOWN;
458       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
459       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
460           TII.isGather4(Opc))
461         return UNKNOWN;
462       return MIMG;
463     }
464     if (TII.isMTBUF(Opc)) {
465       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
466       default:
467         return UNKNOWN;
468       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
469       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
470       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
471       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
472       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
473       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
474       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
475       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
476       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
477       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
478       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
479       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
480       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
481       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
482       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
483       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
484         return TBUFFER_LOAD;
485       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
486       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
487       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
488       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
489       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
490       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
491       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
492       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
493         return TBUFFER_STORE;
494       }
495     }
496     return UNKNOWN;
497   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
498   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
502     return S_BUFFER_LOAD_IMM;
503   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
508     return S_BUFFER_LOAD_SGPR_IMM;
509   case AMDGPU::S_LOAD_DWORD_IMM:
510   case AMDGPU::S_LOAD_DWORDX2_IMM:
511   case AMDGPU::S_LOAD_DWORDX3_IMM:
512   case AMDGPU::S_LOAD_DWORDX4_IMM:
513   case AMDGPU::S_LOAD_DWORDX8_IMM:
514   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
518     return S_LOAD_IMM;
519   case AMDGPU::DS_READ_B32:
520   case AMDGPU::DS_READ_B32_gfx9:
521   case AMDGPU::DS_READ_B64:
522   case AMDGPU::DS_READ_B64_gfx9:
523     return DS_READ;
524   case AMDGPU::DS_WRITE_B32:
525   case AMDGPU::DS_WRITE_B32_gfx9:
526   case AMDGPU::DS_WRITE_B64:
527   case AMDGPU::DS_WRITE_B64_gfx9:
528     return DS_WRITE;
529   case AMDGPU::GLOBAL_LOAD_DWORD:
530   case AMDGPU::GLOBAL_LOAD_DWORDX2:
531   case AMDGPU::GLOBAL_LOAD_DWORDX3:
532   case AMDGPU::GLOBAL_LOAD_DWORDX4:
533   case AMDGPU::FLAT_LOAD_DWORD:
534   case AMDGPU::FLAT_LOAD_DWORDX2:
535   case AMDGPU::FLAT_LOAD_DWORDX3:
536   case AMDGPU::FLAT_LOAD_DWORDX4:
537     return FLAT_LOAD;
538   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
539   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
540   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
541   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
542     return GLOBAL_LOAD_SADDR;
543   case AMDGPU::GLOBAL_STORE_DWORD:
544   case AMDGPU::GLOBAL_STORE_DWORDX2:
545   case AMDGPU::GLOBAL_STORE_DWORDX3:
546   case AMDGPU::GLOBAL_STORE_DWORDX4:
547   case AMDGPU::FLAT_STORE_DWORD:
548   case AMDGPU::FLAT_STORE_DWORDX2:
549   case AMDGPU::FLAT_STORE_DWORDX3:
550   case AMDGPU::FLAT_STORE_DWORDX4:
551     return FLAT_STORE;
552   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
553   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
554   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
555   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
556     return GLOBAL_STORE_SADDR;
557   }
558 }
559 
560 /// Determines instruction subclass from opcode. Only instructions
561 /// of the same subclass can be merged together. The merged instruction may have
562 /// a different subclass but must have the same class.
563 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
564   switch (Opc) {
565   default:
566     if (TII.isMUBUF(Opc))
567       return AMDGPU::getMUBUFBaseOpcode(Opc);
568     if (TII.isImage(Opc)) {
569       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
570       assert(Info);
571       return Info->BaseOpcode;
572     }
573     if (TII.isMTBUF(Opc))
574       return AMDGPU::getMTBUFBaseOpcode(Opc);
575     return -1;
576   case AMDGPU::DS_READ_B32:
577   case AMDGPU::DS_READ_B32_gfx9:
578   case AMDGPU::DS_READ_B64:
579   case AMDGPU::DS_READ_B64_gfx9:
580   case AMDGPU::DS_WRITE_B32:
581   case AMDGPU::DS_WRITE_B32_gfx9:
582   case AMDGPU::DS_WRITE_B64:
583   case AMDGPU::DS_WRITE_B64_gfx9:
584     return Opc;
585   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
586   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
587   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
590     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
596     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597   case AMDGPU::S_LOAD_DWORD_IMM:
598   case AMDGPU::S_LOAD_DWORDX2_IMM:
599   case AMDGPU::S_LOAD_DWORDX3_IMM:
600   case AMDGPU::S_LOAD_DWORDX4_IMM:
601   case AMDGPU::S_LOAD_DWORDX8_IMM:
602   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
606     return AMDGPU::S_LOAD_DWORD_IMM;
607   case AMDGPU::GLOBAL_LOAD_DWORD:
608   case AMDGPU::GLOBAL_LOAD_DWORDX2:
609   case AMDGPU::GLOBAL_LOAD_DWORDX3:
610   case AMDGPU::GLOBAL_LOAD_DWORDX4:
611   case AMDGPU::FLAT_LOAD_DWORD:
612   case AMDGPU::FLAT_LOAD_DWORDX2:
613   case AMDGPU::FLAT_LOAD_DWORDX3:
614   case AMDGPU::FLAT_LOAD_DWORDX4:
615     return AMDGPU::FLAT_LOAD_DWORD;
616   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
617   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
618   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
619   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
620     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
621   case AMDGPU::GLOBAL_STORE_DWORD:
622   case AMDGPU::GLOBAL_STORE_DWORDX2:
623   case AMDGPU::GLOBAL_STORE_DWORDX3:
624   case AMDGPU::GLOBAL_STORE_DWORDX4:
625   case AMDGPU::FLAT_STORE_DWORD:
626   case AMDGPU::FLAT_STORE_DWORDX2:
627   case AMDGPU::FLAT_STORE_DWORDX3:
628   case AMDGPU::FLAT_STORE_DWORDX4:
629     return AMDGPU::FLAT_STORE_DWORD;
630   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
631   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
632   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
633   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
634     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
635   }
636 }
637 
638 // GLOBAL loads and stores are classified as FLAT initially. If both combined
639 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
640 // GLOBAL_STORE. If either instruction is a non-segment-specific FLAT, the
641 // resulting combined operation will be FLAT, potentially promoting one of the
642 // GLOBAL operations to FLAT.
643 // For other instructions, return the original class unmodified.
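// For example (assuming a one-dword + one-dword merge), two GLOBAL_LOAD_DWORDs
// merge into a GLOBAL_LOAD_DWORDX2, while a GLOBAL_LOAD_DWORD merged with a
// plain FLAT_LOAD_DWORD keeps the FLAT class and yields a FLAT_LOAD_DWORDX2.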
644 InstClassEnum
645 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
646                                          const CombineInfo &Paired) {
647   assert(CI.InstClass == Paired.InstClass);
648 
649   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
650       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
651     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
652 
653   return CI.InstClass;
654 }
655 
656 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
657   AddressRegs Result;
658 
659   if (TII.isMUBUF(Opc)) {
660     if (AMDGPU::getMUBUFHasVAddr(Opc))
661       Result.VAddr = true;
662     if (AMDGPU::getMUBUFHasSrsrc(Opc))
663       Result.SRsrc = true;
664     if (AMDGPU::getMUBUFHasSoffset(Opc))
665       Result.SOffset = true;
666 
667     return Result;
668   }
669 
670   if (TII.isImage(Opc)) {
671     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
672     if (VAddr0Idx >= 0) {
673       int RsrcName =
674           TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
675       int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
676       Result.NumVAddrs = RsrcIdx - VAddr0Idx;
677     } else {
678       Result.VAddr = true;
679     }
680     Result.SRsrc = true;
681     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
682     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
683       Result.SSamp = true;
684 
685     return Result;
686   }
687   if (TII.isMTBUF(Opc)) {
688     if (AMDGPU::getMTBUFHasVAddr(Opc))
689       Result.VAddr = true;
690     if (AMDGPU::getMTBUFHasSrsrc(Opc))
691       Result.SRsrc = true;
692     if (AMDGPU::getMTBUFHasSoffset(Opc))
693       Result.SOffset = true;
694 
695     return Result;
696   }
697 
698   switch (Opc) {
699   default:
700     return Result;
701   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
702   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
703   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
706     Result.SOffset = true;
707     [[fallthrough]];
708   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
713   case AMDGPU::S_LOAD_DWORD_IMM:
714   case AMDGPU::S_LOAD_DWORDX2_IMM:
715   case AMDGPU::S_LOAD_DWORDX3_IMM:
716   case AMDGPU::S_LOAD_DWORDX4_IMM:
717   case AMDGPU::S_LOAD_DWORDX8_IMM:
718   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
722     Result.SBase = true;
723     return Result;
724   case AMDGPU::DS_READ_B32:
725   case AMDGPU::DS_READ_B64:
726   case AMDGPU::DS_READ_B32_gfx9:
727   case AMDGPU::DS_READ_B64_gfx9:
728   case AMDGPU::DS_WRITE_B32:
729   case AMDGPU::DS_WRITE_B64:
730   case AMDGPU::DS_WRITE_B32_gfx9:
731   case AMDGPU::DS_WRITE_B64_gfx9:
732     Result.Addr = true;
733     return Result;
734   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
735   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
736   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
737   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
738   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
739   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
740   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
741   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
742     Result.SAddr = true;
743     [[fallthrough]];
744   case AMDGPU::GLOBAL_LOAD_DWORD:
745   case AMDGPU::GLOBAL_LOAD_DWORDX2:
746   case AMDGPU::GLOBAL_LOAD_DWORDX3:
747   case AMDGPU::GLOBAL_LOAD_DWORDX4:
748   case AMDGPU::GLOBAL_STORE_DWORD:
749   case AMDGPU::GLOBAL_STORE_DWORDX2:
750   case AMDGPU::GLOBAL_STORE_DWORDX3:
751   case AMDGPU::GLOBAL_STORE_DWORDX4:
752   case AMDGPU::FLAT_LOAD_DWORD:
753   case AMDGPU::FLAT_LOAD_DWORDX2:
754   case AMDGPU::FLAT_LOAD_DWORDX3:
755   case AMDGPU::FLAT_LOAD_DWORDX4:
756   case AMDGPU::FLAT_STORE_DWORD:
757   case AMDGPU::FLAT_STORE_DWORDX2:
758   case AMDGPU::FLAT_STORE_DWORDX3:
759   case AMDGPU::FLAT_STORE_DWORDX4:
760     Result.VAddr = true;
761     return Result;
762   }
763 }
764 
765 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
766                                               const SILoadStoreOptimizer &LSO) {
767   I = MI;
768   unsigned Opc = MI->getOpcode();
769   InstClass = getInstClass(Opc, *LSO.TII);
770 
771   if (InstClass == UNKNOWN)
772     return;
773 
774   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
775 
776   switch (InstClass) {
777   case DS_READ:
778     EltSize =
779           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
780                                                                           : 4;
781     break;
782   case DS_WRITE:
783     EltSize =
784           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
785                                                                             : 4;
786     break;
787   case S_BUFFER_LOAD_IMM:
788   case S_BUFFER_LOAD_SGPR_IMM:
789   case S_LOAD_IMM:
790     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
791     break;
792   default:
793     EltSize = 4;
794     break;
795   }
796 
797   if (InstClass == MIMG) {
798     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
799     // Offset is not considered for MIMG instructions.
800     Offset = 0;
801   } else {
802     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
803     Offset = I->getOperand(OffsetIdx).getImm();
804   }
805 
806   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
807     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
808 
809   Width = getOpcodeWidth(*I, *LSO.TII);
810 
811   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
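    // The DS offset operand is an unsigned 16-bit field; keep only those bits.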
812     Offset &= 0xffff;
813   } else if (InstClass != MIMG) {
814     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
815   }
816 
817   AddressRegs Regs = getRegs(Opc, *LSO.TII);
818   bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
819 
820   NumAddresses = 0;
821   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
822     AddrIdx[NumAddresses++] =
823         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
824   if (Regs.Addr)
825     AddrIdx[NumAddresses++] =
826         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
827   if (Regs.SBase)
828     AddrIdx[NumAddresses++] =
829         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
830   if (Regs.SRsrc)
831     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
832         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
833   if (Regs.SOffset)
834     AddrIdx[NumAddresses++] =
835         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
836   if (Regs.SAddr)
837     AddrIdx[NumAddresses++] =
838         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
839   if (Regs.VAddr)
840     AddrIdx[NumAddresses++] =
841         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
842   if (Regs.SSamp)
843     AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
844         Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
845   assert(NumAddresses <= MaxAddressRegs);
846 
847   for (unsigned J = 0; J < NumAddresses; J++)
848     AddrReg[J] = &I->getOperand(AddrIdx[J]);
849 }
850 
851 } // end anonymous namespace.
852 
853 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
854                       "SI Load Store Optimizer", false, false)
855 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
856 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
857                     false, false)
858 
859 char SILoadStoreOptimizer::ID = 0;
860 
861 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
862 
863 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
864   return new SILoadStoreOptimizer();
865 }
866 
867 static void addDefsUsesToList(const MachineInstr &MI,
868                               DenseSet<Register> &RegDefs,
869                               DenseSet<Register> &RegUses) {
870   for (const auto &Op : MI.operands()) {
871     if (!Op.isReg())
872       continue;
873     if (Op.isDef())
874       RegDefs.insert(Op.getReg());
875     if (Op.readsReg())
876       RegUses.insert(Op.getReg());
877   }
878 }
879 
880 bool SILoadStoreOptimizer::canSwapInstructions(
881     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
882     const MachineInstr &A, const MachineInstr &B) const {
883   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
884       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
885     return false;
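  // A and B may be reordered only if B neither reads nor writes a register
  // that A defines (ARegDefs) and does not define a register that A reads
  // (ARegUses).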
886   for (const auto &BOp : B.operands()) {
887     if (!BOp.isReg())
888       continue;
889     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
890       return false;
891     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
892       return false;
893   }
894   return true;
895 }
896 
897 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
898 // MMO for the combined operation with the combined access size.
899 MachineMemOperand *
900 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
901                                                const CombineInfo &Paired) {
902   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
903   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
904 
905   unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
906 
907   // The base pointer for the combined operation is the same as the leading
908   // operation's pointer.
909   if (Paired < CI)
910     std::swap(MMOa, MMOb);
911 
912   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
913   // If merging FLAT and GLOBAL set address space to FLAT.
914   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
915     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
916 
917   MachineFunction *MF = CI.I->getMF();
918   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
919 }
920 
921 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
922                                                const SIInstrInfo &TII,
923                                                const CombineInfo &Paired) {
924   assert(CI.InstClass == MIMG);
925 
926   // Ignore instructions with tfe/lwe set.
927   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
928   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
929 
930   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
931     return false;
932 
933   // Check other optional immediate operands for equality.
934   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
937 
938   for (auto op : OperandsToMatch) {
939     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
941       return false;
942     if (Idx != -1 &&
943         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
944       return false;
945   }
946 
947   // Check DMask for overlaps.
948   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
950 
951   if (!MaxMask)
952     return false;
953 
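  // The two dmasks are combinable only if every channel selected by MinMask
  // lies strictly below the lowest channel selected by MaxMask, e.g. dmasks
  // 0x3 and 0xc can be merged (into 0xf) while 0x3 and 0x6 overlap and cannot.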
954   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
955   if ((1u << AllowedBitsForMin) <= MinMask)
956     return false;
957 
958   return true;
959 }
960 
961 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962                                        unsigned ComponentCount,
963                                        const GCNSubtarget &STI) {
964   if (ComponentCount > 4)
965     return 0;
966 
967   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969   if (!OldFormatInfo)
970     return 0;
971 
972   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974                                            ComponentCount,
975                                            OldFormatInfo->NumFormat, STI);
976 
977   if (!NewFormatInfo)
978     return 0;
979 
980   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982 
983   return NewFormatInfo->Format;
984 }
985 
986 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
987 // highest power of two. Note that the result is well defined for all inputs
988 // including corner cases like:
989 // - if Lo == Hi, return that value
990 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
991 // - if Lo > Hi, return 0 (as if the range wrapped around)
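// For example, mostAlignedValueInRange(5, 12) returns 8 (the only multiple of
// 8 in the range), and mostAlignedValueInRange(17, 23) returns 20 (aligned to
// 4; no multiple of 8 lies within the range).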
992 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
993   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
994 }
995 
996 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
997                                                 const GCNSubtarget &STI,
998                                                 CombineInfo &Paired,
999                                                 bool Modify) {
1000   assert(CI.InstClass != MIMG);
1001 
1002   // XXX - Would the same offset be OK? Is there any reason this would happen or
1003   // be useful?
1004   if (CI.Offset == Paired.Offset)
1005     return false;
1006 
1007   // This won't be valid if the offset isn't aligned.
1008   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1009     return false;
1010 
1011   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1012 
1013     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1014         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1015     if (!Info0)
1016       return false;
1017     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1018         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1019     if (!Info1)
1020       return false;
1021 
1022     if (Info0->BitsPerComp != Info1->BitsPerComp ||
1023         Info0->NumFormat != Info1->NumFormat)
1024       return false;
1025 
1026     // TODO: Should be possible to support more formats, but if format loads
1027     // are not dword-aligned, the merged load might not be valid.
1028     if (Info0->BitsPerComp != 32)
1029       return false;
1030 
1031     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1032       return false;
1033   }
1034 
1035   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1036   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1037   CI.UseST64 = false;
1038   CI.BaseOff = 0;
1039 
1040   // Handle all non-DS instructions.
1041   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1042     if (EltOffset0 + CI.Width != EltOffset1 &&
1043             EltOffset1 + Paired.Width != EltOffset0)
1044       return false;
1045     if (CI.CPol != Paired.CPol)
1046       return false;
1047     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1048         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1049       // Reject cases like:
1050       //   dword + dwordx2 -> dwordx3
1051       //   dword + dwordx3 -> dwordx4
1052       // If we tried to combine these cases, we would fail to extract a subreg
1053       // for the result of the second load due to SGPR alignment requirements.
1054       if (CI.Width != Paired.Width &&
1055           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1056         return false;
1057     }
1058     return true;
1059   }
1060 
1061   // If the offsets in elements don't fit in 8 bits, we might be able to use
1062   // the stride-64 (ST64) versions.
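  // For example, two b32 accesses at byte offsets 0x0 and 0x2000 (element
  // offsets 0 and 2048) do not fit directly, but in units of 64 elements they
  // become offset0 = 0 and offset1 = 32, which the ST64 forms can encode.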
1063   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1064       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1065     if (Modify) {
1066       CI.Offset = EltOffset0 / 64;
1067       Paired.Offset = EltOffset1 / 64;
1068       CI.UseST64 = true;
1069     }
1070     return true;
1071   }
1072 
1073   // Check if the new offsets fit in the reduced 8-bit range.
1074   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1075     if (Modify) {
1076       CI.Offset = EltOffset0;
1077       Paired.Offset = EltOffset1;
1078     }
1079     return true;
1080   }
1081 
1082   // Try to shift base address to decrease offsets.
1083   uint32_t Min = std::min(EltOffset0, EltOffset1);
1084   uint32_t Max = std::max(EltOffset0, EltOffset1);
1085 
1086   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1087   if (((Max - Min) & ~Mask) == 0) {
1088     if (Modify) {
1089       // From the range of values we could use for BaseOff, choose the one that
1090       // is aligned to the highest power of two, to maximise the chance that
1091       // the same offset can be reused for other load/store pairs.
1092       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1093       // Copy the low bits of the offsets, so that when we adjust them by
1094       // subtracting BaseOff they will be multiples of 64.
1095       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1096       CI.BaseOff = BaseOff * CI.EltSize;
1097       CI.Offset = (EltOffset0 - BaseOff) / 64;
1098       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1099       CI.UseST64 = true;
1100     }
1101     return true;
1102   }
1103 
1104   if (isUInt<8>(Max - Min)) {
1105     if (Modify) {
1106       // From the range of values we could use for BaseOff, choose the one that
1107       // is aligned to the highest power of two, to maximise the chance that
1108       // the same offset can be reused for other load/store pairs.
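      // For example, b32 accesses at element offsets 257 and 265 fit neither
      // directly nor via ST64, but with BaseOff = 256 (1024 bytes) the new
      // offsets become 1 and 9, which fit in the 8-bit fields.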
1109       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1110       CI.BaseOff = BaseOff * CI.EltSize;
1111       CI.Offset = EltOffset0 - BaseOff;
1112       Paired.Offset = EltOffset1 - BaseOff;
1113     }
1114     return true;
1115   }
1116 
1117   return false;
1118 }
1119 
1120 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121                                      const CombineInfo &CI,
1122                                      const CombineInfo &Paired) {
1123   const unsigned Width = (CI.Width + Paired.Width);
1124   switch (CI.InstClass) {
1125   default:
1126     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1127   case S_BUFFER_LOAD_IMM:
1128   case S_BUFFER_LOAD_SGPR_IMM:
1129   case S_LOAD_IMM:
1130     switch (Width) {
1131     default:
1132       return false;
1133     case 2:
1134     case 4:
1135     case 8:
1136       return true;
1137     case 3:
1138       return STM.hasScalarDwordx3Loads();
1139     }
1140   }
1141 }
1142 
1143 const TargetRegisterClass *
1144 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147   }
1148   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149     return TRI->getRegClassForReg(*MRI, Src->getReg());
1150   }
1151   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152     return TRI->getRegClassForReg(*MRI, Src->getReg());
1153   }
1154   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156   }
1157   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158     return TRI->getRegClassForReg(*MRI, Src->getReg());
1159   }
1160   return nullptr;
1161 }
1162 
1163 /// This function assumes that CI comes before Paired in a basic block. Return
1164 /// an insertion point for the merged instruction or nullptr on failure.
1165 SILoadStoreOptimizer::CombineInfo *
1166 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1167                                            CombineInfo &Paired) {
1168   // If another instruction has already been merged into CI, it may now be a
1169   // type that we can't do any further merging into.
1170   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1171     return nullptr;
1172   assert(CI.InstClass == Paired.InstClass);
1173 
1174   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1175       getInstSubclass(Paired.I->getOpcode(), *TII))
1176     return nullptr;
1177 
1178   // Check both offsets (or masks for MIMG) can be combined and fit in the
1179   // reduced range.
1180   if (CI.InstClass == MIMG) {
1181     if (!dmasksCanBeCombined(CI, *TII, Paired))
1182       return nullptr;
1183   } else {
1184     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1185       return nullptr;
1186   }
1187 
1188   DenseSet<Register> RegDefs;
1189   DenseSet<Register> RegUses;
1190   CombineInfo *Where;
1191   if (CI.I->mayLoad()) {
1192     // Try to hoist Paired up to CI.
1193     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1194     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1195       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1196         return nullptr;
1197     }
1198     Where = &CI;
1199   } else {
1200     // Try to sink CI down to Paired.
1201     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1202     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1203       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1204         return nullptr;
1205     }
1206     Where = &Paired;
1207   }
1208 
1209   // Call offsetsCanBeCombined with modify = true so that the offsets are
1210   // correct for the new instruction.  This should return true, because
1211   // this function should only be called on CombineInfo objects that
1212   // have already been confirmed to be mergeable.
1213   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1214     offsetsCanBeCombined(CI, *STM, Paired, true);
1215   return Where;
1216 }
1217 
1218 // Copy the merged load result from DestReg to the original dest regs of CI and
1219 // Paired.
1220 void SILoadStoreOptimizer::copyToDestRegs(
1221     CombineInfo &CI, CombineInfo &Paired,
1222     MachineBasicBlock::iterator InsertBefore, int OpName,
1223     Register DestReg) const {
1224   MachineBasicBlock *MBB = CI.I->getParent();
1225   DebugLoc DL = CI.I->getDebugLoc();
1226 
1227   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1228 
1229   // Copy to the old destination registers.
1230   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1231   auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232   auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1233 
1234   // The constrained sload instructions in the S_LOAD_IMM class will have the
1235   // `early-clobber` flag set on the dst operand. Remove the flag before using
1236   // the MOs in copies.
1237   Dest0->setIsEarlyClobber(false);
1238   Dest1->setIsEarlyClobber(false);
1239 
1240   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1241       .add(*Dest0) // Copy to same destination including flags and sub reg.
1242       .addReg(DestReg, 0, SubRegIdx0);
1243   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1244       .add(*Dest1)
1245       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1246 }
1247 
1248 // Return a register for the source of the merged store after copying the
1249 // original source regs of CI and Paired into it.
1250 Register
1251 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252                                       MachineBasicBlock::iterator InsertBefore,
1253                                       int OpName) const {
1254   MachineBasicBlock *MBB = CI.I->getParent();
1255   DebugLoc DL = CI.I->getDebugLoc();
1256 
1257   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258 
1259   // Copy to the new source register.
1260   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262 
1263   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265 
1266   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267       .add(*Src0)
1268       .addImm(SubRegIdx0)
1269       .add(*Src1)
1270       .addImm(SubRegIdx1);
1271 
1272   return SrcReg;
1273 }
1274 
1275 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1276   if (STM->ldsRequiresM0Init())
1277     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1278   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1279 }
1280 
1281 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1282   if (STM->ldsRequiresM0Init())
1283     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1284 
1285   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1286                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1287 }
1288 
1289 MachineBasicBlock::iterator
1290 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1291                                      MachineBasicBlock::iterator InsertBefore) {
1292   MachineBasicBlock *MBB = CI.I->getParent();
1293 
1294   // Be careful, since the addresses could be subregisters themselves in weird
1295   // cases, like vectors of pointers.
1296   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1297 
1298   unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1299   unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1300   unsigned Opc =
1301       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1302 
1303   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1304          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1305 
1306   const MCInstrDesc &Read2Desc = TII->get(Opc);
1307 
1308   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1309   Register DestReg = MRI->createVirtualRegister(SuperRC);
1310 
1311   DebugLoc DL = CI.I->getDebugLoc();
1312 
1313   Register BaseReg = AddrReg->getReg();
1314   unsigned BaseSubReg = AddrReg->getSubReg();
1315   unsigned BaseRegFlags = 0;
1316   if (CI.BaseOff) {
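    // offsetsCanBeCombined chose a nonzero BaseOff; materialize it and add it
    // to the original address so the reduced offset0/offset1 fit in 8 bits.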
1317     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1318     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1319         .addImm(CI.BaseOff);
1320 
1321     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1322     BaseRegFlags = RegState::Kill;
1323 
1324     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1325         .addReg(ImmReg)
1326         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1327         .addImm(0); // clamp bit
1328     BaseSubReg = 0;
1329   }
1330 
1331   MachineInstrBuilder Read2 =
1332       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1333           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1334           .addImm(NewOffset0)                        // offset0
1335           .addImm(NewOffset1)                        // offset1
1336           .addImm(0)                                 // gds
1337           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1338 
1339   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1340 
1341   CI.I->eraseFromParent();
1342   Paired.I->eraseFromParent();
1343 
1344   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1345   return Read2;
1346 }
1347 
1348 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1349   if (STM->ldsRequiresM0Init())
1350     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1351   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1352                         : AMDGPU::DS_WRITE2_B64_gfx9;
1353 }
1354 
1355 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1356   if (STM->ldsRequiresM0Init())
1357     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1358                           : AMDGPU::DS_WRITE2ST64_B64;
1359 
1360   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1361                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1362 }
1363 
1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1365     CombineInfo &CI, CombineInfo &Paired,
1366     MachineBasicBlock::iterator InsertBefore) {
1367   MachineBasicBlock *MBB = CI.I->getParent();
1368 
1369   // Be sure to use .add(), and not .addReg(), with these. We want to be
1370   // sure we preserve the subregister index and any register flags set on them.
1371   const MachineOperand *AddrReg =
1372       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1373   const MachineOperand *Data0 =
1374       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1375   const MachineOperand *Data1 =
1376       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1377 
1378   unsigned NewOffset0 = CI.Offset;
1379   unsigned NewOffset1 = Paired.Offset;
1380   unsigned Opc =
1381       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1382 
1383   if (NewOffset0 > NewOffset1) {
1384     // Canonicalize the merged instruction so the smaller offset comes first.
1385     std::swap(NewOffset0, NewOffset1);
1386     std::swap(Data0, Data1);
1387   }
1388 
1389   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1390          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1391 
1392   const MCInstrDesc &Write2Desc = TII->get(Opc);
1393   DebugLoc DL = CI.I->getDebugLoc();
1394 
1395   Register BaseReg = AddrReg->getReg();
1396   unsigned BaseSubReg = AddrReg->getSubReg();
1397   unsigned BaseRegFlags = 0;
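       // Same base-offset handling as in mergeRead2Pair above.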
1398   if (CI.BaseOff) {
1399     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1400     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1401         .addImm(CI.BaseOff);
1402 
1403     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1404     BaseRegFlags = RegState::Kill;
1405 
1406     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1407         .addReg(ImmReg)
1408         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1409         .addImm(0); // clamp bit
1410     BaseSubReg = 0;
1411   }
1412 
1413   MachineInstrBuilder Write2 =
1414       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1415           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1416           .add(*Data0)                               // data0
1417           .add(*Data1)                               // data1
1418           .addImm(NewOffset0)                        // offset0
1419           .addImm(NewOffset1)                        // offset1
1420           .addImm(0)                                 // gds
1421           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1422 
1423   CI.I->eraseFromParent();
1424   Paired.I->eraseFromParent();
1425 
1426   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1427   return Write2;
1428 }
1429 
1430 MachineBasicBlock::iterator
1431 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1432                                      MachineBasicBlock::iterator InsertBefore) {
1433   MachineBasicBlock *MBB = CI.I->getParent();
1434   DebugLoc DL = CI.I->getDebugLoc();
1435   const unsigned Opcode = getNewOpcode(CI, Paired);
1436 
1437   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1438 
1439   Register DestReg = MRI->createVirtualRegister(SuperRC);
1440   unsigned MergedDMask = CI.DMask | Paired.DMask;
1441   unsigned DMaskIdx =
1442       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1443 
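       // Rebuild the image instruction with the merged opcode, using the union of
       // the two dmasks and copying all other operands from the first instruction.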
1444   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1445   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1446     if (I == DMaskIdx)
1447       MIB.addImm(MergedDMask);
1448     else
1449       MIB.add((*CI.I).getOperand(I));
1450   }
1451 
1452   // It shouldn't be possible to get this far if the two instructions
1453   // don't have a single memoperand, because MachineInstr::mayAlias()
1454   // will return true if this is the case.
1455   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1456 
1457   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1458 
1459   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1460 
1461   CI.I->eraseFromParent();
1462   Paired.I->eraseFromParent();
1463   return New;
1464 }
1465 
1466 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1467     CombineInfo &CI, CombineInfo &Paired,
1468     MachineBasicBlock::iterator InsertBefore) {
1469   MachineBasicBlock *MBB = CI.I->getParent();
1470   DebugLoc DL = CI.I->getDebugLoc();
1471   const unsigned Opcode = getNewOpcode(CI, Paired);
1472 
1473   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1474 
1475   Register DestReg = MRI->createVirtualRegister(SuperRC);
1476   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1477 
1478   // It shouldn't be possible to get this far if the two instructions
1479   // don't have a single memoperand, because MachineInstr::mayAlias()
1480   // will return true if this is the case.
1481   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1482 
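       // Build the merged scalar load. The SGPR_IMM form carries an extra soffset
       // operand, which is copied from the first instruction.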
1483   MachineInstrBuilder New =
1484       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1485           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1486   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1487     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1488   New.addImm(MergedOffset);
1489   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1490 
1491   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1492 
1493   CI.I->eraseFromParent();
1494   Paired.I->eraseFromParent();
1495   return New;
1496 }
1497 
1498 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1499     CombineInfo &CI, CombineInfo &Paired,
1500     MachineBasicBlock::iterator InsertBefore) {
1501   MachineBasicBlock *MBB = CI.I->getParent();
1502   DebugLoc DL = CI.I->getDebugLoc();
1503 
1504   const unsigned Opcode = getNewOpcode(CI, Paired);
1505 
1506   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1507 
1508   // Create the wide register that will hold the merged load's result.
1509   Register DestReg = MRI->createVirtualRegister(SuperRC);
1510   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1511 
1512   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1513 
1514   AddressRegs Regs = getRegs(Opcode, *TII);
1515 
1516   if (Regs.VAddr)
1517     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1518 
1519   // It shouldn't be possible to get this far if the two instructions
1520   // don't have a single memoperand, because MachineInstr::mayAlias()
1521   // will return true if this is the case.
1522   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1523 
1524   MachineInstr *New =
1525     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1526         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1527         .addImm(MergedOffset) // offset
1528         .addImm(CI.CPol)      // cpol
1529         .addImm(0)            // swz
1530         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1531 
1532   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1533 
1534   CI.I->eraseFromParent();
1535   Paired.I->eraseFromParent();
1536   return New;
1537 }
1538 
1539 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1540     CombineInfo &CI, CombineInfo &Paired,
1541     MachineBasicBlock::iterator InsertBefore) {
1542   MachineBasicBlock *MBB = CI.I->getParent();
1543   DebugLoc DL = CI.I->getDebugLoc();
1544 
1545   const unsigned Opcode = getNewOpcode(CI, Paired);
1546 
1547   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1548 
1549   // Create the wide register that will hold the merged load's result.
1550   Register DestReg = MRI->createVirtualRegister(SuperRC);
1551   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1552 
1553   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1554 
1555   AddressRegs Regs = getRegs(Opcode, *TII);
1556 
1557   if (Regs.VAddr)
1558     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559 
1560   unsigned JoinedFormat =
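       // Recompute the tbuffer format so its component count matches the combined
       // width of the merged access.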
1561       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562 
1563   // It shouldn't be possible to get this far if the two instructions
1564   // don't have a single memoperand, because MachineInstr::mayAlias()
1565   // will return true if this is the case.
1566   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567 
1568   MachineInstr *New =
1569       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571           .addImm(MergedOffset) // offset
1572           .addImm(JoinedFormat) // format
1573           .addImm(CI.CPol)      // cpol
1574           .addImm(0)            // swz
1575           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576 
1577   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1578 
1579   CI.I->eraseFromParent();
1580   Paired.I->eraseFromParent();
1581   return New;
1582 }
1583 
1584 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1585     CombineInfo &CI, CombineInfo &Paired,
1586     MachineBasicBlock::iterator InsertBefore) {
1587   MachineBasicBlock *MBB = CI.I->getParent();
1588   DebugLoc DL = CI.I->getDebugLoc();
1589 
1590   const unsigned Opcode = getNewOpcode(CI, Paired);
1591 
1592   Register SrcReg =
1593       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1594 
1595   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1596                  .addReg(SrcReg, RegState::Kill);
1597 
1598   AddressRegs Regs = getRegs(Opcode, *TII);
1599 
1600   if (Regs.VAddr)
1601     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1602 
1603   unsigned JoinedFormat =
1604       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1605 
1606   // It shouldn't be possible to get this far if the two instructions
1607   // don't have a single memoperand, because MachineInstr::mayAlias()
1608   // will return true if this is the case.
1609   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1610 
1611   MachineInstr *New =
1612       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1615           .addImm(JoinedFormat)                     // format
1616           .addImm(CI.CPol)                          // cpol
1617           .addImm(0)                                // swz
1618           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1619 
1620   CI.I->eraseFromParent();
1621   Paired.I->eraseFromParent();
1622   return New;
1623 }
1624 
1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1626     CombineInfo &CI, CombineInfo &Paired,
1627     MachineBasicBlock::iterator InsertBefore) {
1628   MachineBasicBlock *MBB = CI.I->getParent();
1629   DebugLoc DL = CI.I->getDebugLoc();
1630 
1631   const unsigned Opcode = getNewOpcode(CI, Paired);
1632 
1633   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634   Register DestReg = MRI->createVirtualRegister(SuperRC);
1635 
1636   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1637 
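       // The SADDR variants carry an extra scalar base-address operand; copy it
       // first when present.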
1638   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1639     MIB.add(*SAddr);
1640 
1641   MachineInstr *New =
1642     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1643        .addImm(std::min(CI.Offset, Paired.Offset))
1644        .addImm(CI.CPol)
1645        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1646 
1647   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1648 
1649   CI.I->eraseFromParent();
1650   Paired.I->eraseFromParent();
1651   return New;
1652 }
1653 
1654 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1655     CombineInfo &CI, CombineInfo &Paired,
1656     MachineBasicBlock::iterator InsertBefore) {
1657   MachineBasicBlock *MBB = CI.I->getParent();
1658   DebugLoc DL = CI.I->getDebugLoc();
1659 
1660   const unsigned Opcode = getNewOpcode(CI, Paired);
1661 
1662   Register SrcReg =
1663       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1664 
1665   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1666                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1667                  .addReg(SrcReg, RegState::Kill);
1668 
1669   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1670     MIB.add(*SAddr);
1671 
1672   MachineInstr *New =
1673     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1674        .addImm(CI.CPol)
1675        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676 
1677   CI.I->eraseFromParent();
1678   Paired.I->eraseFromParent();
1679   return New;
1680 }
1681 
1682 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683                                             const CombineInfo &Paired) {
1684   const unsigned Width = CI.Width + Paired.Width;
1685 
1686   switch (getCommonInstClass(CI, Paired)) {
1687   default:
1688     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1689     // FIXME: Handle d16 correctly
1690     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1691                                   Width);
1692   case TBUFFER_LOAD:
1693   case TBUFFER_STORE:
1694     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1695                                   Width);
1696 
1697   case UNKNOWN:
1698     llvm_unreachable("Unknown instruction class");
1699   case S_BUFFER_LOAD_IMM:
1700     switch (Width) {
1701     default:
1702       return 0;
1703     case 2:
1704       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1705     case 3:
1706       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1707     case 4:
1708       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709     case 8:
1710       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1711     }
1712   case S_BUFFER_LOAD_SGPR_IMM:
1713     switch (Width) {
1714     default:
1715       return 0;
1716     case 2:
1717       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1718     case 3:
1719       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720     case 4:
1721       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722     case 8:
1723       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724     }
1725   case S_LOAD_IMM: {
1726     // If XNACK is enabled, use the constrained opcodes when the first load is
1727     // under-aligned.
1728     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729     bool NeedsConstrainedOpc =
1730         STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1731     switch (Width) {
1732     default:
1733       return 0;
1734     case 2:
1735       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
1737     case 3:
1738       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1740     case 4:
1741       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1743     case 8:
1744       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1746     }
1747   }
1748   case GLOBAL_LOAD:
1749     switch (Width) {
1750     default:
1751       return 0;
1752     case 2:
1753       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1754     case 3:
1755       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1756     case 4:
1757       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1758     }
1759   case GLOBAL_LOAD_SADDR:
1760     switch (Width) {
1761     default:
1762       return 0;
1763     case 2:
1764       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1765     case 3:
1766       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1767     case 4:
1768       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1769     }
1770   case GLOBAL_STORE:
1771     switch (Width) {
1772     default:
1773       return 0;
1774     case 2:
1775       return AMDGPU::GLOBAL_STORE_DWORDX2;
1776     case 3:
1777       return AMDGPU::GLOBAL_STORE_DWORDX3;
1778     case 4:
1779       return AMDGPU::GLOBAL_STORE_DWORDX4;
1780     }
1781   case GLOBAL_STORE_SADDR:
1782     switch (Width) {
1783     default:
1784       return 0;
1785     case 2:
1786       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1787     case 3:
1788       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1789     case 4:
1790       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1791     }
1792   case FLAT_LOAD:
1793     switch (Width) {
1794     default:
1795       return 0;
1796     case 2:
1797       return AMDGPU::FLAT_LOAD_DWORDX2;
1798     case 3:
1799       return AMDGPU::FLAT_LOAD_DWORDX3;
1800     case 4:
1801       return AMDGPU::FLAT_LOAD_DWORDX4;
1802     }
1803   case FLAT_STORE:
1804     switch (Width) {
1805     default:
1806       return 0;
1807     case 2:
1808       return AMDGPU::FLAT_STORE_DWORDX2;
1809     case 3:
1810       return AMDGPU::FLAT_STORE_DWORDX3;
1811     case 4:
1812       return AMDGPU::FLAT_STORE_DWORDX4;
1813     }
1814   case MIMG:
1815     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1816            "No overlaps");
1817     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1818   }
1819 }
1820 
1821 std::pair<unsigned, unsigned>
1822 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823                                     const CombineInfo &Paired) {
1824   assert((CI.InstClass != MIMG ||
1825           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1826            CI.Width + Paired.Width)) &&
1827          "No overlaps");
1828 
1829   unsigned Idx0;
1830   unsigned Idx1;
1831 
1832   static const unsigned Idxs[5][4] = {
1833       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1834       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1835       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1836       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1837       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1838   };
1839 
1840   assert(CI.Width >= 1 && CI.Width <= 4);
1841   assert(Paired.Width >= 1 && Paired.Width <= 4);
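       // Worked example: with !(Paired < CI), CI.Width == 2 and Paired.Width == 1,
       // CI maps to sub0_sub1 (Idxs[0][1]) and Paired to sub2 (Idxs[2][0]).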
1842 
1843   if (Paired < CI) {
1844     Idx1 = Idxs[0][Paired.Width - 1];
1845     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1846   } else {
1847     Idx0 = Idxs[0][CI.Width - 1];
1848     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1849   }
1850 
1851   return {Idx0, Idx1};
1852 }
1853 
1854 const TargetRegisterClass *
1855 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856                                              const CombineInfo &Paired) const {
1857   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859     switch (CI.Width + Paired.Width) {
1860     default:
1861       return nullptr;
1862     case 2:
1863       return &AMDGPU::SReg_64_XEXECRegClass;
1864     case 3:
1865       return &AMDGPU::SGPR_96RegClass;
1866     case 4:
1867       return &AMDGPU::SGPR_128RegClass;
1868     case 8:
1869       return &AMDGPU::SGPR_256RegClass;
1870     case 16:
1871       return &AMDGPU::SGPR_512RegClass;
1872     }
1873   }
1874 
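       // Non-SMEM merges use a VGPR class (or an AGPR class when the original data
       // operands live in AGPRs) wide enough for the combined result.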
1875   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1876   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877              ? TRI->getAGPRClassForBitWidth(BitWidth)
1878              : TRI->getVGPRClassForBitWidth(BitWidth);
1879 }
1880 
1881 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1882     CombineInfo &CI, CombineInfo &Paired,
1883     MachineBasicBlock::iterator InsertBefore) {
1884   MachineBasicBlock *MBB = CI.I->getParent();
1885   DebugLoc DL = CI.I->getDebugLoc();
1886 
1887   const unsigned Opcode = getNewOpcode(CI, Paired);
1888 
1889   Register SrcReg =
1890       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1891 
1892   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1893                  .addReg(SrcReg, RegState::Kill);
1894 
1895   AddressRegs Regs = getRegs(Opcode, *TII);
1896 
1897   if (Regs.VAddr)
1898     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1899 
1900 
1901   // It shouldn't be possible to get this far if the two instructions
1902   // don't have a single memoperand, because MachineInstr::mayAlias()
1903   // will return true if this is the case.
1904   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1905 
1906   MachineInstr *New =
1907     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1908         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1910         .addImm(CI.CPol)      // cpol
1911         .addImm(0)            // swz
1912         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1913 
1914   CI.I->eraseFromParent();
1915   Paired.I->eraseFromParent();
1916   return New;
1917 }
1918 
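     // Return Val as an immediate operand if it is encodable as an inline
     // constant; otherwise materialize it into a 32-bit SGPR with S_MOV_B32 and
     // return that register.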
1919 MachineOperand
1920 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1921   APInt V(32, Val, true);
1922   if (TII->isInlineConstant(V))
1923     return MachineOperand::CreateImm(Val);
1924 
1925   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1926   MachineInstr *Mov =
1927   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1928           TII->get(AMDGPU::S_MOV_B32), Reg)
1929     .addImm(Val);
1930   (void)Mov;
1931   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1932   return MachineOperand::CreateReg(Reg, false);
1933 }
1934 
1935 // Compute base address using Addr and return the final register.
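     // The 64-bit add is expanded into V_ADD_CO_U32 for the low half (producing a
     // carry) and V_ADDC_U32 for the high half (consuming it); the two halves are
     // then combined with a REG_SEQUENCE.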
1936 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1937                                            const MemAddress &Addr) const {
1938   MachineBasicBlock *MBB = MI.getParent();
1939   MachineBasicBlock::iterator MBBI = MI.getIterator();
1940   DebugLoc DL = MI.getDebugLoc();
1941 
1942   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1943           Addr.Base.LoSubReg) &&
1944          "Expected 32-bit Base-Register-Low!!");
1945 
1946   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1947           Addr.Base.HiSubReg) &&
1948          "Expected 32-bit Base-Register-Hi!!");
1949 
1950   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1951   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1952   MachineOperand OffsetHi =
1953     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1954 
1955   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1956   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1957   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1958 
1959   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1960   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1961   MachineInstr *LoHalf =
1962     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1963       .addReg(CarryReg, RegState::Define)
1964       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1965       .add(OffsetLo)
1966       .addImm(0); // clamp bit
1967   (void)LoHalf;
1968   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1969 
1970   MachineInstr *HiHalf =
1971   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1972     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1973     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1974     .add(OffsetHi)
1975     .addReg(CarryReg, RegState::Kill)
1976     .addImm(0); // clamp bit
1977   (void)HiHalf;
1978   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1979 
1980   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1981   MachineInstr *FullBase =
1982     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1983       .addReg(DestSub0)
1984       .addImm(AMDGPU::sub0)
1985       .addReg(DestSub1)
1986       .addImm(AMDGPU::sub1);
1987   (void)FullBase;
1988   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1989 
1990   return FullDestReg;
1991 }
1992 
1993 // Update MI's base register and immediate offset to NewBase and NewOffset.
1994 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1995                                                Register NewBase,
1996                                                int32_t NewOffset) const {
1997   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998   Base->setReg(NewBase);
1999   Base->setIsKill(false);
2000   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2001 }
2002 
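     // Return the constant offset when Op is either an immediate or a register
     // defined by an S_MOV_B32 of an immediate; otherwise return std::nullopt.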
2003 std::optional<int32_t>
2004 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2005   if (Op.isImm())
2006     return Op.getImm();
2007 
2008   if (!Op.isReg())
2009     return std::nullopt;
2010 
2011   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2012   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013       !Def->getOperand(1).isImm())
2014     return std::nullopt;
2015 
2016   return Def->getOperand(1).getImm();
2017 }
2018 
2019 // Analyze Base and extract:
2020 //  - 32-bit base registers and subregisters
2021 //  - a 64-bit constant offset
2022 // Expecting base computation as:
2023 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2024 //   %LO:vgpr_32, %c:sreg_64_xexec =
2025 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2026 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2027 //   %Base:vreg_64 =
2028 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2029 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2030                                                       MemAddress &Addr) const {
2031   if (!Base.isReg())
2032     return;
2033 
2034   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2035   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2036       || Def->getNumOperands() != 5)
2037     return;
2038 
2039   MachineOperand BaseLo = Def->getOperand(1);
2040   MachineOperand BaseHi = Def->getOperand(3);
2041   if (!BaseLo.isReg() || !BaseHi.isReg())
2042     return;
2043 
2044   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2045   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2046 
2047   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2048       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2049     return;
2050 
2051   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2052   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2053 
2054   auto Offset0P = extractConstOffset(*Src0);
2055   if (Offset0P)
2056     BaseLo = *Src1;
2057   else {
2058     if (!(Offset0P = extractConstOffset(*Src1)))
2059       return;
2060     BaseLo = *Src0;
2061   }
2062 
2063   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2064   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2065 
2066   if (Src0->isImm())
2067     std::swap(Src0, Src1);
2068 
2069   if (!Src1->isImm() || Src0->isImm())
2070     return;
2071 
2072   uint64_t Offset1 = Src1->getImm();
2073   BaseHi = *Src0;
2074 
2075   Addr.Base.LoReg = BaseLo.getReg();
2076   Addr.Base.HiReg = BaseHi.getReg();
2077   Addr.Base.LoSubReg = BaseLo.getSubReg();
2078   Addr.Base.HiSubReg = BaseHi.getSubReg();
2079   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2080 }
2081 
2082 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2083     MachineInstr &MI,
2084     MemInfoMap &Visited,
2085     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2086 
2087   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2088     return false;
2089 
2090   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091   if (SIInstrInfo::isFLATScratch(MI))
2092     return false;
2093 
2094   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095                                               : AMDGPUAS::FLAT_ADDRESS;
2096 
2097   if (AnchorList.count(&MI))
2098     return false;
2099 
2100   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2101 
2102   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2103     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2104     return false;
2105   }
2106 
2107   // Step1: Find the base-registers and a 64bit constant offset.
2108   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2109   MemAddress MAddr;
2110   if (!Visited.contains(&MI)) {
2111     processBaseWithConstOffset(Base, MAddr);
2112     Visited[&MI] = MAddr;
2113   } else
2114     MAddr = Visited[&MI];
2115 
2116   if (MAddr.Offset == 0) {
2117     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2118                          " constant offsets that can be promoted.\n";);
2119     return false;
2120   }
2121 
2122   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2123              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2124 
2125   // Step2: Traverse MI's basic block and find an anchor (with the same base
2126   // registers) at the largest offset distance from MI that still fits 13 bits.
2127   // E.g. (64bit loads)
2128   // bb:
2129   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2130   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2131   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2132   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2133   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2134   //
2135   // Starting from the first load, the optimization tries to find a new base
2136   // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2137   // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
2138   // new base (anchor) because the maximum distance can presumably accommodate
2139   // more intermediate bases.
2140   //
2141   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2142   // (&a + 8192) for load1, load2, load4.
2143   //   addr = &a + 8192
2144   //   load1 = load(addr,       -4096)
2145   //   load2 = load(addr,       -2048)
2146   //   load3 = load(addr,       0)
2147   //   load4 = load(addr,       2048)
2148   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2149   //
2150   MachineInstr *AnchorInst = nullptr;
2151   MemAddress AnchorAddr;
2152   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2153   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2154 
2155   MachineBasicBlock *MBB = MI.getParent();
2156   MachineBasicBlock::iterator E = MBB->end();
2157   MachineBasicBlock::iterator MBBI = MI.getIterator();
2158   ++MBBI;
2159   const SITargetLowering *TLI =
2160     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2161 
2162   for ( ; MBBI != E; ++MBBI) {
2163     MachineInstr &MINext = *MBBI;
2164     // TODO: Support finding an anchor(with same base) from store addresses or
2165     // any other load addresses where the opcodes are different.
2166     if (MINext.getOpcode() != MI.getOpcode() ||
2167         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2168       continue;
2169 
2170     const MachineOperand &BaseNext =
2171       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2172     MemAddress MAddrNext;
2173     if (!Visited.contains(&MINext)) {
2174       processBaseWithConstOffset(BaseNext, MAddrNext);
2175       Visited[&MINext] = MAddrNext;
2176     } else
2177       MAddrNext = Visited[&MINext];
2178 
2179     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2180         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2181         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2182         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2183       continue;
2184 
2185     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2186 
2187     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2188     TargetLoweringBase::AddrMode AM;
2189     AM.HasBaseReg = true;
2190     AM.BaseOffs = Dist;
2191     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2192         (uint32_t)std::abs(Dist) > MaxDist) {
2193       MaxDist = std::abs(Dist);
2194 
2195       AnchorAddr = MAddrNext;
2196       AnchorInst = &MINext;
2197     }
2198   }
2199 
2200   if (AnchorInst) {
2201     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2202                AnchorInst->dump());
2203     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2204                <<  AnchorAddr.Offset << "\n\n");
2205 
2206     // Instead of moving up, just re-compute anchor-instruction's base address.
2207     Register Base = computeBase(MI, AnchorAddr);
2208 
2209     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2210     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2211 
2212     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2213       TargetLoweringBase::AddrMode AM;
2214       AM.HasBaseReg = true;
2215       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2216 
2217       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2219                    OtherMI->dump());
2220         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2222       }
2223     }
2224     AnchorList.insert(AnchorInst);
2225     return true;
2226   }
2227 
2228   return false;
2229 }
2230 
2231 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2232                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2233   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234     if (AddrList.front().InstClass == CI.InstClass &&
2235         AddrList.front().IsAGPR == CI.IsAGPR &&
2236         AddrList.front().hasSameBaseAddress(CI)) {
2237       AddrList.emplace_back(CI);
2238       return;
2239     }
2240   }
2241 
2242   // Base address not found, so add a new list.
2243   MergeableInsts.emplace_back(1, CI);
2244 }
2245 
2246 std::pair<MachineBasicBlock::iterator, bool>
2247 SILoadStoreOptimizer::collectMergeableInsts(
2248     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2249     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2250     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2251   bool Modified = false;
2252 
2253   // Sort potentially mergeable instructions into lists, one list per base address.
2254   unsigned Order = 0;
2255   MachineBasicBlock::iterator BlockI = Begin;
2256   for (; BlockI != End; ++BlockI) {
2257     MachineInstr &MI = *BlockI;
2258 
2259     // We run this before checking if an address is mergeable, because it can produce
2260     // better code even if the instructions aren't mergeable.
2261     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2262       Modified = true;
2263 
2264     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2265     // barriers. Instructions past the barrier are considered for separate merges.
2266     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2267       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2268 
2269       // Search will resume after this instruction in a separate merge list.
2270       ++BlockI;
2271       break;
2272     }
2273 
2274     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2275     if (InstClass == UNKNOWN)
2276       continue;
2277 
2278     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2279     int Swizzled =
2280         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2281     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2282       continue;
2283 
2284     CombineInfo CI;
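         // Build a CombineInfo describing this access. Order records its original
         // position in the searched range and is later used to order the two
         // halves of a merged pair.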
2285     CI.setMI(MI, *this);
2286     CI.Order = Order++;
2287 
2288     if (!CI.hasMergeableAddress(*MRI))
2289       continue;
2290 
2291     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2292       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2293       //        operands. However, we report that ds_write2 shall have only
2294       //        VGPR data so that machine copy propagation does not create an
2295       //        illegal instruction with mixed VGPR and AGPR sources.
2296       //        Consequently, if we created such an instruction the verifier
2297       //        would complain.
2298       continue;
2299     }
2300 
2301     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2302 
2303     addInstToMergeableList(CI, MergeableInsts);
2304   }
2305 
2306   // At this point we have lists of Mergeable instructions.
2307   //
2308   // Part 2: Sort lists by offset and then for each CombineInfo object in the
2309   // list try to find an instruction that can be merged with I.  If an instruction
2310   // is found, it is stored in the Paired field.  If no instructions are found, then
2311   // the CombineInfo object is deleted from the list.
2312 
2313   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2314                                                    E = MergeableInsts.end(); I != E;) {
2315 
2316     std::list<CombineInfo> &MergeList = *I;
2317     if (MergeList.size() <= 1) {
2318       // This means we have found only one instruction with a given address
2319       // that can be merged, and we need at least 2 instructions to do a merge,
2320       // so this list can be discarded.
2321       I = MergeableInsts.erase(I);
2322       continue;
2323     }
2324 
2325     // Sort the lists by offsets, this way mergeable instructions will be
2326     // adjacent to each other in the list, which will make it easier to find
2327     // matches.
2328     MergeList.sort(
2329         [] (const CombineInfo &A, const CombineInfo &B) {
2330           return A.Offset < B.Offset;
2331         });
2332     ++I;
2333   }
2334 
2335   return {BlockI, Modified};
2336 }
2337 
2338 // Scan through looking for adjacent LDS operations with constant offsets from
2339 // the same base register. We rely on the scheduler to do the hard work of
2340 // clustering nearby loads, and assume these are all adjacent.
2341 bool SILoadStoreOptimizer::optimizeBlock(
2342                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2343   bool Modified = false;
2344 
2345   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2346                                                    E = MergeableInsts.end(); I != E;) {
2347     std::list<CombineInfo> &MergeList = *I;
2348 
2349     bool OptimizeListAgain = false;
2350     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2351       // We weren't able to make any changes, so delete the list so we don't
2352       // process the same instructions the next time we try to optimize this
2353       // block.
2354       I = MergeableInsts.erase(I);
2355       continue;
2356     }
2357 
2358     Modified = true;
2359 
2360     // We made changes, but also determined that there were no more optimization
2361     // opportunities, so we don't need to reprocess the list
2362     if (!OptimizeListAgain) {
2363       I = MergeableInsts.erase(I);
2364       continue;
2365     }
2366     OptimizeAgain = true;
2367   }
2368   return Modified;
2369 }
2370 
2371 bool
2372 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2373                                           std::list<CombineInfo> &MergeList,
2374                                           bool &OptimizeListAgain) {
2375   if (MergeList.empty())
2376     return false;
2377 
2378   bool Modified = false;
2379 
2380   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2381        Next = std::next(I)) {
2382 
2383     auto First = I;
2384     auto Second = Next;
2385 
2386     if ((*First).Order > (*Second).Order)
2387       std::swap(First, Second);
2388     CombineInfo &CI = *First;
2389     CombineInfo &Paired = *Second;
2390 
2391     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2392     if (!Where) {
2393       ++I;
2394       continue;
2395     }
2396 
2397     Modified = true;
2398 
2399     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2400 
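         // Perform the merge at the position recorded by checkAndPrepareMerge. If
         // the combined width is still below the re-merge threshold used here
         // (8 dwords for the scalar loads, 4 for the rest), request another pass
         // so the new, wider access can be merged again.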
2401     MachineBasicBlock::iterator NewMI;
2402     switch (CI.InstClass) {
2403     default:
2404       llvm_unreachable("unknown InstClass");
2405       break;
2406     case DS_READ:
2407       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2408       break;
2409     case DS_WRITE:
2410       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2411       break;
2412     case S_BUFFER_LOAD_IMM:
2413     case S_BUFFER_LOAD_SGPR_IMM:
2414     case S_LOAD_IMM:
2415       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2416       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2417       break;
2418     case BUFFER_LOAD:
2419       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2420       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2421       break;
2422     case BUFFER_STORE:
2423       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2424       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2425       break;
2426     case MIMG:
2427       NewMI = mergeImagePair(CI, Paired, Where->I);
2428       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2429       break;
2430     case TBUFFER_LOAD:
2431       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2432       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2433       break;
2434     case TBUFFER_STORE:
2435       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2436       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2437       break;
2438     case FLAT_LOAD:
2439     case GLOBAL_LOAD:
2440     case GLOBAL_LOAD_SADDR:
2441       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2442       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2443       break;
2444     case FLAT_STORE:
2445     case GLOBAL_STORE:
2446     case GLOBAL_STORE_SADDR:
2447       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2448       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2449       break;
2450     }
2451     CI.setMI(NewMI, *this);
2452     CI.Order = Where->Order;
2453     if (I == Second)
2454       I = Next;
2455 
2456     MergeList.erase(Second);
2457   }
2458 
2459   return Modified;
2460 }
2461 
2462 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2463   if (skipFunction(MF.getFunction()))
2464     return false;
2465 
2466   STM = &MF.getSubtarget<GCNSubtarget>();
2467   if (!STM->loadStoreOptEnabled())
2468     return false;
2469 
2470   TII = STM->getInstrInfo();
2471   TRI = &TII->getRegisterInfo();
2472 
2473   MRI = &MF.getRegInfo();
2474   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2475 
2476   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2477 
2478   bool Modified = false;
2479 
2480   // Contains the list of instructions for which constant offsets are being
2481   // promoted to the immediate. This is tracked for an entire block at a time.
2482   SmallPtrSet<MachineInstr *, 4> AnchorList;
2483   MemInfoMap Visited;
2484 
2485   for (MachineBasicBlock &MBB : MF) {
2486     MachineBasicBlock::iterator SectionEnd;
2487     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2488          I = SectionEnd) {
2489       bool CollectModified;
2490       std::list<std::list<CombineInfo>> MergeableInsts;
2491 
2492       // First pass: Collect list of all instructions we know how to merge in a
2493       // subset of the block.
2494       std::tie(SectionEnd, CollectModified) =
2495           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2496 
2497       Modified |= CollectModified;
2498 
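           // Second pass: repeatedly merge within the collected lists until no
           // new merge opportunities are found; each merge may enable another.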
2499       do {
2500         OptimizeAgain = false;
2501         Modified |= optimizeBlock(MergeableInsts);
2502       } while (OptimizeAgain);
2503     }
2504 
2505     Visited.clear();
2506     AnchorList.clear();
2507   }
2508 
2509   return Modified;
2510 }
2511