//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset, which is then promoted to the
// immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because the load of the
//   constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

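    // Return true if \p CI addresses memory through exactly the same base
    // operands as this CombineInfo: the same immediates, or the same
    // registers with the same subregister indices.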
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order (DMask for MIMG, immediate offset otherwise).
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore,
                      AMDGPU::OpName OpName, Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           AMDGPU::OpName OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

public:
  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
  bool run(MachineFunction &MF);
};

class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

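// Determine which of the named address operands (vaddr, sbase, srsrc, etc.)
// the given opcode carries; setMI uses this to record their operand indices.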
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      AMDGPU::OpName RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
  return new SILoadStoreOptimizerLegacy();
}

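// Collect every register that \p MI defines into \p RegDefs and every
// register that it reads into \p RegUses.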
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

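// Return true if it is safe to move \p A past \p B: the two must not access
// aliasing memory when either one stores, \p B must not define or read a
// register that \p A defines, and \p B must not define a register that \p A
// reads. \p ARegDefs and \p ARegUses cache the registers defined and read by
// \p A.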
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a
// new MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16,  AMDGPU::OpName::unorm,
      AMDGPU::OpName::da,   AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
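  // The two DMasks are compatible when every bit of the smaller mask lies
  // below the lowest set bit of the larger one. E.g. DMasks 0b0011 and 0b1100
  // can be combined, while 0b0101 and 0b1010 cannot.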
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
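// A few hand-worked examples of the bit trick below (a sketch, not
// exhaustive):
//   mostAlignedValueInRange(19, 34) == 32  // 32 is the most aligned in range
//   mostAlignedValueInRange(7, 7)   == 7   // Lo == Hi
//   mostAlignedValueInRange(10, 3)  == 0   // Lo > Hi wraps to 0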
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
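  // E.g. two ds_read_b32 at byte offsets 0 and 8192 have element offsets 0 and
  // 2048; 2048 does not fit in 8 bits, but both offsets are multiples of 64,
  // so the pair can merge into a ds_read2st64_b32 with offsets 0 and 32.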
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
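      // E.g. for element offsets 100 and 300 (Max - Min = 200 fits in 8 bits),
      // BaseOff = mostAlignedValueInRange(300 - 255, 100) = 64, giving rebased
      // offsets 36 and 236 (values worked by hand from the formula above).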
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.  This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      AMDGPU::OpName OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
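  // If offsetsCanBeCombined factored out a common base offset, rebase the
  // address: materialize BaseOff in an SGPR and add it to the original
  // address with a no-carry VALU add.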
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

1405   // Be sure to use .add(), and not .addReg(), with these. We want to be
1406   // sure we preserve the subregister index and any register flags set on them.
1407   const MachineOperand *AddrReg =
1408       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1409   const MachineOperand *Data0 =
1410       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1411   const MachineOperand *Data1 =
1412       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1413 
1414   unsigned NewOffset0 = CI.Offset;
1415   unsigned NewOffset1 = Paired.Offset;
1416   unsigned Opc =
1417       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1418 
1419   if (NewOffset0 > NewOffset1) {
1420     // Canonicalize the merged instruction so the smaller offset comes first.
1421     std::swap(NewOffset0, NewOffset1);
1422     std::swap(Data0, Data1);
1423   }
1424 
1425   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1426          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1427 
1428   const MCInstrDesc &Write2Desc = TII->get(Opc);
1429   DebugLoc DL = CI.I->getDebugLoc();
1430 
1431   Register BaseReg = AddrReg->getReg();
1432   unsigned BaseSubReg = AddrReg->getSubReg();
1433   unsigned BaseRegFlags = 0;
1434   if (CI.BaseOff) {
1435     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1436     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1437         .addImm(CI.BaseOff);
1438 
1439     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1440     BaseRegFlags = RegState::Kill;
1441 
1442     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1443         .addReg(ImmReg)
1444         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1445         .addImm(0); // clamp bit
1446     BaseSubReg = 0;
1447   }
1448 
1449   MachineInstrBuilder Write2 =
1450       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1451           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1452           .add(*Data0)                               // data0
1453           .add(*Data1)                               // data1
1454           .addImm(NewOffset0)                        // offset0
1455           .addImm(NewOffset1)                        // offset1
1456           .addImm(0)                                 // gds
1457           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1458 
1459   CI.I->eraseFromParent();
1460   Paired.I->eraseFromParent();
1461 
1462   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1463   return Write2;
1464 }
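
// E.g. (a sketch; offset0/offset1 are in units of EltSize, here 4 bytes):
//   ds_write_b32 v2, v0 offset:16
//   ds_write_b32 v2, v1 offset:24
// ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:6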
1465 
1466 MachineBasicBlock::iterator
1467 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1468                                      MachineBasicBlock::iterator InsertBefore) {
1469   MachineBasicBlock *MBB = CI.I->getParent();
1470   DebugLoc DL = CI.I->getDebugLoc();
1471   const unsigned Opcode = getNewOpcode(CI, Paired);
1472 
1473   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1474 
1475   Register DestReg = MRI->createVirtualRegister(SuperRC);
1476   unsigned MergedDMask = CI.DMask | Paired.DMask;
1477   unsigned DMaskIdx =
1478       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1479 
1480   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1481   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1482     if (I == DMaskIdx)
1483       MIB.addImm(MergedDMask);
1484     else
1485       MIB.add((*CI.I).getOperand(I));
1486   }
1487 
1488   // It shouldn't be possible to get this far if the two instructions
1489   // don't have a single memoperand, because MachineInstr::mayAlias()
1490   // will return true if this is the case.
1491   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1492 
1493   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1494 
1495   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1496 
1497   CI.I->eraseFromParent();
1498   Paired.I->eraseFromParent();
1499   return New;
1500 }
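
// E.g. (a sketch) merging two image_load instructions with dmask:0x1 and
// dmask:0x2 produces a single image_load with dmask:0x3, whose two enabled
// components are split back out via copyToDestRegs().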
1501 
1502 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1503     CombineInfo &CI, CombineInfo &Paired,
1504     MachineBasicBlock::iterator InsertBefore) {
1505   MachineBasicBlock *MBB = CI.I->getParent();
1506   DebugLoc DL = CI.I->getDebugLoc();
1507   const unsigned Opcode = getNewOpcode(CI, Paired);
1508 
1509   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1510 
1511   Register DestReg = MRI->createVirtualRegister(SuperRC);
1512   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1513 
1514   // It shouldn't be possible to get this far if the two instructions
1515   // don't have a single memoperand, because MachineInstr::mayAlias()
1516   // will return true if this is the case.
1517   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1518 
1519   MachineInstrBuilder New =
1520       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1521           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1522   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1523     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1524   New.addImm(MergedOffset);
1525   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1526 
1527   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1528 
1529   CI.I->eraseFromParent();
1530   Paired.I->eraseFromParent();
1531   return New;
1532 }
1533 
1534 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1535     CombineInfo &CI, CombineInfo &Paired,
1536     MachineBasicBlock::iterator InsertBefore) {
1537   MachineBasicBlock *MBB = CI.I->getParent();
1538   DebugLoc DL = CI.I->getDebugLoc();
1539 
1540   const unsigned Opcode = getNewOpcode(CI, Paired);
1541 
1542   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1543 
1544   // Create the destination register for the merged load.
1545   Register DestReg = MRI->createVirtualRegister(SuperRC);
1546   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1547 
1548   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1549 
1550   AddressRegs Regs = getRegs(Opcode, *TII);
1551 
1552   if (Regs.VAddr)
1553     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1554 
1555   // It shouldn't be possible to get this far if the two instructions
1556   // don't have a single memoperand, because MachineInstr::mayAlias()
1557   // will return true if this is the case.
1558   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1559 
1560   MachineInstr *New =
1561     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1562         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1563         .addImm(MergedOffset) // offset
1564         .addImm(CI.CPol)      // cpol
1565         .addImm(0)            // swz
1566         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1567 
1568   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1569 
1570   CI.I->eraseFromParent();
1571   Paired.I->eraseFromParent();
1572   return New;
1573 }
1574 
1575 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1576     CombineInfo &CI, CombineInfo &Paired,
1577     MachineBasicBlock::iterator InsertBefore) {
1578   MachineBasicBlock *MBB = CI.I->getParent();
1579   DebugLoc DL = CI.I->getDebugLoc();
1580 
1581   const unsigned Opcode = getNewOpcode(CI, Paired);
1582 
1583   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1584 
1585   // Create the destination register for the merged load.
1586   Register DestReg = MRI->createVirtualRegister(SuperRC);
1587   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1588 
1589   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1590 
1591   AddressRegs Regs = getRegs(Opcode, *TII);
1592 
1593   if (Regs.VAddr)
1594     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1595 
1596   unsigned JoinedFormat =
1597       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1598 
1599   // It shouldn't be possible to get this far if the two instructions
1600   // don't have a single memoperand, because MachineInstr::mayAlias()
1601   // will return true if this is the case.
1602   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1603 
1604   MachineInstr *New =
1605       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1606           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1607           .addImm(MergedOffset) // offset
1608           .addImm(JoinedFormat) // format
1609           .addImm(CI.CPol)      // cpol
1610           .addImm(0)            // swz
1611           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1612 
1613   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1614 
1615   CI.I->eraseFromParent();
1616   Paired.I->eraseFromParent();
1617   return New;
1618 }
1619 
1620 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1621     CombineInfo &CI, CombineInfo &Paired,
1622     MachineBasicBlock::iterator InsertBefore) {
1623   MachineBasicBlock *MBB = CI.I->getParent();
1624   DebugLoc DL = CI.I->getDebugLoc();
1625 
1626   const unsigned Opcode = getNewOpcode(CI, Paired);
1627 
1628   Register SrcReg =
1629       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1630 
1631   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1632                  .addReg(SrcReg, RegState::Kill);
1633 
1634   AddressRegs Regs = getRegs(Opcode, *TII);
1635 
1636   if (Regs.VAddr)
1637     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1638 
1639   unsigned JoinedFormat =
1640       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1641 
1642   // It shouldn't be possible to get this far if the two instructions
1643   // don't have a single memoperand, because MachineInstr::mayAlias()
1644   // will return true if this is the case.
1645   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1646 
1647   MachineInstr *New =
1648       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1649           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1650           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1651           .addImm(JoinedFormat)                       // format
1652           .addImm(CI.CPol)                            // cpol
1653           .addImm(0)                                  // swz
1654           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1655 
1656   CI.I->eraseFromParent();
1657   Paired.I->eraseFromParent();
1658   return New;
1659 }
1660 
1661 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1662     CombineInfo &CI, CombineInfo &Paired,
1663     MachineBasicBlock::iterator InsertBefore) {
1664   MachineBasicBlock *MBB = CI.I->getParent();
1665   DebugLoc DL = CI.I->getDebugLoc();
1666 
1667   const unsigned Opcode = getNewOpcode(CI, Paired);
1668 
1669   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1670   Register DestReg = MRI->createVirtualRegister(SuperRC);
1671 
1672   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1673 
1674   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1675     MIB.add(*SAddr);
1676 
1677   MachineInstr *New =
1678     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1679        .addImm(std::min(CI.Offset, Paired.Offset))
1680        .addImm(CI.CPol)
1681        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1682 
1683   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1684 
1685   CI.I->eraseFromParent();
1686   Paired.I->eraseFromParent();
1687   return New;
1688 }
1689 
1690 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1691     CombineInfo &CI, CombineInfo &Paired,
1692     MachineBasicBlock::iterator InsertBefore) {
1693   MachineBasicBlock *MBB = CI.I->getParent();
1694   DebugLoc DL = CI.I->getDebugLoc();
1695 
1696   const unsigned Opcode = getNewOpcode(CI, Paired);
1697 
1698   Register SrcReg =
1699       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1700 
1701   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1702                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1703                  .addReg(SrcReg, RegState::Kill);
1704 
1705   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1706     MIB.add(*SAddr);
1707 
1708   MachineInstr *New =
1709     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1710        .addImm(CI.CPol)
1711        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1712 
1713   CI.I->eraseFromParent();
1714   Paired.I->eraseFromParent();
1715   return New;
1716 }
1717 
1718 static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1719                                    ArrayRef<MachineMemOperand *> MMOs,
1720                                    unsigned Width) {
1721   // Conservatively return true if the MMO could not be found.
1722   return STM.isXNACKEnabled() &&
1723          (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1724 }
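
// For example, if XNACK is enabled and two dword loads are merged (Width ==
// 2, i.e. 8 bytes) but the known alignment is only 4 bytes, getNewOpcode()
// below picks a constrained "_ec" opcode such as S_LOAD_DWORDX2_IMM_ec
// instead of S_LOAD_DWORDX2_IMM.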
1725 
1726 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1727                                             const CombineInfo &Paired) {
1728   const unsigned Width = CI.Width + Paired.Width;
1729 
1730   switch (getCommonInstClass(CI, Paired)) {
1731   default:
1732     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1733     // FIXME: Handle d16 correctly
1734     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1735                                   Width);
1736   case TBUFFER_LOAD:
1737   case TBUFFER_STORE:
1738     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1739                                   Width);
1740 
1741   case UNKNOWN:
1742     llvm_unreachable("Unknown instruction class");
1743   case S_BUFFER_LOAD_IMM: {
1744     // If XNACK is enabled, use the constrained opcodes when the first load is
1745     // under-aligned.
1746     bool NeedsConstrainedOpc =
1747         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1748     switch (Width) {
1749     default:
1750       return 0;
1751     case 2:
1752       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1753                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1754     case 3:
1755       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1756                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1757     case 4:
1758       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1759                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1760     case 8:
1761       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1762                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1763     }
1764   }
1765   case S_BUFFER_LOAD_SGPR_IMM: {
1766     // If XNACK is enabled, use the constrained opcodes when the first load is
1767     // under-aligned.
1768     bool NeedsConstrainedOpc =
1769         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1770     switch (Width) {
1771     default:
1772       return 0;
1773     case 2:
1774       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1775                                  : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1776     case 3:
1777       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1778                                  : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1779     case 4:
1780       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1781                                  : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1782     case 8:
1783       return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1784                                  : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1785     }
1786   }
1787   case S_LOAD_IMM: {
1788     // If XNACK is enabled, use the constrained opcodes when the first load is
1789     // under-aligned.
1790     bool NeedsConstrainedOpc =
1791         needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1792     switch (Width) {
1793     default:
1794       return 0;
1795     case 2:
1796       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1797                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
1798     case 3:
1799       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1800                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1801     case 4:
1802       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1803                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1804     case 8:
1805       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1806                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1807     }
1808   }
1809   case GLOBAL_LOAD:
1810     switch (Width) {
1811     default:
1812       return 0;
1813     case 2:
1814       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1815     case 3:
1816       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1817     case 4:
1818       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1819     }
1820   case GLOBAL_LOAD_SADDR:
1821     switch (Width) {
1822     default:
1823       return 0;
1824     case 2:
1825       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1826     case 3:
1827       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1828     case 4:
1829       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1830     }
1831   case GLOBAL_STORE:
1832     switch (Width) {
1833     default:
1834       return 0;
1835     case 2:
1836       return AMDGPU::GLOBAL_STORE_DWORDX2;
1837     case 3:
1838       return AMDGPU::GLOBAL_STORE_DWORDX3;
1839     case 4:
1840       return AMDGPU::GLOBAL_STORE_DWORDX4;
1841     }
1842   case GLOBAL_STORE_SADDR:
1843     switch (Width) {
1844     default:
1845       return 0;
1846     case 2:
1847       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1848     case 3:
1849       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1850     case 4:
1851       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1852     }
1853   case FLAT_LOAD:
1854     switch (Width) {
1855     default:
1856       return 0;
1857     case 2:
1858       return AMDGPU::FLAT_LOAD_DWORDX2;
1859     case 3:
1860       return AMDGPU::FLAT_LOAD_DWORDX3;
1861     case 4:
1862       return AMDGPU::FLAT_LOAD_DWORDX4;
1863     }
1864   case FLAT_STORE:
1865     switch (Width) {
1866     default:
1867       return 0;
1868     case 2:
1869       return AMDGPU::FLAT_STORE_DWORDX2;
1870     case 3:
1871       return AMDGPU::FLAT_STORE_DWORDX3;
1872     case 4:
1873       return AMDGPU::FLAT_STORE_DWORDX4;
1874     }
1875   case MIMG:
1876     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1877            "No overlaps");
1878     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1879   }
1880 }
1881 
1882 std::pair<unsigned, unsigned>
1883 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1884                                     const CombineInfo &Paired) {
1885   assert((CI.InstClass != MIMG ||
1886           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1887            CI.Width + Paired.Width)) &&
1888          "No overlaps");
1889 
1890   unsigned Idx0;
1891   unsigned Idx1;
1892 
1893   static const unsigned Idxs[5][4] = {
1894       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1895       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1896       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1897       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1898       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1899   };
1900 
1901   assert(CI.Width >= 1 && CI.Width <= 4);
1902   assert(Paired.Width >= 1 && Paired.Width <= 4);
1903 
1904   if (Paired < CI) {
1905     Idx1 = Idxs[0][Paired.Width - 1];
1906     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1907   } else {
1908     Idx0 = Idxs[0][CI.Width - 1];
1909     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1910   }
1911 
1912   return {Idx0, Idx1};
1913 }
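
// Worked example: assuming CI comes first (i.e. !(Paired < CI)), CI.Width ==
// 1 and Paired.Width == 2 give
//   Idx0 = Idxs[0][0] = AMDGPU::sub0
//   Idx1 = Idxs[1][1] = AMDGPU::sub1_sub2
// so CI's value lives in sub0 of the merged register and Paired's value in
// sub1_sub2.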
1914 
1915 const TargetRegisterClass *
1916 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1917                                              const CombineInfo &Paired) const {
1918   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1919       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1920     switch (CI.Width + Paired.Width) {
1921     default:
1922       return nullptr;
1923     case 2:
1924       return &AMDGPU::SReg_64_XEXECRegClass;
1925     case 3:
1926       return &AMDGPU::SGPR_96RegClass;
1927     case 4:
1928       return &AMDGPU::SGPR_128RegClass;
1929     case 8:
1930       return &AMDGPU::SGPR_256RegClass;
1931     case 16:
1932       return &AMDGPU::SGPR_512RegClass;
1933     }
1934   }
1935 
1936   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1937   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1938              ? TRI->getAGPRClassForBitWidth(BitWidth)
1939              : TRI->getVGPRClassForBitWidth(BitWidth);
1940 }
1941 
1942 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1943     CombineInfo &CI, CombineInfo &Paired,
1944     MachineBasicBlock::iterator InsertBefore) {
1945   MachineBasicBlock *MBB = CI.I->getParent();
1946   DebugLoc DL = CI.I->getDebugLoc();
1947 
1948   const unsigned Opcode = getNewOpcode(CI, Paired);
1949 
1950   Register SrcReg =
1951       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1952 
1953   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1954                  .addReg(SrcReg, RegState::Kill);
1955 
1956   AddressRegs Regs = getRegs(Opcode, *TII);
1957 
1958   if (Regs.VAddr)
1959     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1960 
1962   // It shouldn't be possible to get this far if the two instructions
1963   // don't have a single memoperand, because MachineInstr::mayAlias()
1964   // will return true if this is the case.
1965   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1966 
1967   MachineInstr *New =
1968     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1969         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1970         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1971         .addImm(CI.CPol)                            // cpol
1972         .addImm(0)                                  // swz
1973         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1974 
1975   CI.I->eraseFromParent();
1976   Paired.I->eraseFromParent();
1977   return New;
1978 }
1979 
1980 MachineOperand
1981 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1982   APInt V(32, Val, true);
1983   if (TII->isInlineConstant(V))
1984     return MachineOperand::CreateImm(Val);
1985 
1986   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1987   MachineInstr *Mov =
1988   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1989           TII->get(AMDGPU::S_MOV_B32), Reg)
1990     .addImm(Val);
1991   (void)Mov;
1992   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1993   return MachineOperand::CreateReg(Reg, false);
1994 }
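
// E.g. createRegOrImm(8, MI) returns a plain immediate operand, since 8 is an
// inline constant, while createRegOrImm(0x1800, MI) emits
//   s_mov_b32 <sreg>, 0x1800
// and returns a register operand (a sketch; the exact inline-constant range
// is whatever SIInstrInfo::isInlineConstant accepts).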
1995 
1996 // Compute the base address from Addr and return the final register.
1997 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1998                                            const MemAddress &Addr) const {
1999   MachineBasicBlock *MBB = MI.getParent();
2000   MachineBasicBlock::iterator MBBI = MI.getIterator();
2001   DebugLoc DL = MI.getDebugLoc();
2002 
2003   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2004           Addr.Base.LoSubReg) &&
2005          "Expected 32-bit Base-Register-Low!!");
2006 
2007   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2008           Addr.Base.HiSubReg) &&
2009          "Expected 32-bit Base-Register-Hi!!");
2010 
2011   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
2012   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2013   MachineOperand OffsetHi =
2014     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2015 
2016   const auto *CarryRC = TRI->getWaveMaskRegClass();
2017   Register CarryReg = MRI->createVirtualRegister(CarryRC);
2018   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2019 
2020   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2021   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2022   MachineInstr *LoHalf =
2023     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2024       .addReg(CarryReg, RegState::Define)
2025       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2026       .add(OffsetLo)
2027       .addImm(0); // clamp bit
2028   (void)LoHalf;
2029   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
2030 
2031   MachineInstr *HiHalf =
2032   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2033     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2034     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2035     .add(OffsetHi)
2036     .addReg(CarryReg, RegState::Kill)
2037     .addImm(0); // clamp bit
2038   (void)HiHalf;
2039   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
2040 
2041   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2042   MachineInstr *FullBase =
2043     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2044       .addReg(DestSub0)
2045       .addImm(AMDGPU::sub0)
2046       .addReg(DestSub1)
2047       .addImm(AMDGPU::sub1);
2048   (void)FullBase;
2049   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
2050 
2051   return FullDestReg;
2052 }
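
// The emitted sequence is a plain 64-bit add, e.g. (virtual register names
// illustrative):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, <offset_lo>, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 %base_hi, <offset_hi>, %carry, 0
//   %full:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1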
2053 
2054 // Update the base register and offset in MI with NewBase and NewOffset.
2055 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2056                                                Register NewBase,
2057                                                int32_t NewOffset) const {
2058   auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2059   Base->setReg(NewBase);
2060   Base->setIsKill(false);
2061   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2062 }
2063 
2064 std::optional<int32_t>
2065 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2066   if (Op.isImm())
2067     return Op.getImm();
2068 
2069   if (!Op.isReg())
2070     return std::nullopt;
2071 
2072   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2073   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2074       !Def->getOperand(1).isImm())
2075     return std::nullopt;
2076 
2077   return Def->getOperand(1).getImm();
2078 }
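
// E.g. an operand defined by "%0:sgpr_32 = S_MOV_B32 8000" yields 8000; any
// other register definition yields std::nullopt.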
2079 
2080 // Analyze Base and extract:
2081 //  - 32-bit base registers and subregisters
2082 //  - a 64-bit constant offset
2083 // Expecting the base computation as:
2084 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2085 //   %LO:vgpr_32, %c:sreg_64_xexec =
2086 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
2087 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2088 //   %Base:vreg_64 =
2089 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2090 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2091                                                       MemAddress &Addr) const {
2092   if (!Base.isReg())
2093     return;
2094 
2095   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2096   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2097       || Def->getNumOperands() != 5)
2098     return;
2099 
2100   MachineOperand BaseLo = Def->getOperand(1);
2101   MachineOperand BaseHi = Def->getOperand(3);
2102   if (!BaseLo.isReg() || !BaseHi.isReg())
2103     return;
2104 
2105   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2106   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2107 
2108   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2109       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2110     return;
2111 
2112   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2113   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2114 
2115   auto Offset0P = extractConstOffset(*Src0);
2116   if (Offset0P)
2117     BaseLo = *Src1;
2118   else {
2119     if (!(Offset0P = extractConstOffset(*Src1)))
2120       return;
2121     BaseLo = *Src0;
2122   }
2123 
2124   if (!BaseLo.isReg())
2125     return;
2126 
2127   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2128   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2129 
2130   if (Src0->isImm())
2131     std::swap(Src0, Src1);
2132 
2133   if (!Src1->isImm() || Src0->isImm())
2134     return;
2135 
2136   uint64_t Offset1 = Src1->getImm();
2137   BaseHi = *Src0;
2138 
2139   if (!BaseHi.isReg())
2140     return;
2141 
2142   Addr.Base.LoReg = BaseLo.getReg();
2143   Addr.Base.HiReg = BaseHi.getReg();
2144   Addr.Base.LoSubReg = BaseLo.getSubReg();
2145   Addr.Base.HiSubReg = BaseHi.getSubReg();
2146   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2147 }
2148 
2149 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2150     MachineInstr &MI,
2151     MemInfoMap &Visited,
2152     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2153 
2154   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2155     return false;
2156 
2157   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2158   if (SIInstrInfo::isFLATScratch(MI))
2159     return false;
2160 
2161   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2162                                               : AMDGPUAS::FLAT_ADDRESS;
2163 
2164   if (AnchorList.count(&MI))
2165     return false;
2166 
2167   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2168 
2169   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2170     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2171     return false;
2172   }
2173 
2174   // Step 1: Find the base registers and a 64-bit constant offset.
2175   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2176   auto [It, Inserted] = Visited.try_emplace(&MI);
2177   MemAddress MAddr;
2178   if (Inserted) {
2179     processBaseWithConstOffset(Base, MAddr);
2180     It->second = MAddr;
2181   } else
2182     MAddr = It->second;
2183 
2184   if (MAddr.Offset == 0) {
2185     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2186                          " constant offsets that can be promoted.\n";);
2187     return false;
2188   }
2189 
2190   LLVM_DEBUG(dbgs() << "  BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2191                     << printReg(MAddr.Base.LoReg, TRI)
2192                     << "} Offset: " << MAddr.Offset << "\n\n";);
2193 
2194   // Step 2: Traverse through MI's basic block and find an anchor (that has the
2195   // same base registers) with the highest 13-bit distance from MI's offset.
2196   // E.g. (64bit loads)
2197   // bb:
2198   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2199   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2200   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2201   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2202   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2203   //
2204   // Starting from the first load, the optimization will try to find a new base
2205   // from which (&a + 4096) is within 13-bit offset distance. Both &a + 6144 and
2206   // &a + 8192 are within 13-bit distance from &a + 4096. The heuristic picks
2207   // &a + 8192 as the new base (anchor) because its distance is the maximum,
2208   // which can presumably accommodate more intermediate bases.
2209   //
2210   // Step 3: Move (&a + 8192) above load1, then compute and promote offsets from
2211   // (&a + 8192) for load1, load2 and load4.
2212   //   addr = &a + 8192
2213   //   load1 = load(addr,       -4096)
2214   //   load2 = load(addr,       -2048)
2215   //   load3 = load(addr,       0)
2216   //   load4 = load(addr,       2048)
2217   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2218   //
2219   MachineInstr *AnchorInst = nullptr;
2220   MemAddress AnchorAddr;
2221   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2222   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2223 
2224   MachineBasicBlock *MBB = MI.getParent();
2225   MachineBasicBlock::iterator E = MBB->end();
2226   MachineBasicBlock::iterator MBBI = MI.getIterator();
2227   ++MBBI;
2228   const SITargetLowering *TLI =
2229     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2230 
2231   for ( ; MBBI != E; ++MBBI) {
2232     MachineInstr &MINext = *MBBI;
2233     // TODO: Support finding an anchor(with same base) from store addresses or
2234     // any other load addresses where the opcodes are different.
2235     if (MINext.getOpcode() != MI.getOpcode() ||
2236         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2237       continue;
2238 
2239     const MachineOperand &BaseNext =
2240       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2241     MemAddress MAddrNext;
2242     auto [It, Inserted] = Visited.try_emplace(&MINext);
2243     if (Inserted) {
2244       processBaseWithConstOffset(BaseNext, MAddrNext);
2245       It->second = MAddrNext;
2246     } else
2247       MAddrNext = It->second;
2248 
2249     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2250         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2251         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2252         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2253       continue;
2254 
2255     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2256 
2257     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2258     TargetLoweringBase::AddrMode AM;
2259     AM.HasBaseReg = true;
2260     AM.BaseOffs = Dist;
2261     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2262         (uint32_t)std::abs(Dist) > MaxDist) {
2263       MaxDist = std::abs(Dist);
2264 
2265       AnchorAddr = MAddrNext;
2266       AnchorInst = &MINext;
2267     }
2268   }
2269 
2270   if (AnchorInst) {
2271     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2272                AnchorInst->dump());
2273     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2274                <<  AnchorAddr.Offset << "\n\n");
2275 
2276     // Instead of moving up, just re-compute anchor-instruction's base address.
2277     Register Base = computeBase(MI, AnchorAddr);
2278 
2279     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2280     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2281 
2282     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2283       TargetLoweringBase::AddrMode AM;
2284       AM.HasBaseReg = true;
2285       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2286 
2287       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2288         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset << ") ";
2289                    OtherMI->dump());
2290         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2291         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2292       }
2293     }
2294     AnchorList.insert(AnchorInst);
2295     return true;
2296   }
2297 
2298   return false;
2299 }
2300 
2301 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2302                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2303   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2304     if (AddrList.front().InstClass == CI.InstClass &&
2305         AddrList.front().IsAGPR == CI.IsAGPR &&
2306         AddrList.front().hasSameBaseAddress(CI)) {
2307       AddrList.emplace_back(CI);
2308       return;
2309     }
2310   }
2311 
2312   // Base address not found, so add a new list.
2313   MergeableInsts.emplace_back(1, CI);
2314 }
2315 
2316 std::pair<MachineBasicBlock::iterator, bool>
2317 SILoadStoreOptimizer::collectMergeableInsts(
2318     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2319     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2320     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2321   bool Modified = false;
2322 
2323   // Sort potentially mergeable instructions into lists, one list per base address.
2324   unsigned Order = 0;
2325   MachineBasicBlock::iterator BlockI = Begin;
2326   for (; BlockI != End; ++BlockI) {
2327     MachineInstr &MI = *BlockI;
2328 
2329     // We run this before checking if an address is mergeable, because it can produce
2330     // better code even if the instructions aren't mergeable.
2331     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2332       Modified = true;
2333 
2334     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2335     // barriers. Searching for mergeable instructions resumes after the barrier.
2336     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2337       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2338 
2339       // Search will resume after this instruction in a separate merge list.
2340       ++BlockI;
2341       break;
2342     }
2343 
2344     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2345     if (InstClass == UNKNOWN)
2346       continue;
2347 
2348     // Do not merge VMEM buffer instructions with the "swizzled" bit set.
2349     int Swizzled =
2350         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2351     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2352       continue;
2353 
2354     CombineInfo CI;
2355     CI.setMI(MI, *this);
2356     CI.Order = Order++;
2357 
2358     if (!CI.hasMergeableAddress(*MRI))
2359       continue;
2360 
2361     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2362       // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
2363       //        operands. However, we report that ds_write2 takes only VGPR
2364       //        data so that machine copy propagation does not create an
2365       //        illegal instruction with mixed VGPR and AGPR sources.
2366       //        Consequently, if we create such an instruction, the verifier
2367       //        will complain.
2368       continue;
2369     }
2370 
2371     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2372 
2373     addInstToMergeableList(CI, MergeableInsts);
2374   }
2375 
2376   // At this point we have lists of mergeable instructions.
2377   //
2378   // Part 2: Sort each list by offset, and discard any list with fewer than
2379   // two entries, since at least two instructions are needed for a merge.
2380   // Pairing of adjacent entries happens later, in
2381   // optimizeInstsWithSameBaseAddr().
2382 
2383   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2384                                                    E = MergeableInsts.end(); I != E;) {
2385 
2386     std::list<CombineInfo> &MergeList = *I;
2387     if (MergeList.size() <= 1) {
2388       // This means we have found only one instruction with a given address
2389       // that can be merged, and we need at least 2 instructions to do a merge,
2390       // so this list can be discarded.
2391       I = MergeableInsts.erase(I);
2392       continue;
2393     }
2394 
2395     // Sort the lists by offset; this way mergeable instructions will be
2396     // adjacent to each other in the list, which will make it easier to find
2397     // matches.
2398     MergeList.sort(
2399         [] (const CombineInfo &A, const CombineInfo &B) {
2400           return A.Offset < B.Offset;
2401         });
2402     ++I;
2403   }
2404 
2405   return {BlockI, Modified};
2406 }
2407 
2408 // Scan through looking for adjacent memory operations with constant offsets
2409 // from the same base register. We rely on the scheduler to do the hard work of
2410 // clustering nearby loads, and assume these are all adjacent.
2411 bool SILoadStoreOptimizer::optimizeBlock(
2412                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2413   bool Modified = false;
2414 
2415   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2416                                                    E = MergeableInsts.end(); I != E;) {
2417     std::list<CombineInfo> &MergeList = *I;
2418 
2419     bool OptimizeListAgain = false;
2420     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2421       // We weren't able to make any changes, so delete the list so we don't
2422       // process the same instructions the next time we try to optimize this
2423       // block.
2424       I = MergeableInsts.erase(I);
2425       continue;
2426     }
2427 
2428     Modified = true;
2429 
2430     // We made changes, but also determined that there were no more optimization
2431     // opportunities, so we don't need to reprocess the list.
2432     if (!OptimizeListAgain) {
2433       I = MergeableInsts.erase(I);
2434       continue;
2435     }
2436     OptimizeAgain = true;
2437   }
2438   return Modified;
2439 }
2440 
2441 bool
2442 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2443                                           std::list<CombineInfo> &MergeList,
2444                                           bool &OptimizeListAgain) {
2445   if (MergeList.empty())
2446     return false;
2447 
2448   bool Modified = false;
2449 
2450   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2451        Next = std::next(I)) {
2452 
2453     auto First = I;
2454     auto Second = Next;
2455 
2456     if ((*First).Order > (*Second).Order)
2457       std::swap(First, Second);
2458     CombineInfo &CI = *First;
2459     CombineInfo &Paired = *Second;
2460 
2461     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2462     if (!Where) {
2463       ++I;
2464       continue;
2465     }
2466 
2467     Modified = true;
2468 
2469     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2470 
2471     MachineBasicBlock::iterator NewMI;
2472     switch (CI.InstClass) {
2473     default:
2474       llvm_unreachable("unknown InstClass");
2475       break;
2476     case DS_READ:
2477       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2478       break;
2479     case DS_WRITE:
2480       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2481       break;
2482     case S_BUFFER_LOAD_IMM:
2483     case S_BUFFER_LOAD_SGPR_IMM:
2484     case S_LOAD_IMM:
2485       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2486       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2487       break;
2488     case BUFFER_LOAD:
2489       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2490       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2491       break;
2492     case BUFFER_STORE:
2493       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2494       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2495       break;
2496     case MIMG:
2497       NewMI = mergeImagePair(CI, Paired, Where->I);
2498       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2499       break;
2500     case TBUFFER_LOAD:
2501       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2502       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2503       break;
2504     case TBUFFER_STORE:
2505       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2506       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2507       break;
2508     case FLAT_LOAD:
2509     case GLOBAL_LOAD:
2510     case GLOBAL_LOAD_SADDR:
2511       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2512       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2513       break;
2514     case FLAT_STORE:
2515     case GLOBAL_STORE:
2516     case GLOBAL_STORE_SADDR:
2517       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2518       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2519       break;
2520     }
2521     CI.setMI(NewMI, *this);
2522     CI.Order = Where->Order;
2523     if (I == Second)
2524       I = Next;
2525 
2526     MergeList.erase(Second);
2527   }
2528 
2529   return Modified;
2530 }
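
// Note on OptimizeListAgain above: merges are pairwise, so wider merges can
// become possible only after narrower ones. A sketch with four dword loads:
// two pairs first merge into two x2 loads, and a second pass over the list
// merges those into a single x4 load. That is why a list is reprocessed while
// the combined width is still below the maximum (4 dwords for VMEM, 8 for
// SMEM).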
2531 
2532 bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2533   if (skipFunction(MF.getFunction()))
2534     return false;
2535   return SILoadStoreOptimizer(
2536              &getAnalysis<AAResultsWrapperPass>().getAAResults())
2537       .run(MF);
2538 }
2539 
2540 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2541   STM = &MF.getSubtarget<GCNSubtarget>();
2542   if (!STM->loadStoreOptEnabled())
2543     return false;
2544 
2545   TII = STM->getInstrInfo();
2546   TRI = &TII->getRegisterInfo();
2547 
2548   MRI = &MF.getRegInfo();
2549 
2550   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2551 
2552   bool Modified = false;
2553 
2554   // Contains the list of instructions for which constant offsets are being
2555   // promoted to the immediate. This is tracked for an entire block at a time.
2556   SmallPtrSet<MachineInstr *, 4> AnchorList;
2557   MemInfoMap Visited;
2558 
2559   for (MachineBasicBlock &MBB : MF) {
2560     MachineBasicBlock::iterator SectionEnd;
2561     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2562          I = SectionEnd) {
2563       bool CollectModified;
2564       std::list<std::list<CombineInfo>> MergeableInsts;
2565 
2566       // First pass: Collect list of all instructions we know how to merge in a
2567       // subset of the block.
2568       std::tie(SectionEnd, CollectModified) =
2569           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2570 
2571       Modified |= CollectModified;
2572 
2573       do {
2574         OptimizeAgain = false;
2575         Modified |= optimizeBlock(MergeableInsts);
2576       } while (OptimizeAgain);
2577     }
2578 
2579     Visited.clear();
2580     AnchorList.clear();
2581   }
2582 
2583   return Modified;
2584 }
2585 
2586 PreservedAnalyses
2587 SILoadStoreOptimizerPass::run(MachineFunction &MF,
2588                               MachineFunctionAnalysisManager &MFAM) {
2589   MFPropsModifier _(*this, MF);
2590 
2591   if (MF.getFunction().hasOptNone())
2592     return PreservedAnalyses::all();
2593 
2594   auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2595                   .getManager();
2596   AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2597 
2598   bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2599   if (!Changed)
2600     return PreservedAnalyses::all();
2601 
2602   PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2603   PA.preserveSet<CFGAnalyses>();
2604   return PA;
2605 }
2606