xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 59c8e88e72633afbc47a4ace0d2170d00d51f7dc)
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //  ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows it to have a 13-bit constant offset, and then promotes that 13-bit
25 // offset to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together that their differences fit
56 //   in 8 bits, we can add to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73   UNKNOWN,
74   DS_READ,
75   DS_WRITE,
76   S_BUFFER_LOAD_IMM,
77   S_BUFFER_LOAD_SGPR_IMM,
78   S_LOAD_IMM,
79   BUFFER_LOAD,
80   BUFFER_STORE,
81   MIMG,
82   TBUFFER_LOAD,
83   TBUFFER_STORE,
84   GLOBAL_LOAD_SADDR,
85   GLOBAL_STORE_SADDR,
86   FLAT_LOAD,
87   FLAT_STORE,
88   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89   GLOBAL_STORE // any CombineInfo, they are only ever returned by
90                // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94   unsigned char NumVAddrs = 0;
95   bool SBase = false;
96   bool SRsrc = false;
97   bool SOffset = false;
98   bool SAddr = false;
99   bool VAddr = false;
100   bool Addr = false;
101   bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145           return false;
146         }
147       }
148       return true;
149     }
150 
151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register operands.
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge physical reg addresses.
165         if (AddrOp->getReg().isPhysical())
166           return false;
167 
168         // If an address has only one use then there will be no other
169         // instructions with the same address, so we can't merge this one.
170         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171           return false;
172       }
173       return true;
174     }
175 
176     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177 
178     // Compare by offset (dmask for MIMG), i.e. by order along the pointer.
179     bool operator<(const CombineInfo& Other) const {
180       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181     }
182   };
183 
184   struct BaseRegisters {
185     Register LoReg;
186     Register HiReg;
187 
188     unsigned LoSubReg = 0;
189     unsigned HiSubReg = 0;
190   };
191 
192   struct MemAddress {
193     BaseRegisters Base;
194     int64_t Offset = 0;
195   };
196 
197   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198 
199 private:
200   const GCNSubtarget *STM = nullptr;
201   const SIInstrInfo *TII = nullptr;
202   const SIRegisterInfo *TRI = nullptr;
203   MachineRegisterInfo *MRI = nullptr;
204   AliasAnalysis *AA = nullptr;
205   bool OptimizeAgain;
206 
207   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208                            const DenseSet<Register> &ARegUses,
209                            const MachineInstr &A, const MachineInstr &B) const;
210   static bool dmasksCanBeCombined(const CombineInfo &CI,
211                                   const SIInstrInfo &TII,
212                                   const CombineInfo &Paired);
213   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214                                    CombineInfo &Paired, bool Modify = false);
215   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216                         const CombineInfo &Paired);
217   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219                                                      const CombineInfo &Paired);
220   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221                                                     const CombineInfo &Paired);
222   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223 
224   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225 
226   unsigned read2Opcode(unsigned EltSize) const;
227   unsigned read2ST64Opcode(unsigned EltSize) const;
228   MachineBasicBlock::iterator
229   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230                  MachineBasicBlock::iterator InsertBefore);
231 
232   unsigned write2Opcode(unsigned EltSize) const;
233   unsigned write2ST64Opcode(unsigned EltSize) const;
234   MachineBasicBlock::iterator
235   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236                   MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239                  MachineBasicBlock::iterator InsertBefore);
240   MachineBasicBlock::iterator
241   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242                        MachineBasicBlock::iterator InsertBefore);
243   MachineBasicBlock::iterator
244   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245                       MachineBasicBlock::iterator InsertBefore);
246   MachineBasicBlock::iterator
247   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248                        MachineBasicBlock::iterator InsertBefore);
249   MachineBasicBlock::iterator
250   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251                        MachineBasicBlock::iterator InsertBefore);
252   MachineBasicBlock::iterator
253   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254                         MachineBasicBlock::iterator InsertBefore);
255   MachineBasicBlock::iterator
256   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257                     MachineBasicBlock::iterator InsertBefore);
258   MachineBasicBlock::iterator
259   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260                      MachineBasicBlock::iterator InsertBefore);
261 
262   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263                            int32_t NewOffset) const;
264   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268   /// Promotes a constant offset to the immediate by adjusting the base. It
269   /// tries to use a base from nearby instructions that allows it to have
270   /// a 13-bit constant offset which gets promoted to the immediate.
271   bool promoteConstantOffsetToImm(MachineInstr &CI,
272                                   MemInfoMap &Visited,
273                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274   void addInstToMergeableList(const CombineInfo &CI,
275                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
276 
277   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
278       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
279       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280       std::list<std::list<CombineInfo>> &MergeableInsts) const;
281 
282   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283                                                      const CombineInfo &Paired);
284 
285   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286                                           const CombineInfo &Paired);
287 
288 public:
289   static char ID;
290 
291   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293   }
294 
295   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296                                      bool &OptimizeListAgain);
297   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298 
299   bool runOnMachineFunction(MachineFunction &MF) override;
300 
301   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302 
303   void getAnalysisUsage(AnalysisUsage &AU) const override {
304     AU.setPreservesCFG();
305     AU.addRequired<AAResultsWrapperPass>();
306 
307     MachineFunctionPass::getAnalysisUsage(AU);
308   }
309 
310   MachineFunctionProperties getRequiredProperties() const override {
311     return MachineFunctionProperties()
312       .set(MachineFunctionProperties::Property::IsSSA);
313   }
314 };
315 
316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317   const unsigned Opc = MI.getOpcode();
318 
319   if (TII.isMUBUF(Opc)) {
320     // FIXME: Handle d16 correctly
321     return AMDGPU::getMUBUFElements(Opc);
322   }
323   if (TII.isMIMG(MI)) {
324     uint64_t DMaskImm =
325         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326     return llvm::popcount(DMaskImm);
327   }
328   if (TII.isMTBUF(Opc)) {
329     return AMDGPU::getMTBUFElements(Opc);
330   }
331 
332   switch (Opc) {
333   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
335   case AMDGPU::S_LOAD_DWORD_IMM:
336   case AMDGPU::GLOBAL_LOAD_DWORD:
337   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
338   case AMDGPU::GLOBAL_STORE_DWORD:
339   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
340   case AMDGPU::FLAT_LOAD_DWORD:
341   case AMDGPU::FLAT_STORE_DWORD:
342     return 1;
343   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
344   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
345   case AMDGPU::S_LOAD_DWORDX2_IMM:
346   case AMDGPU::GLOBAL_LOAD_DWORDX2:
347   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
348   case AMDGPU::GLOBAL_STORE_DWORDX2:
349   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
350   case AMDGPU::FLAT_LOAD_DWORDX2:
351   case AMDGPU::FLAT_STORE_DWORDX2:
352     return 2;
353   case AMDGPU::GLOBAL_LOAD_DWORDX3:
354   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
355   case AMDGPU::GLOBAL_STORE_DWORDX3:
356   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
357   case AMDGPU::FLAT_LOAD_DWORDX3:
358   case AMDGPU::FLAT_STORE_DWORDX3:
359     return 3;
360   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
361   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
362   case AMDGPU::S_LOAD_DWORDX4_IMM:
363   case AMDGPU::GLOBAL_LOAD_DWORDX4:
364   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
365   case AMDGPU::GLOBAL_STORE_DWORDX4:
366   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
367   case AMDGPU::FLAT_LOAD_DWORDX4:
368   case AMDGPU::FLAT_STORE_DWORDX4:
369     return 4;
370   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
371   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
372   case AMDGPU::S_LOAD_DWORDX8_IMM:
373     return 8;
374   case AMDGPU::DS_READ_B32:      [[fallthrough]];
375   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
376   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
377   case AMDGPU::DS_WRITE_B32_gfx9:
378     return 1;
379   case AMDGPU::DS_READ_B64:      [[fallthrough]];
380   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
381   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
382   case AMDGPU::DS_WRITE_B64_gfx9:
383     return 2;
384   default:
385     return 0;
386   }
387 }
388 
389 /// Maps instruction opcode to enum InstClassEnum.
390 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
391   switch (Opc) {
392   default:
393     if (TII.isMUBUF(Opc)) {
394       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
395       default:
396         return UNKNOWN;
397       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
398       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
399       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
400       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
401         return BUFFER_LOAD;
402       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
403       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
404       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
405       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
406         return BUFFER_STORE;
407       }
408     }
409     if (TII.isMIMG(Opc)) {
410       // Ignore instructions encoded without vaddr.
411       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
412           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
413         return UNKNOWN;
414       // Ignore BVH instructions
415       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
416         return UNKNOWN;
417       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
418       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
419           TII.isGather4(Opc))
420         return UNKNOWN;
421       return MIMG;
422     }
423     if (TII.isMTBUF(Opc)) {
424       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
425       default:
426         return UNKNOWN;
427       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
428       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
429       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
430       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
431       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
432       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
433       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
434       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
435         return TBUFFER_LOAD;
436       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
437       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
438       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
439       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
440         return TBUFFER_STORE;
441       }
442     }
443     return UNKNOWN;
444   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448     return S_BUFFER_LOAD_IMM;
449   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
450   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
451   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
452   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
453     return S_BUFFER_LOAD_SGPR_IMM;
454   case AMDGPU::S_LOAD_DWORD_IMM:
455   case AMDGPU::S_LOAD_DWORDX2_IMM:
456   case AMDGPU::S_LOAD_DWORDX4_IMM:
457   case AMDGPU::S_LOAD_DWORDX8_IMM:
458     return S_LOAD_IMM;
459   case AMDGPU::DS_READ_B32:
460   case AMDGPU::DS_READ_B32_gfx9:
461   case AMDGPU::DS_READ_B64:
462   case AMDGPU::DS_READ_B64_gfx9:
463     return DS_READ;
464   case AMDGPU::DS_WRITE_B32:
465   case AMDGPU::DS_WRITE_B32_gfx9:
466   case AMDGPU::DS_WRITE_B64:
467   case AMDGPU::DS_WRITE_B64_gfx9:
468     return DS_WRITE;
469   case AMDGPU::GLOBAL_LOAD_DWORD:
470   case AMDGPU::GLOBAL_LOAD_DWORDX2:
471   case AMDGPU::GLOBAL_LOAD_DWORDX3:
472   case AMDGPU::GLOBAL_LOAD_DWORDX4:
473   case AMDGPU::FLAT_LOAD_DWORD:
474   case AMDGPU::FLAT_LOAD_DWORDX2:
475   case AMDGPU::FLAT_LOAD_DWORDX3:
476   case AMDGPU::FLAT_LOAD_DWORDX4:
477     return FLAT_LOAD;
478   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
479   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
480   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
481   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
482     return GLOBAL_LOAD_SADDR;
483   case AMDGPU::GLOBAL_STORE_DWORD:
484   case AMDGPU::GLOBAL_STORE_DWORDX2:
485   case AMDGPU::GLOBAL_STORE_DWORDX3:
486   case AMDGPU::GLOBAL_STORE_DWORDX4:
487   case AMDGPU::FLAT_STORE_DWORD:
488   case AMDGPU::FLAT_STORE_DWORDX2:
489   case AMDGPU::FLAT_STORE_DWORDX3:
490   case AMDGPU::FLAT_STORE_DWORDX4:
491     return FLAT_STORE;
492   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
493   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
494   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
495   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
496     return GLOBAL_STORE_SADDR;
497   }
498 }
499 
500 /// Determines instruction subclass from opcode. Only instructions
501 /// of the same subclass can be merged together. The merged instruction may have
502 /// a different subclass but must have the same class.
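/// For example (illustrative): S_LOAD_DWORD_IMM and S_LOAD_DWORDX2_IMM both
/// map to the S_LOAD_DWORD_IMM subclass and may be paired, whereas
/// BUFFER_LOAD_DWORD_OFFEN and BUFFER_LOAD_DWORD_OFFSET have different MUBUF
/// base opcodes, hence different subclasses, and are never paired.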
503 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
504   switch (Opc) {
505   default:
506     if (TII.isMUBUF(Opc))
507       return AMDGPU::getMUBUFBaseOpcode(Opc);
508     if (TII.isMIMG(Opc)) {
509       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
510       assert(Info);
511       return Info->BaseOpcode;
512     }
513     if (TII.isMTBUF(Opc))
514       return AMDGPU::getMTBUFBaseOpcode(Opc);
515     return -1;
516   case AMDGPU::DS_READ_B32:
517   case AMDGPU::DS_READ_B32_gfx9:
518   case AMDGPU::DS_READ_B64:
519   case AMDGPU::DS_READ_B64_gfx9:
520   case AMDGPU::DS_WRITE_B32:
521   case AMDGPU::DS_WRITE_B32_gfx9:
522   case AMDGPU::DS_WRITE_B64:
523   case AMDGPU::DS_WRITE_B64_gfx9:
524     return Opc;
525   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
526   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
527   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
528   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
529     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
530   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
531   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
532   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
533   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
534     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
535   case AMDGPU::S_LOAD_DWORD_IMM:
536   case AMDGPU::S_LOAD_DWORDX2_IMM:
537   case AMDGPU::S_LOAD_DWORDX4_IMM:
538   case AMDGPU::S_LOAD_DWORDX8_IMM:
539     return AMDGPU::S_LOAD_DWORD_IMM;
540   case AMDGPU::GLOBAL_LOAD_DWORD:
541   case AMDGPU::GLOBAL_LOAD_DWORDX2:
542   case AMDGPU::GLOBAL_LOAD_DWORDX3:
543   case AMDGPU::GLOBAL_LOAD_DWORDX4:
544   case AMDGPU::FLAT_LOAD_DWORD:
545   case AMDGPU::FLAT_LOAD_DWORDX2:
546   case AMDGPU::FLAT_LOAD_DWORDX3:
547   case AMDGPU::FLAT_LOAD_DWORDX4:
548     return AMDGPU::FLAT_LOAD_DWORD;
549   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
550   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
551   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
552   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
553     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
554   case AMDGPU::GLOBAL_STORE_DWORD:
555   case AMDGPU::GLOBAL_STORE_DWORDX2:
556   case AMDGPU::GLOBAL_STORE_DWORDX3:
557   case AMDGPU::GLOBAL_STORE_DWORDX4:
558   case AMDGPU::FLAT_STORE_DWORD:
559   case AMDGPU::FLAT_STORE_DWORDX2:
560   case AMDGPU::FLAT_STORE_DWORDX3:
561   case AMDGPU::FLAT_STORE_DWORDX4:
562     return AMDGPU::FLAT_STORE_DWORD;
563   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
564   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
565   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
566   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
567     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
568   }
569 }
570 
571 // GLOBAL loads and stores are classified as FLAT initially. If both combined
572 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
573 // GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
574 // the resulting combined operation will be FLAT, potentially promoting one of
575 // the GLOBAL operations to FLAT.
576 // For other instructions, return the original unmodified class.
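// For example (illustrative): pairing two global_load_dword instructions
// yields GLOBAL_LOAD, so the merged access can stay in the GLOBAL encoding,
// whereas pairing a global_load_dword with a flat_load_dword keeps FLAT_LOAD
// and the merged access must use the FLAT encoding.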
577 InstClassEnum
578 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
579                                          const CombineInfo &Paired) {
580   assert(CI.InstClass == Paired.InstClass);
581 
582   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
583       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
584     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
585 
586   return CI.InstClass;
587 }
588 
589 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
590   AddressRegs Result;
591 
592   if (TII.isMUBUF(Opc)) {
593     if (AMDGPU::getMUBUFHasVAddr(Opc))
594       Result.VAddr = true;
595     if (AMDGPU::getMUBUFHasSrsrc(Opc))
596       Result.SRsrc = true;
597     if (AMDGPU::getMUBUFHasSoffset(Opc))
598       Result.SOffset = true;
599 
600     return Result;
601   }
602 
603   if (TII.isMIMG(Opc)) {
604     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
605     if (VAddr0Idx >= 0) {
606       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
607       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
608     } else {
609       Result.VAddr = true;
610     }
611     Result.SRsrc = true;
612     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
613     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
614       Result.SSamp = true;
615 
616     return Result;
617   }
618   if (TII.isMTBUF(Opc)) {
619     if (AMDGPU::getMTBUFHasVAddr(Opc))
620       Result.VAddr = true;
621     if (AMDGPU::getMTBUFHasSrsrc(Opc))
622       Result.SRsrc = true;
623     if (AMDGPU::getMTBUFHasSoffset(Opc))
624       Result.SOffset = true;
625 
626     return Result;
627   }
628 
629   switch (Opc) {
630   default:
631     return Result;
632   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
633   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
634   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
635   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
636     Result.SOffset = true;
637     [[fallthrough]];
638   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
639   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
640   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
641   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
642   case AMDGPU::S_LOAD_DWORD_IMM:
643   case AMDGPU::S_LOAD_DWORDX2_IMM:
644   case AMDGPU::S_LOAD_DWORDX4_IMM:
645   case AMDGPU::S_LOAD_DWORDX8_IMM:
646     Result.SBase = true;
647     return Result;
648   case AMDGPU::DS_READ_B32:
649   case AMDGPU::DS_READ_B64:
650   case AMDGPU::DS_READ_B32_gfx9:
651   case AMDGPU::DS_READ_B64_gfx9:
652   case AMDGPU::DS_WRITE_B32:
653   case AMDGPU::DS_WRITE_B64:
654   case AMDGPU::DS_WRITE_B32_gfx9:
655   case AMDGPU::DS_WRITE_B64_gfx9:
656     Result.Addr = true;
657     return Result;
658   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
659   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
660   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
661   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
662   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
663   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
664   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
665   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
666     Result.SAddr = true;
667     [[fallthrough]];
668   case AMDGPU::GLOBAL_LOAD_DWORD:
669   case AMDGPU::GLOBAL_LOAD_DWORDX2:
670   case AMDGPU::GLOBAL_LOAD_DWORDX3:
671   case AMDGPU::GLOBAL_LOAD_DWORDX4:
672   case AMDGPU::GLOBAL_STORE_DWORD:
673   case AMDGPU::GLOBAL_STORE_DWORDX2:
674   case AMDGPU::GLOBAL_STORE_DWORDX3:
675   case AMDGPU::GLOBAL_STORE_DWORDX4:
676   case AMDGPU::FLAT_LOAD_DWORD:
677   case AMDGPU::FLAT_LOAD_DWORDX2:
678   case AMDGPU::FLAT_LOAD_DWORDX3:
679   case AMDGPU::FLAT_LOAD_DWORDX4:
680   case AMDGPU::FLAT_STORE_DWORD:
681   case AMDGPU::FLAT_STORE_DWORDX2:
682   case AMDGPU::FLAT_STORE_DWORDX3:
683   case AMDGPU::FLAT_STORE_DWORDX4:
684     Result.VAddr = true;
685     return Result;
686   }
687 }
688 
689 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
690                                               const SILoadStoreOptimizer &LSO) {
691   I = MI;
692   unsigned Opc = MI->getOpcode();
693   InstClass = getInstClass(Opc, *LSO.TII);
694 
695   if (InstClass == UNKNOWN)
696     return;
697 
698   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
699 
700   switch (InstClass) {
701   case DS_READ:
702     EltSize =
703           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
704                                                                           : 4;
705     break;
706   case DS_WRITE:
707     EltSize =
708           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
709                                                                             : 4;
710     break;
711   case S_BUFFER_LOAD_IMM:
712   case S_BUFFER_LOAD_SGPR_IMM:
713   case S_LOAD_IMM:
714     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
715     break;
716   default:
717     EltSize = 4;
718     break;
719   }
720 
721   if (InstClass == MIMG) {
722     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
723     // Offset is not considered for MIMG instructions.
724     Offset = 0;
725   } else {
726     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
727     Offset = I->getOperand(OffsetIdx).getImm();
728   }
729 
730   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
731     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
732 
733   Width = getOpcodeWidth(*I, *LSO.TII);
734 
735   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
736     Offset &= 0xffff;
737   } else if (InstClass != MIMG) {
738     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
739   }
740 
741   AddressRegs Regs = getRegs(Opc, *LSO.TII);
742 
743   NumAddresses = 0;
744   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
745     AddrIdx[NumAddresses++] =
746         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
747   if (Regs.Addr)
748     AddrIdx[NumAddresses++] =
749         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
750   if (Regs.SBase)
751     AddrIdx[NumAddresses++] =
752         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
753   if (Regs.SRsrc)
754     AddrIdx[NumAddresses++] =
755         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
756   if (Regs.SOffset)
757     AddrIdx[NumAddresses++] =
758         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
759   if (Regs.SAddr)
760     AddrIdx[NumAddresses++] =
761         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
762   if (Regs.VAddr)
763     AddrIdx[NumAddresses++] =
764         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
765   if (Regs.SSamp)
766     AddrIdx[NumAddresses++] =
767         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
768   assert(NumAddresses <= MaxAddressRegs);
769 
770   for (unsigned J = 0; J < NumAddresses; J++)
771     AddrReg[J] = &I->getOperand(AddrIdx[J]);
772 }
773 
774 } // end anonymous namespace.
775 
776 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
777                       "SI Load Store Optimizer", false, false)
778 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
779 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
780                     false, false)
781 
782 char SILoadStoreOptimizer::ID = 0;
783 
784 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
785 
786 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
787   return new SILoadStoreOptimizer();
788 }
789 
790 static void addDefsUsesToList(const MachineInstr &MI,
791                               DenseSet<Register> &RegDefs,
792                               DenseSet<Register> &RegUses) {
793   for (const auto &Op : MI.operands()) {
794     if (!Op.isReg())
795       continue;
796     if (Op.isDef())
797       RegDefs.insert(Op.getReg());
798     if (Op.readsReg())
799       RegUses.insert(Op.getReg());
800   }
801 }
802 
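// Check whether \p A and \p B can safely be reordered past each other (a
// summary of the checks implemented below): they must not be two memory
// operations, at least one of which stores, that may alias; B must not define
// or read a register that A defines; and B must not define a register that A
// reads. \p ARegDefs and \p ARegUses are expected to hold A's register defs
// and uses, as collected by addDefsUsesToList().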
803 bool SILoadStoreOptimizer::canSwapInstructions(
804     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
805     const MachineInstr &A, const MachineInstr &B) const {
806   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
807       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
808     return false;
809   for (const auto &BOp : B.operands()) {
810     if (!BOp.isReg())
811       continue;
812     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
813       return false;
814     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
815       return false;
816   }
817   return true;
818 }
819 
820 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
821 // MMO for the combined operation with a new access size.
822 MachineMemOperand *
823 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
824                                                const CombineInfo &Paired) {
825   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
826   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
827 
828   unsigned Size = MMOa->getSize() + MMOb->getSize();
829 
830   // A base pointer for the combined operation is the same as the leading
831   // operation's pointer.
832   if (Paired < CI)
833     std::swap(MMOa, MMOb);
834 
835   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
836   // If merging FLAT and GLOBAL set address space to FLAT.
837   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
838     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
839 
840   MachineFunction *MF = CI.I->getMF();
841   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
842 }
843 
844 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
845                                                const SIInstrInfo &TII,
846                                                const CombineInfo &Paired) {
847   assert(CI.InstClass == MIMG);
848 
849   // Ignore instructions with tfe/lwe set.
850   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
851   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
852 
853   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
854     return false;
855 
856   // Check other optional immediate operands for equality.
857   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
858                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
859                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
860 
861   for (auto op : OperandsToMatch) {
862     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
863     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
864       return false;
865     if (Idx != -1 &&
866         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
867       return false;
868   }
869 
870   // Check DMask for overlaps.
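  // The two masks are only mergeable when the smaller mask occupies bit
  // positions strictly below the lowest set bit of the larger mask, e.g.
  // (illustrative values) 0b0011 and 0b1100 can be combined, while 0b0010 and
  // 0b0101 cannot.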
871   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
872   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
873 
874   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
875   if ((1u << AllowedBitsForMin) <= MinMask)
876     return false;
877 
878   return true;
879 }
880 
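// For example (illustrative): given a 32_32 buffer format (two 32-bit
// components) and ComponentCount == 4, this returns the matching 32_32_32_32
// format with the same numeric format, or 0 if the subtarget has no such
// format or ComponentCount is greater than 4.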
881 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
882                                        unsigned ComponentCount,
883                                        const GCNSubtarget &STI) {
884   if (ComponentCount > 4)
885     return 0;
886 
887   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
888       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
889   if (!OldFormatInfo)
890     return 0;
891 
892   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
893       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
894                                            ComponentCount,
895                                            OldFormatInfo->NumFormat, STI);
896 
897   if (!NewFormatInfo)
898     return 0;
899 
900   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
901          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
902 
903   return NewFormatInfo->Format;
904 }
905 
906 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
907 // highest power of two. Note that the result is well defined for all inputs
908 // including corner cases like:
909 // - if Lo == Hi, return that value
910 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
911 // - if Lo > Hi, return 0 (as if the range wrapped around)
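// A couple of worked examples (illustrative values, not taken from a caller):
//   mostAlignedValueInRange(5, 12) == 8   (the only multiple of 8 in range)
//   mostAlignedValueInRange(33, 62) == 48 (the only multiple of 16 in range)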
912 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
913   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
914 }
915 
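// For DS reads and writes, offsetsCanBeCombined() tries, in order (a summary
// of the code below, with illustrative numbers):
//  1. The ST64 forms: both element offsets are multiples of 64 and fit in
//     8 bits after dividing by 64, e.g. byte offsets 0 and 16384 with EltSize
//     4 become a read2st64/write2st64 pair with offset0:0 offset1:64.
//  2. The plain forms: both element offsets fit in 8 bits, e.g. byte offsets
//     64 and 68 with EltSize 4 become offset0:16 offset1:17.
//  3. Otherwise a new base (CI.BaseOff) is chosen so the rebased element
//     offsets fit one of the two forms above; the merge routines then
//     materialize it with an s_mov_b32 plus a VALU add.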
916 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
917                                                 const GCNSubtarget &STI,
918                                                 CombineInfo &Paired,
919                                                 bool Modify) {
920   assert(CI.InstClass != MIMG);
921 
922   // XXX - Would the same offset be OK? Is there any reason this would happen or
923   // be useful?
924   if (CI.Offset == Paired.Offset)
925     return false;
926 
927   // This won't be valid if the offset isn't aligned.
928   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
929     return false;
930 
931   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
932 
933     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
934         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
935     if (!Info0)
936       return false;
937     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
938         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
939     if (!Info1)
940       return false;
941 
942     if (Info0->BitsPerComp != Info1->BitsPerComp ||
943         Info0->NumFormat != Info1->NumFormat)
944       return false;
945 
946     // TODO: Should be possible to support more formats, but if format loads
947     // are not dword-aligned, the merged load might not be valid.
948     if (Info0->BitsPerComp != 32)
949       return false;
950 
951     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
952       return false;
953   }
954 
955   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
956   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
957   CI.UseST64 = false;
958   CI.BaseOff = 0;
959 
960   // Handle all non-DS instructions.
961   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
962     if (EltOffset0 + CI.Width != EltOffset1 &&
963             EltOffset1 + Paired.Width != EltOffset0)
964       return false;
965     if (CI.CPol != Paired.CPol)
966       return false;
967     return true;
968   }
969 
970   // If the offset in elements doesn't fit in 8 bits, we might be able to use
971   // the stride 64 versions.
972   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
973       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
974     if (Modify) {
975       CI.Offset = EltOffset0 / 64;
976       Paired.Offset = EltOffset1 / 64;
977       CI.UseST64 = true;
978     }
979     return true;
980   }
981 
982   // Check if the new offsets fit in the reduced 8-bit range.
983   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
984     if (Modify) {
985       CI.Offset = EltOffset0;
986       Paired.Offset = EltOffset1;
987     }
988     return true;
989   }
990 
991   // Try to shift base address to decrease offsets.
992   uint32_t Min = std::min(EltOffset0, EltOffset1);
993   uint32_t Max = std::max(EltOffset0, EltOffset1);
994 
995   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
996   if (((Max - Min) & ~Mask) == 0) {
997     if (Modify) {
998       // From the range of values we could use for BaseOff, choose the one that
999       // is aligned to the highest power of two, to maximise the chance that
1000       // the same offset can be reused for other load/store pairs.
1001       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1002       // Copy the low bits of the offsets, so that when we adjust them by
1003       // subtracting BaseOff they will be multiples of 64.
1004       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1005       CI.BaseOff = BaseOff * CI.EltSize;
1006       CI.Offset = (EltOffset0 - BaseOff) / 64;
1007       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1008       CI.UseST64 = true;
1009     }
1010     return true;
1011   }
1012 
1013   if (isUInt<8>(Max - Min)) {
1014     if (Modify) {
1015       // From the range of values we could use for BaseOff, choose the one that
1016       // is aligned to the highest power of two, to maximise the chance that
1017       // the same offset can be reused for other load/store pairs.
1018       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1019       CI.BaseOff = BaseOff * CI.EltSize;
1020       CI.Offset = EltOffset0 - BaseOff;
1021       Paired.Offset = EltOffset1 - BaseOff;
1022     }
1023     return true;
1024   }
1025 
1026   return false;
1027 }
1028 
1029 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1030                                      const CombineInfo &CI,
1031                                      const CombineInfo &Paired) {
1032   const unsigned Width = (CI.Width + Paired.Width);
1033   switch (CI.InstClass) {
1034   default:
1035     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1036   case S_BUFFER_LOAD_IMM:
1037   case S_BUFFER_LOAD_SGPR_IMM:
1038   case S_LOAD_IMM:
1039     switch (Width) {
1040     default:
1041       return false;
1042     case 2:
1043     case 4:
1044     case 8:
1045       return true;
1046     }
1047   }
1048 }
1049 
1050 const TargetRegisterClass *
1051 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1052   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1053     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1054   }
1055   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1056     return TRI->getRegClassForReg(*MRI, Src->getReg());
1057   }
1058   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1059     return TRI->getRegClassForReg(*MRI, Src->getReg());
1060   }
1061   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1062     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1063   }
1064   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1065     return TRI->getRegClassForReg(*MRI, Src->getReg());
1066   }
1067   return nullptr;
1068 }
1069 
1070 /// This function assumes that CI comes before Paired in a basic block. Return
1071 /// an insertion point for the merged instruction or nullptr on failure.
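/// For loads, Paired is hoisted up and the merged instruction is inserted at
/// CI; for stores, CI is sunk down and it is inserted at Paired. Either way,
/// every instruction between the two must be safe to swap with the one being
/// moved.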
1072 SILoadStoreOptimizer::CombineInfo *
1073 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1074                                            CombineInfo &Paired) {
1075   // If another instruction has already been merged into CI, it may now be a
1076   // type that we can't do any further merging into.
1077   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1078     return nullptr;
1079   assert(CI.InstClass == Paired.InstClass);
1080 
1081   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1082       getInstSubclass(Paired.I->getOpcode(), *TII))
1083     return nullptr;
1084 
1085   // Check both offsets (or masks for MIMG) can be combined and fit in the
1086   // reduced range.
1087   if (CI.InstClass == MIMG) {
1088     if (!dmasksCanBeCombined(CI, *TII, Paired))
1089       return nullptr;
1090   } else {
1091     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1092       return nullptr;
1093   }
1094 
1095   DenseSet<Register> RegDefs;
1096   DenseSet<Register> RegUses;
1097   CombineInfo *Where;
1098   if (CI.I->mayLoad()) {
1099     // Try to hoist Paired up to CI.
1100     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1101     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1102       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1103         return nullptr;
1104     }
1105     Where = &CI;
1106   } else {
1107     // Try to sink CI down to Paired.
1108     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1109     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1110       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1111         return nullptr;
1112     }
1113     Where = &Paired;
1114   }
1115 
1116   // Call offsetsCanBeCombined with modify = true so that the offsets are
1117   // correct for the new instruction.  This should return true, because
1118   // this function should only be called on CombineInfo objects that
1119   // have already been confirmed to be mergeable.
1120   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1121     offsetsCanBeCombined(CI, *STM, Paired, true);
1122   return Where;
1123 }
1124 
1125 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1126   if (STM->ldsRequiresM0Init())
1127     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1128   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1129 }
1130 
1131 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1132   if (STM->ldsRequiresM0Init())
1133     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1134 
1135   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1136                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1137 }
1138 
1139 MachineBasicBlock::iterator
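// For example (illustrative, with EltSize == 4):
//   ds_read_b32 %v0, %addr offset:64
//   ds_read_b32 %v1, %addr offset:68
// is rewritten to
//   %dst:vreg_64 = ds_read2_b32 %addr offset0:16 offset1:17
//   %v0 = COPY %dst.sub0
//   %v1 = COPY %dst.sub1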
1140 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1141                                      MachineBasicBlock::iterator InsertBefore) {
1142   MachineBasicBlock *MBB = CI.I->getParent();
1143 
1144   // Be careful, since the addresses could be subregisters themselves in weird
1145   // cases, like vectors of pointers.
1146   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1147 
1148   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1149   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1150 
1151   unsigned NewOffset0 = CI.Offset;
1152   unsigned NewOffset1 = Paired.Offset;
1153   unsigned Opc =
1154       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1155 
1156   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1157   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1158 
1159   if (NewOffset0 > NewOffset1) {
1160     // Canonicalize the merged instruction so the smaller offset comes first.
1161     std::swap(NewOffset0, NewOffset1);
1162     std::swap(SubRegIdx0, SubRegIdx1);
1163   }
1164 
1165   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1166          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1167 
1168   const MCInstrDesc &Read2Desc = TII->get(Opc);
1169 
1170   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1171   Register DestReg = MRI->createVirtualRegister(SuperRC);
1172 
1173   DebugLoc DL = CI.I->getDebugLoc();
1174 
1175   Register BaseReg = AddrReg->getReg();
1176   unsigned BaseSubReg = AddrReg->getSubReg();
1177   unsigned BaseRegFlags = 0;
1178   if (CI.BaseOff) {
1179     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1180     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1181         .addImm(CI.BaseOff);
1182 
1183     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1184     BaseRegFlags = RegState::Kill;
1185 
1186     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1187         .addReg(ImmReg)
1188         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1189         .addImm(0); // clamp bit
1190     BaseSubReg = 0;
1191   }
1192 
1193   MachineInstrBuilder Read2 =
1194       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1195           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1196           .addImm(NewOffset0)                        // offset0
1197           .addImm(NewOffset1)                        // offset1
1198           .addImm(0)                                 // gds
1199           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1200 
1201   (void)Read2;
1202 
1203   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1204 
1205   // Copy to the old destination registers.
1206   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1207       .add(*Dest0) // Copy to same destination including flags and sub reg.
1208       .addReg(DestReg, 0, SubRegIdx0);
1209   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1210       .add(*Dest1)
1211       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1212 
1213   CI.I->eraseFromParent();
1214   Paired.I->eraseFromParent();
1215 
1216   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1217   return Read2;
1218 }
1219 
1220 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1221   if (STM->ldsRequiresM0Init())
1222     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1223   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1224                         : AMDGPU::DS_WRITE2_B64_gfx9;
1225 }
1226 
1227 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1228   if (STM->ldsRequiresM0Init())
1229     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1230                           : AMDGPU::DS_WRITE2ST64_B64;
1231 
1232   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1233                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1234 }
1235 
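// For example (illustrative, with EltSize == 4):
//   ds_write_b32 %addr, %d0 offset:8
//   ds_write_b32 %addr, %d1 offset:12
// is rewritten to a single
//   ds_write2_b32 %addr, %d0, %d1 offset0:2 offset1:3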
1236 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1237     CombineInfo &CI, CombineInfo &Paired,
1238     MachineBasicBlock::iterator InsertBefore) {
1239   MachineBasicBlock *MBB = CI.I->getParent();
1240 
1241   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
1242   // we preserve the subregister index and any register flags set on them.
1243   const MachineOperand *AddrReg =
1244       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1245   const MachineOperand *Data0 =
1246       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1247   const MachineOperand *Data1 =
1248       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1249 
1250   unsigned NewOffset0 = CI.Offset;
1251   unsigned NewOffset1 = Paired.Offset;
1252   unsigned Opc =
1253       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1254 
1255   if (NewOffset0 > NewOffset1) {
1256     // Canonicalize the merged instruction so the smaller offset comes first.
1257     std::swap(NewOffset0, NewOffset1);
1258     std::swap(Data0, Data1);
1259   }
1260 
1261   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1262          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1263 
1264   const MCInstrDesc &Write2Desc = TII->get(Opc);
1265   DebugLoc DL = CI.I->getDebugLoc();
1266 
1267   Register BaseReg = AddrReg->getReg();
1268   unsigned BaseSubReg = AddrReg->getSubReg();
1269   unsigned BaseRegFlags = 0;
1270   if (CI.BaseOff) {
1271     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1272     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1273         .addImm(CI.BaseOff);
1274 
1275     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1276     BaseRegFlags = RegState::Kill;
1277 
1278     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1279         .addReg(ImmReg)
1280         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1281         .addImm(0); // clamp bit
1282     BaseSubReg = 0;
1283   }
1284 
1285   MachineInstrBuilder Write2 =
1286       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1287           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1288           .add(*Data0)                               // data0
1289           .add(*Data1)                               // data1
1290           .addImm(NewOffset0)                        // offset0
1291           .addImm(NewOffset1)                        // offset1
1292           .addImm(0)                                 // gds
1293           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1294 
1295   CI.I->eraseFromParent();
1296   Paired.I->eraseFromParent();
1297 
1298   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1299   return Write2;
1300 }
1301 
1302 MachineBasicBlock::iterator
1303 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1304                                      MachineBasicBlock::iterator InsertBefore) {
1305   MachineBasicBlock *MBB = CI.I->getParent();
1306   DebugLoc DL = CI.I->getDebugLoc();
1307   const unsigned Opcode = getNewOpcode(CI, Paired);
1308 
1309   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1310 
1311   Register DestReg = MRI->createVirtualRegister(SuperRC);
1312   unsigned MergedDMask = CI.DMask | Paired.DMask;
1313   unsigned DMaskIdx =
1314       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1315 
1316   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1317   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1318     if (I == DMaskIdx)
1319       MIB.addImm(MergedDMask);
1320     else
1321       MIB.add((*CI.I).getOperand(I));
1322   }
1323 
1324   // It shouldn't be possible to get this far if the two instructions
1325   // don't have a single memoperand, because MachineInstr::mayAlias()
1326   // will return true if this is the case.
1327   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1328 
1329   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1330 
1331   unsigned SubRegIdx0, SubRegIdx1;
1332   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1333 
1334   // Copy to the old destination registers.
1335   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1336   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1337   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1338 
1339   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1340       .add(*Dest0) // Copy to same destination including flags and sub reg.
1341       .addReg(DestReg, 0, SubRegIdx0);
1342   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1343       .add(*Dest1)
1344       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1345 
1346   CI.I->eraseFromParent();
1347   Paired.I->eraseFromParent();
1348   return New;
1349 }
1350 
1351 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1352     CombineInfo &CI, CombineInfo &Paired,
1353     MachineBasicBlock::iterator InsertBefore) {
1354   MachineBasicBlock *MBB = CI.I->getParent();
1355   DebugLoc DL = CI.I->getDebugLoc();
1356   const unsigned Opcode = getNewOpcode(CI, Paired);
1357 
1358   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1359 
1360   Register DestReg = MRI->createVirtualRegister(SuperRC);
1361   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1362 
1363   // It shouldn't be possible to get this far if the two instructions
1364   // don't have a single memoperand, because MachineInstr::mayAlias()
1365   // will return true if this is the case.
1366   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1367 
1368   MachineInstrBuilder New =
1369       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1370           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1371   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1372     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1373   New.addImm(MergedOffset);
1374   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1375 
1376   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1377   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1378   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1379 
1380   // Copy to the old destination registers.
1381   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1382   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1383   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1384 
1385   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1386       .add(*Dest0) // Copy to same destination including flags and sub reg.
1387       .addReg(DestReg, 0, SubRegIdx0);
1388   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389       .add(*Dest1)
1390       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1391 
1392   CI.I->eraseFromParent();
1393   Paired.I->eraseFromParent();
1394   return New;
1395 }
1396 
1397 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1398     CombineInfo &CI, CombineInfo &Paired,
1399     MachineBasicBlock::iterator InsertBefore) {
1400   MachineBasicBlock *MBB = CI.I->getParent();
1401   DebugLoc DL = CI.I->getDebugLoc();
1402 
1403   const unsigned Opcode = getNewOpcode(CI, Paired);
1404 
1405   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1406 
1407   // Copy to the new source register.
1408   // Create the destination register for the merged load.
1409   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1410 
1411   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1412 
1413   AddressRegs Regs = getRegs(Opcode, *TII);
1414 
1415   if (Regs.VAddr)
1416     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1417 
1418   // It shouldn't be possible to get this far if the two instructions
1419   // don't have a single memoperand, because MachineInstr::mayAlias()
1420   // will return true if this is the case.
1421   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1422 
1423   MachineInstr *New =
1424     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1425         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1426         .addImm(MergedOffset) // offset
1427         .addImm(CI.CPol)      // cpol
1428         .addImm(0)            // swz
1429         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1430 
1431   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1432   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1433   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1434 
1435   // Copy to the old destination registers.
1436   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1437   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1438   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1439 
1440   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1441       .add(*Dest0) // Copy to same destination including flags and sub reg.
1442       .addReg(DestReg, 0, SubRegIdx0);
1443   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1444       .add(*Dest1)
1445       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1446 
1447   CI.I->eraseFromParent();
1448   Paired.I->eraseFromParent();
1449   return New;
1450 }
1451 
1452 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1453     CombineInfo &CI, CombineInfo &Paired,
1454     MachineBasicBlock::iterator InsertBefore) {
1455   MachineBasicBlock *MBB = CI.I->getParent();
1456   DebugLoc DL = CI.I->getDebugLoc();
1457 
1458   const unsigned Opcode = getNewOpcode(CI, Paired);
1459 
1460   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1461 
1462   // Create the destination register for the merged load.
1463   Register DestReg = MRI->createVirtualRegister(SuperRC);
1464   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1465 
1466   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1467 
1468   AddressRegs Regs = getRegs(Opcode, *TII);
1469 
1470   if (Regs.VAddr)
1471     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1472 
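       // The merged instruction reuses the original buffer format, with the
       // component count raised to cover both accesses.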
1473   unsigned JoinedFormat =
1474       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1475 
1476   // It shouldn't be possible to get this far if the two instructions
1477   // don't have a single memoperand, because MachineInstr::mayAlias()
1478   // will return true if this is the case.
1479   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1480 
1481   MachineInstr *New =
1482       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1483           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1484           .addImm(MergedOffset) // offset
1485           .addImm(JoinedFormat) // format
1486           .addImm(CI.CPol)      // cpol
1487           .addImm(0)            // swz
1488           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1489 
1490   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1491   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1492   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1493 
1494   // Copy to the old destination registers.
1495   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1496   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1497   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1498 
1499   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1500       .add(*Dest0) // Copy to same destination including flags and sub reg.
1501       .addReg(DestReg, 0, SubRegIdx0);
1502   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1503       .add(*Dest1)
1504       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1505 
1506   CI.I->eraseFromParent();
1507   Paired.I->eraseFromParent();
1508   return New;
1509 }
1510 
1511 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1512     CombineInfo &CI, CombineInfo &Paired,
1513     MachineBasicBlock::iterator InsertBefore) {
1514   MachineBasicBlock *MBB = CI.I->getParent();
1515   DebugLoc DL = CI.I->getDebugLoc();
1516 
1517   const unsigned Opcode = getNewOpcode(CI, Paired);
1518 
1519   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1520   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1521   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1522 
1523   // Copy to the new source register.
1524   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1525   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1526 
1527   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1528   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1529 
1530   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1531       .add(*Src0)
1532       .addImm(SubRegIdx0)
1533       .add(*Src1)
1534       .addImm(SubRegIdx1);
1535 
1536   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1537                  .addReg(SrcReg, RegState::Kill);
1538 
1539   AddressRegs Regs = getRegs(Opcode, *TII);
1540 
1541   if (Regs.VAddr)
1542     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1543 
1544   unsigned JoinedFormat =
1545       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1546 
1547   // It shouldn't be possible to get this far if the two instructions
1548   // don't have a single memoperand, because MachineInstr::mayAlias()
1549   // will return true if this is the case.
1550   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1551 
1552   MachineInstr *New =
1553       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1554           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1555           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1556           .addImm(JoinedFormat)                     // format
1557           .addImm(CI.CPol)                          // cpol
1558           .addImm(0)                                // swz
1559           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1560 
1561   CI.I->eraseFromParent();
1562   Paired.I->eraseFromParent();
1563   return New;
1564 }
1565 
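     // Merge two FLAT/GLOBAL loads into one wider load. For example
     // (illustrative only), a pair such as
     //   global_load_dword v0, v[2:3], off
     //   global_load_dword v1, v[2:3], off offset:4
     // becomes
     //   global_load_dwordx2 v[0:1], v[2:3], off
     // with COPYs from the sub-registers of the wide result to the original
     // destinations.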
1566 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1567     CombineInfo &CI, CombineInfo &Paired,
1568     MachineBasicBlock::iterator InsertBefore) {
1569   MachineBasicBlock *MBB = CI.I->getParent();
1570   DebugLoc DL = CI.I->getDebugLoc();
1571 
1572   const unsigned Opcode = getNewOpcode(CI, Paired);
1573 
1574   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1575   Register DestReg = MRI->createVirtualRegister(SuperRC);
1576 
1577   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1578 
1579   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1580     MIB.add(*SAddr);
1581 
1582   MachineInstr *New =
1583     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1584        .addImm(std::min(CI.Offset, Paired.Offset))
1585        .addImm(CI.CPol)
1586        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1587 
1588   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1589   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1590   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1591 
1592   // Copy to the old destination registers.
1593   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1594   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1595   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1596 
1597   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1598       .add(*Dest0) // Copy to same destination including flags and sub reg.
1599       .addReg(DestReg, 0, SubRegIdx0);
1600   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1601       .add(*Dest1)
1602       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1603 
1604   CI.I->eraseFromParent();
1605   Paired.I->eraseFromParent();
1606   return New;
1607 }
1608 
1609 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1610     CombineInfo &CI, CombineInfo &Paired,
1611     MachineBasicBlock::iterator InsertBefore) {
1612   MachineBasicBlock *MBB = CI.I->getParent();
1613   DebugLoc DL = CI.I->getDebugLoc();
1614 
1615   const unsigned Opcode = getNewOpcode(CI, Paired);
1616 
1617   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1618   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1619   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1620 
1621   // Copy to the new source register.
1622   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1623   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1624 
1625   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1626   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1627 
1628   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1629       .add(*Src0)
1630       .addImm(SubRegIdx0)
1631       .add(*Src1)
1632       .addImm(SubRegIdx1);
1633 
1634   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1635                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1636                  .addReg(SrcReg, RegState::Kill);
1637 
1638   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1639     MIB.add(*SAddr);
1640 
1641   MachineInstr *New =
1642     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1643        .addImm(CI.CPol)
1644        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1645 
1646   CI.I->eraseFromParent();
1647   Paired.I->eraseFromParent();
1648   return New;
1649 }
1650 
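     // Select the opcode of the merged access from the combined width in
     // dwords, e.g. two single-dword global loads (Width == 2) merge into
     // AMDGPU::GLOBAL_LOAD_DWORDX2. In the switches below, a width with no
     // suitable wider opcode returns 0.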
1651 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1652                                             const CombineInfo &Paired) {
1653   const unsigned Width = CI.Width + Paired.Width;
1654 
1655   switch (getCommonInstClass(CI, Paired)) {
1656   default:
1657     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1658     // FIXME: Handle d16 correctly
1659     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1660                                   Width);
1661   case TBUFFER_LOAD:
1662   case TBUFFER_STORE:
1663     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1664                                   Width);
1665 
1666   case UNKNOWN:
1667     llvm_unreachable("Unknown instruction class");
1668   case S_BUFFER_LOAD_IMM:
1669     switch (Width) {
1670     default:
1671       return 0;
1672     case 2:
1673       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1674     case 4:
1675       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1676     case 8:
1677       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1678     }
1679   case S_BUFFER_LOAD_SGPR_IMM:
1680     switch (Width) {
1681     default:
1682       return 0;
1683     case 2:
1684       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1685     case 4:
1686       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1687     case 8:
1688       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1689     }
1690   case S_LOAD_IMM:
1691     switch (Width) {
1692     default:
1693       return 0;
1694     case 2:
1695       return AMDGPU::S_LOAD_DWORDX2_IMM;
1696     case 4:
1697       return AMDGPU::S_LOAD_DWORDX4_IMM;
1698     case 8:
1699       return AMDGPU::S_LOAD_DWORDX8_IMM;
1700     }
1701   case GLOBAL_LOAD:
1702     switch (Width) {
1703     default:
1704       return 0;
1705     case 2:
1706       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1707     case 3:
1708       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1709     case 4:
1710       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1711     }
1712   case GLOBAL_LOAD_SADDR:
1713     switch (Width) {
1714     default:
1715       return 0;
1716     case 2:
1717       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1718     case 3:
1719       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1720     case 4:
1721       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1722     }
1723   case GLOBAL_STORE:
1724     switch (Width) {
1725     default:
1726       return 0;
1727     case 2:
1728       return AMDGPU::GLOBAL_STORE_DWORDX2;
1729     case 3:
1730       return AMDGPU::GLOBAL_STORE_DWORDX3;
1731     case 4:
1732       return AMDGPU::GLOBAL_STORE_DWORDX4;
1733     }
1734   case GLOBAL_STORE_SADDR:
1735     switch (Width) {
1736     default:
1737       return 0;
1738     case 2:
1739       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1740     case 3:
1741       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1742     case 4:
1743       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1744     }
1745   case FLAT_LOAD:
1746     switch (Width) {
1747     default:
1748       return 0;
1749     case 2:
1750       return AMDGPU::FLAT_LOAD_DWORDX2;
1751     case 3:
1752       return AMDGPU::FLAT_LOAD_DWORDX3;
1753     case 4:
1754       return AMDGPU::FLAT_LOAD_DWORDX4;
1755     }
1756   case FLAT_STORE:
1757     switch (Width) {
1758     default:
1759       return 0;
1760     case 2:
1761       return AMDGPU::FLAT_STORE_DWORDX2;
1762     case 3:
1763       return AMDGPU::FLAT_STORE_DWORDX3;
1764     case 4:
1765       return AMDGPU::FLAT_STORE_DWORDX4;
1766     }
1767   case MIMG:
1768     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1769            "No overlaps");
1770     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1771   }
1772 }
1773 
1774 std::pair<unsigned, unsigned>
1775 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1776                                     const CombineInfo &Paired) {
1777   assert((CI.InstClass != MIMG ||
1778           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1779            CI.Width + Paired.Width)) &&
1780          "No overlaps");
1781 
1782   unsigned Idx0;
1783   unsigned Idx1;
1784 
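       // Idxs[Row][Col] is the sub-register index covering Col + 1 dwords
       // starting at dword Row of the merged register, e.g. Idxs[1][1] is
       // sub1_sub2.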
1785   static const unsigned Idxs[5][4] = {
1786       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1787       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1788       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1789       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1790       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1791   };
1792 
1793   assert(CI.Width >= 1 && CI.Width <= 4);
1794   assert(Paired.Width >= 1 && Paired.Width <= 4);
1795 
1796   if (Paired < CI) {
1797     Idx1 = Idxs[0][Paired.Width - 1];
1798     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1799   } else {
1800     Idx0 = Idxs[0][CI.Width - 1];
1801     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1802   }
1803 
1804   return std::pair(Idx0, Idx1);
1805 }
1806 
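     // Pick the register class for the merged result: a scalar class for the
     // SMEM load classes (e.g. SReg_64_XEXEC for a 2-dword merge), otherwise
     // an AGPR or VGPR class sized at 32 bits per merged dword.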
1807 const TargetRegisterClass *
1808 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1809                                              const CombineInfo &Paired) {
1810   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1811       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1812     switch (CI.Width + Paired.Width) {
1813     default:
1814       return nullptr;
1815     case 2:
1816       return &AMDGPU::SReg_64_XEXECRegClass;
1817     case 4:
1818       return &AMDGPU::SGPR_128RegClass;
1819     case 8:
1820       return &AMDGPU::SGPR_256RegClass;
1821     case 16:
1822       return &AMDGPU::SGPR_512RegClass;
1823     }
1824   }
1825 
1826   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1827   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1828              ? TRI->getAGPRClassForBitWidth(BitWidth)
1829              : TRI->getVGPRClassForBitWidth(BitWidth);
1830 }
1831 
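     // Merge two MUBUF stores: combine the two vdata sources into one wider
     // register with a REG_SEQUENCE and emit a single store at the lower
     // offset.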
1832 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1833     CombineInfo &CI, CombineInfo &Paired,
1834     MachineBasicBlock::iterator InsertBefore) {
1835   MachineBasicBlock *MBB = CI.I->getParent();
1836   DebugLoc DL = CI.I->getDebugLoc();
1837 
1838   const unsigned Opcode = getNewOpcode(CI, Paired);
1839 
1840   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1841   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1842   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1843 
1844   // Copy to the new source register.
1845   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1846   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1847 
1848   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1849   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1850 
1851   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1852       .add(*Src0)
1853       .addImm(SubRegIdx0)
1854       .add(*Src1)
1855       .addImm(SubRegIdx1);
1856 
1857   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1858                  .addReg(SrcReg, RegState::Kill);
1859 
1860   AddressRegs Regs = getRegs(Opcode, *TII);
1861 
1862   if (Regs.VAddr)
1863     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1864 
1865 
1866   // It shouldn't be possible to get this far if the two instructions
1867   // don't have a single memoperand, because MachineInstr::mayAlias()
1868   // will return true if this is the case.
1869   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1870 
1871   MachineInstr *New =
1872     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1873         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1874         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1875         .addImm(CI.CPol)      // cpol
1876         .addImm(0)            // swz
1877         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1878 
1879   CI.I->eraseFromParent();
1880   Paired.I->eraseFromParent();
1881   return New;
1882 }
1883 
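     // Return Val as an immediate operand if it is an inline constant,
     // otherwise materialize it into a fresh SGPR with S_MOV_B32 and return
     // that register.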
1884 MachineOperand
1885 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1886   APInt V(32, Val, true);
1887   if (TII->isInlineConstant(V))
1888     return MachineOperand::CreateImm(Val);
1889 
1890   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1891   MachineInstr *Mov =
1892   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1893           TII->get(AMDGPU::S_MOV_B32), Reg)
1894     .addImm(Val);
1895   (void)Mov;
1896   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1897   return MachineOperand::CreateReg(Reg, false);
1898 }
1899 
1900 // Compute base address using Addr and return the final register.
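     // The emitted sequence is roughly:
     //   %lo, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, lo32(Addr.Offset)
     //   %hi = V_ADDC_U32_e64 Addr.Base.HiReg, hi32(Addr.Offset), %carry
     //   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
     // where lo32/hi32 are either inline immediates or S_MOV_B32 results
     // (see createRegOrImm).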
1901 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1902                                            const MemAddress &Addr) const {
1903   MachineBasicBlock *MBB = MI.getParent();
1904   MachineBasicBlock::iterator MBBI = MI.getIterator();
1905   DebugLoc DL = MI.getDebugLoc();
1906 
1907   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1908           Addr.Base.LoSubReg) &&
1909          "Expected 32-bit Base-Register-Low!!");
1910 
1911   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1912           Addr.Base.HiSubReg) &&
1913          "Expected 32-bit Base-Register-Hi!!");
1914 
1915   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1916   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1917   MachineOperand OffsetHi =
1918     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1919 
1920   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1921   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1922   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1923 
1924   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1925   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1926   MachineInstr *LoHalf =
1927     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1928       .addReg(CarryReg, RegState::Define)
1929       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1930       .add(OffsetLo)
1931       .addImm(0); // clamp bit
1932   (void)LoHalf;
1933   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1934 
1935   MachineInstr *HiHalf =
1936   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1937     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1938     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1939     .add(OffsetHi)
1940     .addReg(CarryReg, RegState::Kill)
1941     .addImm(0); // clamp bit
1942   (void)HiHalf;
1943   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1944 
1945   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1946   MachineInstr *FullBase =
1947     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1948       .addReg(DestSub0)
1949       .addImm(AMDGPU::sub0)
1950       .addReg(DestSub1)
1951       .addImm(AMDGPU::sub1);
1952   (void)FullBase;
1953   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1954 
1955   return FullDestReg;
1956 }
1957 
1958 // Update base and offset with the NewBase and NewOffset in MI.
1959 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1960                                                Register NewBase,
1961                                                int32_t NewOffset) const {
1962   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1963   Base->setReg(NewBase);
1964   Base->setIsKill(false);
1965   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1966 }
1967 
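     // Return the constant an operand provides: either its immediate value
     // directly, or the immediate of its unique S_MOV_B32 def, e.g. a use of
     //   %off:sgpr_32 = S_MOV_B32 8000
     // yields 8000. Returns std::nullopt otherwise.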
1968 std::optional<int32_t>
1969 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1970   if (Op.isImm())
1971     return Op.getImm();
1972 
1973   if (!Op.isReg())
1974     return std::nullopt;
1975 
1976   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1977   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1978       !Def->getOperand(1).isImm())
1979     return std::nullopt;
1980 
1981   return Def->getOperand(1).getImm();
1982 }
1983 
1984 // Analyze Base and extract:
1985 //  - 32bit base registers, subregisters
1986 //  - 64bit constant offset
1987 // Expecting base computation as:
1988 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1989 //   %LO:vgpr_32, %c:sreg_64_xexec =
1990 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
1991 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1992 //   %Base:vreg_64 =
1993 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1994 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1995                                                       MemAddress &Addr) const {
1996   if (!Base.isReg())
1997     return;
1998 
1999   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2000   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2001       || Def->getNumOperands() != 5)
2002     return;
2003 
2004   MachineOperand BaseLo = Def->getOperand(1);
2005   MachineOperand BaseHi = Def->getOperand(3);
2006   if (!BaseLo.isReg() || !BaseHi.isReg())
2007     return;
2008 
2009   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2010   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2011 
2012   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2013       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2014     return;
2015 
2016   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2017   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2018 
2019   auto Offset0P = extractConstOffset(*Src0);
2020   if (Offset0P)
2021     BaseLo = *Src1;
2022   else {
2023     if (!(Offset0P = extractConstOffset(*Src1)))
2024       return;
2025     BaseLo = *Src0;
2026   }
2027 
2028   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2029   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2030 
2031   if (Src0->isImm())
2032     std::swap(Src0, Src1);
2033 
2034   if (!Src1->isImm())
2035     return;
2036 
2037   uint64_t Offset1 = Src1->getImm();
2038   BaseHi = *Src0;
2039 
2040   Addr.Base.LoReg = BaseLo.getReg();
2041   Addr.Base.HiReg = BaseHi.getReg();
2042   Addr.Base.LoSubReg = BaseLo.getSubReg();
2043   Addr.Base.HiSubReg = BaseHi.getSubReg();
2044   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2045 }
2046 
2047 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2048     MachineInstr &MI,
2049     MemInfoMap &Visited,
2050     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2051 
2052   if (!(MI.mayLoad() ^ MI.mayStore()))
2053     return false;
2054 
2055   // TODO: Support flat and scratch.
2056   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2057     return false;
2058 
2059   if (MI.mayLoad() &&
2060       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2061     return false;
2062 
2063   if (AnchorList.count(&MI))
2064     return false;
2065 
2066   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2067 
2068   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2069     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2070     return false;
2071   }
2072 
2073   // Step1: Find the base-registers and a 64bit constant offset.
2074   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2075   MemAddress MAddr;
2076   if (!Visited.contains(&MI)) {
2077     processBaseWithConstOffset(Base, MAddr);
2078     Visited[&MI] = MAddr;
2079   } else
2080     MAddr = Visited[&MI];
2081 
2082   if (MAddr.Offset == 0) {
2083     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2084                          " constant offsets that can be promoted.\n";);
2085     return false;
2086   }
2087 
2088   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2089              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2090 
2091   // Step2: Traverse through MI's basic block and find an anchor (that has the
2092   // same base-registers) with the highest 13bit distance from MI's offset.
2093   // E.g. (64bit loads)
2094   // bb:
2095   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2096   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2097   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2098   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2099   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2100   //
2101   // Starting from the first load, the optimization will try to find a new base
2102   // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
2103   // have a 13bit distance from &a + 4096. The heuristic picks &a + 8192 as the
2104   // new base (anchor) because the larger distance can presumably accommodate
2105   // more intermediate bases.
2106   //
2107   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2108   // (&a + 8192) for load1, load2, load4.
2109   //   addr = &a + 8192
2110   //   load1 = load(addr,       -4096)
2111   //   load2 = load(addr,       -2048)
2112   //   load3 = load(addr,       0)
2113   //   load4 = load(addr,       2048)
2114   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2115   //
2116   MachineInstr *AnchorInst = nullptr;
2117   MemAddress AnchorAddr;
2118   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2119   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2120 
2121   MachineBasicBlock *MBB = MI.getParent();
2122   MachineBasicBlock::iterator E = MBB->end();
2123   MachineBasicBlock::iterator MBBI = MI.getIterator();
2124   ++MBBI;
2125   const SITargetLowering *TLI =
2126     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2127 
2128   for ( ; MBBI != E; ++MBBI) {
2129     MachineInstr &MINext = *MBBI;
2130     // TODO: Support finding an anchor (with same base) from store addresses or
2131     // any other load addresses where the opcodes are different.
2132     if (MINext.getOpcode() != MI.getOpcode() ||
2133         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2134       continue;
2135 
2136     const MachineOperand &BaseNext =
2137       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2138     MemAddress MAddrNext;
2139     if (!Visited.contains(&MINext)) {
2140       processBaseWithConstOffset(BaseNext, MAddrNext);
2141       Visited[&MINext] = MAddrNext;
2142     } else
2143       MAddrNext = Visited[&MINext];
2144 
2145     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2146         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2147         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2148         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2149       continue;
2150 
2151     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2152 
2153     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2154     TargetLoweringBase::AddrMode AM;
2155     AM.HasBaseReg = true;
2156     AM.BaseOffs = Dist;
2157     if (TLI->isLegalGlobalAddressingMode(AM) &&
2158         (uint32_t)std::abs(Dist) > MaxDist) {
2159       MaxDist = std::abs(Dist);
2160 
2161       AnchorAddr = MAddrNext;
2162       AnchorInst = &MINext;
2163     }
2164   }
2165 
2166   if (AnchorInst) {
2167     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2168                AnchorInst->dump());
2169     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2170                <<  AnchorAddr.Offset << "\n\n");
2171 
2172     // Instead of moving up, just re-compute anchor-instruction's base address.
2173     Register Base = computeBase(MI, AnchorAddr);
2174 
2175     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2176     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2177 
2178     for (auto P : InstsWCommonBase) {
2179       TargetLoweringBase::AddrMode AM;
2180       AM.HasBaseReg = true;
2181       AM.BaseOffs = P.second - AnchorAddr.Offset;
2182 
2183       if (TLI->isLegalGlobalAddressingMode(AM)) {
2184         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2185                    dbgs() << ")"; P.first->dump());
2186         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2187         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2188       }
2189     }
2190     AnchorList.insert(AnchorInst);
2191     return true;
2192   }
2193 
2194   return false;
2195 }
2196 
2197 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2198                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2199   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2200     if (AddrList.front().InstClass == CI.InstClass &&
2201         AddrList.front().IsAGPR == CI.IsAGPR &&
2202         AddrList.front().hasSameBaseAddress(CI)) {
2203       AddrList.emplace_back(CI);
2204       return;
2205     }
2206   }
2207 
2208   // Base address not found, so add a new list.
2209   MergeableInsts.emplace_back(1, CI);
2210 }
2211 
2212 std::pair<MachineBasicBlock::iterator, bool>
2213 SILoadStoreOptimizer::collectMergeableInsts(
2214     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2215     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2216     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2217   bool Modified = false;
2218 
2219   // Sort potential mergeable instructions into lists.  One list per base address.
2220   unsigned Order = 0;
2221   MachineBasicBlock::iterator BlockI = Begin;
2222   for (; BlockI != End; ++BlockI) {
2223     MachineInstr &MI = *BlockI;
2224 
2225     // We run this before checking if an address is mergeable, because it can produce
2226     // better code even if the instructions aren't mergeable.
2227     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2228       Modified = true;
2229 
2230     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2231     // barriers. We can look after this barrier for separate merges.
2232     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2233       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2234 
2235       // Search will resume after this instruction in a separate merge list.
2236       ++BlockI;
2237       break;
2238     }
2239 
2240     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2241     if (InstClass == UNKNOWN)
2242       continue;
2243 
2244     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2245     int Swizzled =
2246         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2247     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2248       continue;
2249 
2250     CombineInfo CI;
2251     CI.setMI(MI, *this);
2252     CI.Order = Order++;
2253 
2254     if (!CI.hasMergeableAddress(*MRI))
2255       continue;
2256 
2257     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2258       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2259       //        operands. However, we report that ds_write2 shall have
2260       //        only VGPR data so that machine copy propagation does not
2261       //        create an illegal instruction with VGPR and AGPR sources.
2262       //        Consequently, if we created such an instruction the
2263       //        verifier would complain.
2264       continue;
2265     }
2266 
2267     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2268 
2269     addInstToMergeableList(CI, MergeableInsts);
2270   }
2271 
2272   // At this point we have lists of Mergeable instructions.
2273   //
2274   // Part 2: Sort lists by offset and then for each CombineInfo object in the
2275   // list try to find an instruction that can be merged with I.  If an instruction
2276   // is found, it is stored in the Paired field.  If no instructions are found, then
2277   // the CombineInfo object is deleted from the list.
2278 
2279   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2280                                                    E = MergeableInsts.end(); I != E;) {
2281 
2282     std::list<CombineInfo> &MergeList = *I;
2283     if (MergeList.size() <= 1) {
2284       // This means we have found only one instruction with a given address
2285       // that can be merged, and we need at least 2 instructions to do a merge,
2286       // so this list can be discarded.
2287       I = MergeableInsts.erase(I);
2288       continue;
2289     }
2290 
2291     // Sort the lists by offsets, this way mergeable instructions will be
2292     // adjacent to each other in the list, which will make it easier to find
2293     // matches.
2294     MergeList.sort(
2295         [] (const CombineInfo &A, const CombineInfo &B) {
2296           return A.Offset < B.Offset;
2297         });
2298     ++I;
2299   }
2300 
2301   return std::pair(BlockI, Modified);
2302 }
2303 
2304 // Scan through looking for adjacent LDS operations with constant offsets from
2305 // the same base register. We rely on the scheduler to do the hard work of
2306 // clustering nearby loads, and assume these are all adjacent.
2307 bool SILoadStoreOptimizer::optimizeBlock(
2308                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2309   bool Modified = false;
2310 
2311   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2312                                                    E = MergeableInsts.end(); I != E;) {
2313     std::list<CombineInfo> &MergeList = *I;
2314 
2315     bool OptimizeListAgain = false;
2316     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2317       // We weren't able to make any changes, so delete the list so we don't
2318       // process the same instructions the next time we try to optimize this
2319       // block.
2320       I = MergeableInsts.erase(I);
2321       continue;
2322     }
2323 
2324     Modified = true;
2325 
2326     // We made changes, but also determined that there were no more optimization
2327     // opportunities, so we don't need to reprocess the list
2328     if (!OptimizeListAgain) {
2329       I = MergeableInsts.erase(I);
2330       continue;
2331     }
2332     OptimizeAgain = true;
2333   }
2334   return Modified;
2335 }
2336 
2337 bool
2338 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2339                                           std::list<CombineInfo> &MergeList,
2340                                           bool &OptimizeListAgain) {
2341   if (MergeList.empty())
2342     return false;
2343 
2344   bool Modified = false;
2345 
2346   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2347        Next = std::next(I)) {
2348 
2349     auto First = I;
2350     auto Second = Next;
2351 
2352     if ((*First).Order > (*Second).Order)
2353       std::swap(First, Second);
2354     CombineInfo &CI = *First;
2355     CombineInfo &Paired = *Second;
2356 
2357     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2358     if (!Where) {
2359       ++I;
2360       continue;
2361     }
2362 
2363     Modified = true;
2364 
2365     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2366 
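         // Perform the merge. A merged access can sometimes be merged again on
         // a later pass over this list, so OptimizeListAgain is set while the
         // combined width is still below the widest opcode getNewOpcode can
         // produce (8 dwords for the SMEM classes, 4 for the others).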
2367     MachineBasicBlock::iterator NewMI;
2368     switch (CI.InstClass) {
2369     default:
2370       llvm_unreachable("unknown InstClass");
2371       break;
2372     case DS_READ:
2373       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2374       break;
2375     case DS_WRITE:
2376       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2377       break;
2378     case S_BUFFER_LOAD_IMM:
2379     case S_BUFFER_LOAD_SGPR_IMM:
2380     case S_LOAD_IMM:
2381       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2382       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2383       break;
2384     case BUFFER_LOAD:
2385       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2386       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2387       break;
2388     case BUFFER_STORE:
2389       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2390       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2391       break;
2392     case MIMG:
2393       NewMI = mergeImagePair(CI, Paired, Where->I);
2394       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2395       break;
2396     case TBUFFER_LOAD:
2397       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2398       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2399       break;
2400     case TBUFFER_STORE:
2401       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2402       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2403       break;
2404     case FLAT_LOAD:
2405     case GLOBAL_LOAD:
2406     case GLOBAL_LOAD_SADDR:
2407       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2408       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2409       break;
2410     case FLAT_STORE:
2411     case GLOBAL_STORE:
2412     case GLOBAL_STORE_SADDR:
2413       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2414       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2415       break;
2416     }
2417     CI.setMI(NewMI, *this);
2418     CI.Order = Where->Order;
2419     if (I == Second)
2420       I = Next;
2421 
2422     MergeList.erase(Second);
2423   }
2424 
2425   return Modified;
2426 }
2427 
2428 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2429   if (skipFunction(MF.getFunction()))
2430     return false;
2431 
2432   STM = &MF.getSubtarget<GCNSubtarget>();
2433   if (!STM->loadStoreOptEnabled())
2434     return false;
2435 
2436   TII = STM->getInstrInfo();
2437   TRI = &TII->getRegisterInfo();
2438 
2439   MRI = &MF.getRegInfo();
2440   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2441 
2442   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2443 
2444   bool Modified = false;
2445 
2446   // Contains the list of instructions for which constant offsets are being
2447   // promoted to the IMM. This is tracked for an entire block at a time.
2448   SmallPtrSet<MachineInstr *, 4> AnchorList;
2449   MemInfoMap Visited;
2450 
2451   for (MachineBasicBlock &MBB : MF) {
2452     MachineBasicBlock::iterator SectionEnd;
2453     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2454          I = SectionEnd) {
2455       bool CollectModified;
2456       std::list<std::list<CombineInfo>> MergeableInsts;
2457 
2458       // First pass: Collect a list of all instructions we know how to merge in a
2459       // subset of the block.
2460       std::tie(SectionEnd, CollectModified) =
2461           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2462 
2463       Modified |= CollectModified;
2464 
2465       do {
2466         OptimizeAgain = false;
2467         Modified |= optimizeBlock(MergeableInsts);
2468       } while (OptimizeAgain);
2469     }
2470 
2471     Visited.clear();
2472     AnchorList.clear();
2473   }
2474 
2475   return Modified;
2476 }
2477