//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);
  // Merge the two narrow-store instructions indicated into a single wider
  // store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return true;
  }
}

// Scaling factor for unscaled load or store.
static int getMemScale(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  case AArch64::STGPi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version.  If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGOffset:
    return AArch64::STGPreIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPreIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGOffset:
    return AArch64::STGPostIndex;
  case AArch64::STZGOffset:
    return AArch64::STZGPostIndex;
  case AArch64::ST2GOffset:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2GOffset:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}

static bool isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}

// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  bool IsPaired = isPairedLdSt(MI);
  bool IsTagStore = isTagStore(MI);
  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  // as in the "unsigned offset" variant.
  // All other pre/post indexed ldst instructions are unscaled.
  Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;

  if (IsPaired) {
    MinOffset = -64;
    MaxOffset = 63;
  } else {
    MinOffset = -256;
    MaxOffset = 255;
  }
}
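
// For illustration, with the values coming from the code above: a paired
// instruction such as LDPXi has Scale = 8 and an immediate range of
// [-64, 63], i.e. byte offsets from -512 to 504 in steps of 8, while a
// non-paired instruction such as LDRXui is unscaled in its pre/post-indexed
// form, with Scale = 1 and byte offsets from -256 to 255.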

static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI.getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI.getOperand(Idx);
}

static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = getMemScale(LoadInst);
  int StoreSize = getMemScale(StoreInst);
  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                             ? getLdStOffsetOp(StoreInst).getImm()
                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
                             ? getLdStOffsetOp(LoadInst).getImm()
                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
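
// An illustrative example (registers and offsets chosen arbitrarily):
//   str  x0, [x1, #8]   ; stores bytes [8, 16)
//   ldrh w2, [x1, #12]  ; loads bytes [12, 14)
// The load's byte range lies entirely within the store's, so this function
// returns true and the load becomes a candidate for promotion.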

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}
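
// E.g. "str wzr, [x0, #4]" and "sturh wzr, [x0, #-2]" both qualify (register
// and offsets are illustrative); a store of any register other than WZR does
// not.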

static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    ++NextI;

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsScaled ? 1 : getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI;
  if (getLdStOffsetOp(*I).getImm() ==
      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
    RtMI = &*MergeMI;
  else
    RtMI = &*I;

  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Change the scaled offset from small to large type.
  if (IsScaled) {
    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
    OffsetImm /= 2;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
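
// An illustrative example (registers chosen arbitrarily): two adjacent narrow
// zero stores such as
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// are merged by the code above into the single wider store
//   str wzr, [x0]
// with the scaled offset halved (to 0 here) to match the wider access size.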

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);

  int Offset = getLdStOffsetOp(*I).getImm();
  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled.  If
    // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
    assert(!(OffsetImm % getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      RegOp0.setIsKill(false);
      RegOp1.setIsKill(false);
    } else {
      // Clear kill flags of the first store's register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
            .add(RegOp0)
            .add(RegOp1)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*Paired})
            .setMIFlags(I->mergeFlagsWith(*Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
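
// For example (registers are illustrative), the code above rewrites
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// into
//   ldp x0, x1, [x2]
// and, for the sign-extending case, rewrites
//   ldrsw x0, [x2]
//   ldr   w1, [x2, #4]
// into
//   ldp  w0, w1, [x2]
//   sbfm x0, x0, #0, #31   ; i.e. sxtw x0, w0
// with a KILL pseudo inserted to keep the machine verifier happy, as above.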

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI = LoadI;
  ++NextI;

  int LoadSize = getMemScale(*LoadI);
  int StoreSize = getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if its destination register is the same as the register
    // holding the stored value.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset = IsUnscaled
                               ? getLdStOffsetOp(*LoadI).getImm()
                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset = IsUnscaled
                               ? getLdStOffsetOp(*StoreI).getImm()
                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    unsigned DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
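
// Two illustrative cases (registers chosen arbitrarily). Same-size promotion:
//   str x0, [x1]
//   ldr x2, [x1]
// becomes
//   str x0, [x1]
//   mov x2, x0             ; emitted as ORRXrs x2, xzr, x0
// and, on little-endian targets, a narrower load that reads a slice of the
// stored value:
//   str  w1, [x0]
//   ldrb w2, [x0, #1]
// becomes
//   str  w1, [x0]
//   ubfx w2, w1, #8, #8    ; emitted as UBFMWri with immr = 8, imms = 15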

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}
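
// E.g. for 8-byte accesses the element offset must fit in 7 signed bits:
// "ldr x0, [x1, #504]" (element offset 63) can still be paired, whereas
// "ldr x0, [x1, #512]" (element offset 64) cannot.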

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
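
// For example, alignTo(7, 4) == 8 and alignTo(-5, 4) == -4: values are
// rounded up towards +infinity to the next multiple of PowOf2.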

static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
                     AliasAnalysis *AA) {
  // One of the instructions must modify memory.
  if (!MIa.mayStore() && !MIb.mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
    return false;

  return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns)
    if (mayAlias(MIa, *MIb, AA))
      return true;

  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == getLdStBaseOp(MI).getReg() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return true;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}

/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &FirstMI = *I;
  ++MBBI;

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field.  If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                            TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
            !(MI.mayLoad() &&
              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
            !(MayLoad &&
              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  (void)MIB;

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
1440 
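// Illustrative examples (not exhaustive) of the checks below, assuming
// MemMI is "ldp x0, x1, [x2]" (the pre/post-indexed pair forms scale the
// offset by 8 and have a signed 7-bit scaled range) and Offset is 0:
//   add x2, x2, #16          -> matches (UpdateOffset 16, scaled 2)
//   sub x2, x2, #16          -> matches (UpdateOffset -16, scaled -2)
//   add x2, x2, #12          -> rejected, not a multiple of the scale
//   add x2, x2, #1024        -> rejected, scaled offset 128 > 63
//   add x2, x3, #16          -> rejected, source is not the base register
//   add x2, x2, #1, lsl #12  -> rejected, shifted immediate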
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If Offset is zero, any update amount is acceptable; otherwise the
    // update amount must match the offset we're looking for.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

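// Illustrative sketch: given "ldr x0, [x1]" with UnscaledOffset 0, a later
// "add x1, x1, #8" can be folded to form "ldr x0, [x1], #8", but not if x1
// is read or redefined by an instruction in between, and not for
// "ldr x1, [x1]", where the loaded register overlaps the base register.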
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities.  Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // Nor does it apply to STGPi, which, unlike normal stores, has no
  // unpredictable behavior in this case and always performs the writeback
  // after reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
  }
  return E;
}

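// Illustrative sketch: given "str x0, [x1]", a preceding "sub x1, x1, #16"
// can be folded to form "str x0, [x1, #-16]!", provided x1 is neither read
// nor modified by the instructions in between.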
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  do {
    --MBBI;
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
  } while (MBBI != B && Count < Limit);
  return E;
}

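// Illustrative sketch of the same-width case (the narrower case is shown in
// optimizeBlock below): when a load reads back exactly what a recent store
// wrote, e.g.,
//   str x1, [x0, #8]
//   ldr x2, [x0, #8]
// promoteLoadFromStore can replace the load with a plain register copy from
// x1 into x2, removing the memory round trip.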
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile or otherwise ordered load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergeable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
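  // For example (illustrative): "ldr x0, [x1, #512]" (scaled imm 64) cannot
  // itself start an ldp, since the maximum scaled pair offset is 63, but it
  // may still pair as the second element of "ldp x2, x0, [x1, #504]", so
  // offsets just past the encodable range are not rejected here.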
  bool IsUnscaled = TII->isUnscaledLdSt(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->isUnscaledLdSt(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  // ldr x0, [x20]
  // add x20, x20, #32
  //   merged into:
  // ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->isUnscaledLdSt(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  // add x0, x0, #8
  // ldr x1, [x0]
  //   merged into:
  // ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  //   merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers.  We do this once
  // per function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another?  Note: The pre-RA instruction scheduler already has
// hooks to try to schedule pairable loads/stores together to improve pairing
// opportunities.  Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}