1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
13 // instructions. To preserve the correctness of the unwind information, the
14 // pass should not change the relative order of any two instructions when one
15 // of them has the FrameSetup/FrameDestroy flag, or, alternatively, it should
16 // apply an ad-hoc fix to the unwind information.
17 //
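// As an illustrative (not exhaustive) example, the pass rewrites adjacent
// loads such as
//   ldr w0, [x2]
//   ldr w1, [x2, #4]
// into the equivalent paired form
//   ldp w0, w1, [x2]
// and it can also fold base-register updates into pre-/post-indexed forms.
//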
18 //===----------------------------------------------------------------------===//
19 
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/BitVector.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/ADT/iterator_range.h"
29 #include "llvm/Analysis/AliasAnalysis.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineFunctionPass.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/MC/MCAsmInfo.h"
40 #include "llvm/MC/MCDwarf.h"
41 #include "llvm/MC/MCRegisterInfo.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/DebugCounter.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <functional>
51 #include <iterator>
52 #include <limits>
53 #include <optional>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "aarch64-ldst-opt"
58 
59 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
60 STATISTIC(NumPostFolded, "Number of post-index updates folded");
61 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
62 STATISTIC(NumUnscaledPairCreated,
63           "Number of load/store from unscaled generated");
64 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
65 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
66 
67 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
68               "Controls which pairs are considered for renaming");
69 
70 // The LdStLimit limits how far we search for load/store pairs.
71 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
72                                    cl::init(20), cl::Hidden);
73 
74 // The UpdateLimit limits how far we search for update instructions when we form
75 // pre-/post-index instructions.
76 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
77                                      cl::Hidden);
78 
79 // Enable register renaming to find additional store pairing opportunities.
80 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
81                                     cl::init(true), cl::Hidden);
82 
83 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
84 
85 namespace {
86 
87 using LdStPairFlags = struct LdStPairFlags {
88   // If a matching instruction is found, MergeForward is set to true if the
89   // merge is to remove the first instruction and replace the second with
90   // a pair-wise insn, and false if the reverse is true.
91   bool MergeForward = false;
92 
93   // SExtIdx gives the index of the result of the load pair that must be
94   // extended. The value of SExtIdx assumes that the paired load produces the
95   // value in this order: (I, returned iterator), i.e., -1 means no value has
96   // to be extended, 0 means I, and 1 means the returned iterator.
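  // For example (illustrative): when a sign-extending LDRSWui is paired with a
  // plain LDRWui, the pair is emitted as an LDPWi and, afterwards, the result
  // selected by SExtIdx is sign-extended back to 64 bits with an SBFMXri
  // (equivalent to SXTW).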
97   int SExtIdx = -1;
98 
99   // If not none, RenameReg can be used to rename the result register of the
100   // first store in a pair. Currently this only works when merging stores
101   // forward.
102   std::optional<MCPhysReg> RenameReg;
103 
104   LdStPairFlags() = default;
105 
106   void setMergeForward(bool V = true) { MergeForward = V; }
107   bool getMergeForward() const { return MergeForward; }
108 
109   void setSExtIdx(int V) { SExtIdx = V; }
110   int getSExtIdx() const { return SExtIdx; }
111 
112   void setRenameReg(MCPhysReg R) { RenameReg = R; }
113   void clearRenameReg() { RenameReg = std::nullopt; }
114   std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
115 };
116 
117 struct AArch64LoadStoreOpt : public MachineFunctionPass {
118   static char ID;
119 
120   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
121     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
122   }
123 
124   AliasAnalysis *AA;
125   const AArch64InstrInfo *TII;
126   const TargetRegisterInfo *TRI;
127   const AArch64Subtarget *Subtarget;
128 
129   // Track which register units have been modified and used.
130   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
131   LiveRegUnits DefinedInBB;
132 
133   void getAnalysisUsage(AnalysisUsage &AU) const override {
134     AU.addRequired<AAResultsWrapperPass>();
135     MachineFunctionPass::getAnalysisUsage(AU);
136   }
137 
138   // Scan the instructions looking for a load/store that can be combined
139   // with the current instruction into a load/store pair.
140   // Return the matching instruction if one is found, else MBB->end().
141   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
142                                                LdStPairFlags &Flags,
143                                                unsigned Limit,
144                                                bool FindNarrowMerge);
145 
146   // Scan the instructions looking for a store that writes to the address from
147   // which the current load instruction reads. Return true if one is found.
148   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
149                          MachineBasicBlock::iterator &StoreI);
150 
151   // Merge the two indicated narrow zero stores into a single wider store.
152   MachineBasicBlock::iterator
153   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
154                         MachineBasicBlock::iterator MergeMI,
155                         const LdStPairFlags &Flags);
156 
157   // Merge the two instructions indicated into a single pair-wise instruction.
158   MachineBasicBlock::iterator
159   mergePairedInsns(MachineBasicBlock::iterator I,
160                    MachineBasicBlock::iterator Paired,
161                    const LdStPairFlags &Flags);
162 
163   // Promote the load that reads directly from the address stored to.
164   MachineBasicBlock::iterator
165   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
166                        MachineBasicBlock::iterator StoreI);
167 
168   // Scan the instruction list to find a base register update that can
169   // be combined with the current instruction (a load or store) using
170   // pre or post indexed addressing with writeback. Scan forwards.
171   MachineBasicBlock::iterator
172   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
173                                 int UnscaledOffset, unsigned Limit);
174 
175   // Scan the instruction list to find a base register update that can
176   // be combined with the current instruction (a load or store) using
177   // pre or post indexed addressing with writeback. Scan backwards.
178   MachineBasicBlock::iterator
179   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
180 
181   // Find an instruction that updates the base register of the ld/st
182   // instruction.
183   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
184                             unsigned BaseReg, int Offset);
185 
186   // Merge a pre- or post-index base register update into a ld/st instruction.
187   MachineBasicBlock::iterator
188   mergeUpdateInsn(MachineBasicBlock::iterator I,
189                   MachineBasicBlock::iterator Update, bool IsPreIdx);
190 
191   // Find and merge zero store instructions.
192   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
193 
194   // Find and pair ldr/str instructions.
195   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
196 
197   // Find and promote load instructions which read directly from a store.
198   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
199 
200   // Find and merge base register updates before or after a ld/st instruction.
201   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
202 
203   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
204 
205   bool runOnMachineFunction(MachineFunction &Fn) override;
206 
207   MachineFunctionProperties getRequiredProperties() const override {
208     return MachineFunctionProperties().set(
209         MachineFunctionProperties::Property::NoVRegs);
210   }
211 
212   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
213 };
214 
215 char AArch64LoadStoreOpt::ID = 0;
216 
217 } // end anonymous namespace
218 
219 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
220                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
221 
222 static bool isNarrowStore(unsigned Opc) {
223   switch (Opc) {
224   default:
225     return false;
226   case AArch64::STRBBui:
227   case AArch64::STURBBi:
228   case AArch64::STRHHui:
229   case AArch64::STURHHi:
230     return true;
231   }
232 }
233 
234 // These instructions set the memory tag and either keep the memory contents
235 // unchanged or set them to zero, ignoring the address part of the source register.
236 static bool isTagStore(const MachineInstr &MI) {
237   switch (MI.getOpcode()) {
238   default:
239     return false;
240   case AArch64::STGi:
241   case AArch64::STZGi:
242   case AArch64::ST2Gi:
243   case AArch64::STZ2Gi:
244     return true;
245   }
246 }
247 
248 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
249                                          bool *IsValidLdStrOpc = nullptr) {
250   if (IsValidLdStrOpc)
251     *IsValidLdStrOpc = true;
252   switch (Opc) {
253   default:
254     if (IsValidLdStrOpc)
255       *IsValidLdStrOpc = false;
256     return std::numeric_limits<unsigned>::max();
257   case AArch64::STRDui:
258   case AArch64::STURDi:
259   case AArch64::STRDpre:
260   case AArch64::STRQui:
261   case AArch64::STURQi:
262   case AArch64::STRQpre:
263   case AArch64::STRBBui:
264   case AArch64::STURBBi:
265   case AArch64::STRHHui:
266   case AArch64::STURHHi:
267   case AArch64::STRWui:
268   case AArch64::STRWpre:
269   case AArch64::STURWi:
270   case AArch64::STRXui:
271   case AArch64::STRXpre:
272   case AArch64::STURXi:
273   case AArch64::LDRDui:
274   case AArch64::LDURDi:
275   case AArch64::LDRDpre:
276   case AArch64::LDRQui:
277   case AArch64::LDURQi:
278   case AArch64::LDRQpre:
279   case AArch64::LDRWui:
280   case AArch64::LDURWi:
281   case AArch64::LDRWpre:
282   case AArch64::LDRXui:
283   case AArch64::LDURXi:
284   case AArch64::LDRXpre:
285   case AArch64::STRSui:
286   case AArch64::STURSi:
287   case AArch64::STRSpre:
288   case AArch64::LDRSui:
289   case AArch64::LDURSi:
290   case AArch64::LDRSpre:
291     return Opc;
292   case AArch64::LDRSWui:
293     return AArch64::LDRWui;
294   case AArch64::LDURSWi:
295     return AArch64::LDURWi;
296   }
297 }
298 
299 static unsigned getMatchingWideOpcode(unsigned Opc) {
300   switch (Opc) {
301   default:
302     llvm_unreachable("Opcode has no wide equivalent!");
303   case AArch64::STRBBui:
304     return AArch64::STRHHui;
305   case AArch64::STRHHui:
306     return AArch64::STRWui;
307   case AArch64::STURBBi:
308     return AArch64::STURHHi;
309   case AArch64::STURHHi:
310     return AArch64::STURWi;
311   case AArch64::STURWi:
312     return AArch64::STURXi;
313   case AArch64::STRWui:
314     return AArch64::STRXui;
315   }
316 }
317 
318 static unsigned getMatchingPairOpcode(unsigned Opc) {
319   switch (Opc) {
320   default:
321     llvm_unreachable("Opcode has no pairwise equivalent!");
322   case AArch64::STRSui:
323   case AArch64::STURSi:
324     return AArch64::STPSi;
325   case AArch64::STRSpre:
326     return AArch64::STPSpre;
327   case AArch64::STRDui:
328   case AArch64::STURDi:
329     return AArch64::STPDi;
330   case AArch64::STRDpre:
331     return AArch64::STPDpre;
332   case AArch64::STRQui:
333   case AArch64::STURQi:
334     return AArch64::STPQi;
335   case AArch64::STRQpre:
336     return AArch64::STPQpre;
337   case AArch64::STRWui:
338   case AArch64::STURWi:
339     return AArch64::STPWi;
340   case AArch64::STRWpre:
341     return AArch64::STPWpre;
342   case AArch64::STRXui:
343   case AArch64::STURXi:
344     return AArch64::STPXi;
345   case AArch64::STRXpre:
346     return AArch64::STPXpre;
347   case AArch64::LDRSui:
348   case AArch64::LDURSi:
349     return AArch64::LDPSi;
350   case AArch64::LDRSpre:
351     return AArch64::LDPSpre;
352   case AArch64::LDRDui:
353   case AArch64::LDURDi:
354     return AArch64::LDPDi;
355   case AArch64::LDRDpre:
356     return AArch64::LDPDpre;
357   case AArch64::LDRQui:
358   case AArch64::LDURQi:
359     return AArch64::LDPQi;
360   case AArch64::LDRQpre:
361     return AArch64::LDPQpre;
362   case AArch64::LDRWui:
363   case AArch64::LDURWi:
364     return AArch64::LDPWi;
365   case AArch64::LDRWpre:
366     return AArch64::LDPWpre;
367   case AArch64::LDRXui:
368   case AArch64::LDURXi:
369     return AArch64::LDPXi;
370   case AArch64::LDRXpre:
371     return AArch64::LDPXpre;
372   case AArch64::LDRSWui:
373   case AArch64::LDURSWi:
374     return AArch64::LDPSWi;
375   }
376 }
377 
378 static unsigned isMatchingStore(MachineInstr &LoadInst,
379                                 MachineInstr &StoreInst) {
380   unsigned LdOpc = LoadInst.getOpcode();
381   unsigned StOpc = StoreInst.getOpcode();
382   switch (LdOpc) {
383   default:
384     llvm_unreachable("Unsupported load instruction!");
385   case AArch64::LDRBBui:
386     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
387            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
388   case AArch64::LDURBBi:
389     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
390            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
391   case AArch64::LDRHHui:
392     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
393            StOpc == AArch64::STRXui;
394   case AArch64::LDURHHi:
395     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
396            StOpc == AArch64::STURXi;
397   case AArch64::LDRWui:
398     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
399   case AArch64::LDURWi:
400     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
401   case AArch64::LDRXui:
402     return StOpc == AArch64::STRXui;
403   case AArch64::LDURXi:
404     return StOpc == AArch64::STURXi;
405   }
406 }
407 
408 static unsigned getPreIndexedOpcode(unsigned Opc) {
409   // FIXME: We don't currently support creating pre-indexed loads/stores when
410   // the load or store is the unscaled version.  If we decide to perform such an
411   // optimization in the future the cases for the unscaled loads/stores will
412   // need to be added here.
413   switch (Opc) {
414   default:
415     llvm_unreachable("Opcode has no pre-indexed equivalent!");
416   case AArch64::STRSui:
417     return AArch64::STRSpre;
418   case AArch64::STRDui:
419     return AArch64::STRDpre;
420   case AArch64::STRQui:
421     return AArch64::STRQpre;
422   case AArch64::STRBBui:
423     return AArch64::STRBBpre;
424   case AArch64::STRHHui:
425     return AArch64::STRHHpre;
426   case AArch64::STRWui:
427     return AArch64::STRWpre;
428   case AArch64::STRXui:
429     return AArch64::STRXpre;
430   case AArch64::LDRSui:
431     return AArch64::LDRSpre;
432   case AArch64::LDRDui:
433     return AArch64::LDRDpre;
434   case AArch64::LDRQui:
435     return AArch64::LDRQpre;
436   case AArch64::LDRBBui:
437     return AArch64::LDRBBpre;
438   case AArch64::LDRHHui:
439     return AArch64::LDRHHpre;
440   case AArch64::LDRWui:
441     return AArch64::LDRWpre;
442   case AArch64::LDRXui:
443     return AArch64::LDRXpre;
444   case AArch64::LDRSWui:
445     return AArch64::LDRSWpre;
446   case AArch64::LDPSi:
447     return AArch64::LDPSpre;
448   case AArch64::LDPSWi:
449     return AArch64::LDPSWpre;
450   case AArch64::LDPDi:
451     return AArch64::LDPDpre;
452   case AArch64::LDPQi:
453     return AArch64::LDPQpre;
454   case AArch64::LDPWi:
455     return AArch64::LDPWpre;
456   case AArch64::LDPXi:
457     return AArch64::LDPXpre;
458   case AArch64::STPSi:
459     return AArch64::STPSpre;
460   case AArch64::STPDi:
461     return AArch64::STPDpre;
462   case AArch64::STPQi:
463     return AArch64::STPQpre;
464   case AArch64::STPWi:
465     return AArch64::STPWpre;
466   case AArch64::STPXi:
467     return AArch64::STPXpre;
468   case AArch64::STGi:
469     return AArch64::STGPreIndex;
470   case AArch64::STZGi:
471     return AArch64::STZGPreIndex;
472   case AArch64::ST2Gi:
473     return AArch64::ST2GPreIndex;
474   case AArch64::STZ2Gi:
475     return AArch64::STZ2GPreIndex;
476   case AArch64::STGPi:
477     return AArch64::STGPpre;
478   }
479 }
480 
481 static unsigned getPostIndexedOpcode(unsigned Opc) {
482   switch (Opc) {
483   default:
484     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
485   case AArch64::STRSui:
486   case AArch64::STURSi:
487     return AArch64::STRSpost;
488   case AArch64::STRDui:
489   case AArch64::STURDi:
490     return AArch64::STRDpost;
491   case AArch64::STRQui:
492   case AArch64::STURQi:
493     return AArch64::STRQpost;
494   case AArch64::STRBBui:
495     return AArch64::STRBBpost;
496   case AArch64::STRHHui:
497     return AArch64::STRHHpost;
498   case AArch64::STRWui:
499   case AArch64::STURWi:
500     return AArch64::STRWpost;
501   case AArch64::STRXui:
502   case AArch64::STURXi:
503     return AArch64::STRXpost;
504   case AArch64::LDRSui:
505   case AArch64::LDURSi:
506     return AArch64::LDRSpost;
507   case AArch64::LDRDui:
508   case AArch64::LDURDi:
509     return AArch64::LDRDpost;
510   case AArch64::LDRQui:
511   case AArch64::LDURQi:
512     return AArch64::LDRQpost;
513   case AArch64::LDRBBui:
514     return AArch64::LDRBBpost;
515   case AArch64::LDRHHui:
516     return AArch64::LDRHHpost;
517   case AArch64::LDRWui:
518   case AArch64::LDURWi:
519     return AArch64::LDRWpost;
520   case AArch64::LDRXui:
521   case AArch64::LDURXi:
522     return AArch64::LDRXpost;
523   case AArch64::LDRSWui:
524     return AArch64::LDRSWpost;
525   case AArch64::LDPSi:
526     return AArch64::LDPSpost;
527   case AArch64::LDPSWi:
528     return AArch64::LDPSWpost;
529   case AArch64::LDPDi:
530     return AArch64::LDPDpost;
531   case AArch64::LDPQi:
532     return AArch64::LDPQpost;
533   case AArch64::LDPWi:
534     return AArch64::LDPWpost;
535   case AArch64::LDPXi:
536     return AArch64::LDPXpost;
537   case AArch64::STPSi:
538     return AArch64::STPSpost;
539   case AArch64::STPDi:
540     return AArch64::STPDpost;
541   case AArch64::STPQi:
542     return AArch64::STPQpost;
543   case AArch64::STPWi:
544     return AArch64::STPWpost;
545   case AArch64::STPXi:
546     return AArch64::STPXpost;
547   case AArch64::STGi:
548     return AArch64::STGPostIndex;
549   case AArch64::STZGi:
550     return AArch64::STZGPostIndex;
551   case AArch64::ST2Gi:
552     return AArch64::ST2GPostIndex;
553   case AArch64::STZ2Gi:
554     return AArch64::STZ2GPostIndex;
555   case AArch64::STGPi:
556     return AArch64::STGPpost;
557   }
558 }
559 
560 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
561 
562   unsigned OpcA = FirstMI.getOpcode();
563   unsigned OpcB = MI.getOpcode();
564 
565   switch (OpcA) {
566   default:
567     return false;
568   case AArch64::STRSpre:
569     return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
570   case AArch64::STRDpre:
571     return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
572   case AArch64::STRQpre:
573     return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
574   case AArch64::STRWpre:
575     return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
576   case AArch64::STRXpre:
577     return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
578   case AArch64::LDRSpre:
579     return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
580   case AArch64::LDRDpre:
581     return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
582   case AArch64::LDRQpre:
583     return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
584   case AArch64::LDRWpre:
585     return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
586   case AArch64::LDRXpre:
587     return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
588   }
589 }
590 
591 // Returns the scale and offset range of pre/post indexed variants of MI.
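// For example (illustrative): for a paired LDPXi the scale is 8 and the signed
// immediate range is [-64, 63], i.e. byte offsets in [-512, 504]; for an
// unpaired, non-tag instruction such as LDRXpre the scale is 1 and the byte
// offset range is [-256, 255].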
592 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
593                                        int &MinOffset, int &MaxOffset) {
594   bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
595   bool IsTagStore = isTagStore(MI);
596   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
597   // as in the "unsigned offset" variant.
598   // All other pre/post indexed ldst instructions are unscaled.
599   Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
600 
601   if (IsPaired) {
602     MinOffset = -64;
603     MaxOffset = 63;
604   } else {
605     MinOffset = -256;
606     MaxOffset = 255;
607   }
608 }
609 
610 static MachineOperand &getLdStRegOp(MachineInstr &MI,
611                                     unsigned PairedRegOp = 0) {
612   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
613   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
614   if (IsPreLdSt)
615     PairedRegOp += 1;
616   unsigned Idx =
617       AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
618   return MI.getOperand(Idx);
619 }
620 
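// Returns true if the bytes read by the load lie entirely within the bytes
// written by the store. For example (illustrative): a 4-byte load at unscaled
// offset 4 is covered by an 8-byte store at unscaled offset 0, since [4, 8) is
// contained in [0, 8).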
621 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
622                                   MachineInstr &StoreInst,
623                                   const AArch64InstrInfo *TII) {
624   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
625   int LoadSize = TII->getMemScale(LoadInst);
626   int StoreSize = TII->getMemScale(StoreInst);
627   int UnscaledStOffset =
628       TII->hasUnscaledLdStOffset(StoreInst)
629           ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
630           : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
631   int UnscaledLdOffset =
632       TII->hasUnscaledLdStOffset(LoadInst)
633           ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
634           : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
635   return (UnscaledStOffset <= UnscaledLdOffset) &&
636          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
637 }
638 
639 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
640   unsigned Opc = MI.getOpcode();
641   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
642           isNarrowStore(Opc)) &&
643          getLdStRegOp(MI).getReg() == AArch64::WZR;
644 }
645 
646 static bool isPromotableLoadFromStore(MachineInstr &MI) {
647   switch (MI.getOpcode()) {
648   default:
649     return false;
650   // Scaled instructions.
651   case AArch64::LDRBBui:
652   case AArch64::LDRHHui:
653   case AArch64::LDRWui:
654   case AArch64::LDRXui:
655   // Unscaled instructions.
656   case AArch64::LDURBBi:
657   case AArch64::LDURHHi:
658   case AArch64::LDURWi:
659   case AArch64::LDURXi:
660     return true;
661   }
662 }
663 
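// Returns true if MI is a load/store whose base-register update may later be
// folded into a pre/post-indexed form. For example (illustrative):
//   ldr x1, [x0]
//   add x0, x0, #16
// can become the post-indexed
//   ldr x1, [x0], #16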
664 static bool isMergeableLdStUpdate(MachineInstr &MI) {
665   unsigned Opc = MI.getOpcode();
666   switch (Opc) {
667   default:
668     return false;
669   // Scaled instructions.
670   case AArch64::STRSui:
671   case AArch64::STRDui:
672   case AArch64::STRQui:
673   case AArch64::STRXui:
674   case AArch64::STRWui:
675   case AArch64::STRHHui:
676   case AArch64::STRBBui:
677   case AArch64::LDRSui:
678   case AArch64::LDRDui:
679   case AArch64::LDRQui:
680   case AArch64::LDRXui:
681   case AArch64::LDRWui:
682   case AArch64::LDRHHui:
683   case AArch64::LDRBBui:
684   case AArch64::STGi:
685   case AArch64::STZGi:
686   case AArch64::ST2Gi:
687   case AArch64::STZ2Gi:
688   case AArch64::STGPi:
689   // Unscaled instructions.
690   case AArch64::STURSi:
691   case AArch64::STURDi:
692   case AArch64::STURQi:
693   case AArch64::STURWi:
694   case AArch64::STURXi:
695   case AArch64::LDURSi:
696   case AArch64::LDURDi:
697   case AArch64::LDURQi:
698   case AArch64::LDURWi:
699   case AArch64::LDURXi:
700   // Paired instructions.
701   case AArch64::LDPSi:
702   case AArch64::LDPSWi:
703   case AArch64::LDPDi:
704   case AArch64::LDPQi:
705   case AArch64::LDPWi:
706   case AArch64::LDPXi:
707   case AArch64::STPSi:
708   case AArch64::STPDi:
709   case AArch64::STPQi:
710   case AArch64::STPWi:
711   case AArch64::STPXi:
712     // Make sure this is a reg+imm (as opposed to an address reloc).
713     if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
714       return false;
715 
716     return true;
717   }
718 }
719 
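// Merge two adjacent narrow zero stores into one wider store. For example
// (illustrative):
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// becomes
//   str  wzr, [x0]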
720 MachineBasicBlock::iterator
721 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
722                                            MachineBasicBlock::iterator MergeMI,
723                                            const LdStPairFlags &Flags) {
724   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
725          "Expected promotable zero stores.");
726 
727   MachineBasicBlock::iterator E = I->getParent()->end();
728   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
729   // If NextI is the second of the two instructions to be merged, we need
730   // to skip one further. Either way, the merge will invalidate the iterator,
731   // and we don't need to scan the new instruction, as it's a pairwise
732   // instruction, which we're not considering for further action anyway.
733   if (NextI == MergeMI)
734     NextI = next_nodbg(NextI, E);
735 
736   unsigned Opc = I->getOpcode();
737   unsigned MergeMIOpc = MergeMI->getOpcode();
738   bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
739   bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
740   int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
741   int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;
742 
743   bool MergeForward = Flags.getMergeForward();
744   // Insert our new paired instruction after whichever of the paired
745   // instructions MergeForward indicates.
746   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
747   // MergeForward also determines which instruction we copy the base register
748   // operand from, so that its flags are compatible with the input code.
749   const MachineOperand &BaseRegOp =
750       MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
751                    : AArch64InstrInfo::getLdStBaseOp(*I);
752 
753   // Which register is Rt and which is Rt2 depends on the offset order.
754   int64_t IOffsetInBytes =
755       AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
756   int64_t MIOffsetInBytes =
757       AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
758       MergeMIOffsetStride;
759   // Select final offset based on the offset order.
760   int64_t OffsetImm;
761   if (IOffsetInBytes > MIOffsetInBytes)
762     OffsetImm = MIOffsetInBytes;
763   else
764     OffsetImm = IOffsetInBytes;
765 
766   int NewOpcode = getMatchingWideOpcode(Opc);
767   bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);
768 
769   // Adjust final offset if the result opcode is a scaled store.
770   if (FinalIsScaled) {
771     int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
772     assert(((OffsetImm % NewOffsetStride) == 0) &&
773            "Offset should be a multiple of the store memory scale");
774     OffsetImm = OffsetImm / NewOffsetStride;
775   }
776 
777   // Construct the new instruction.
778   DebugLoc DL = I->getDebugLoc();
779   MachineBasicBlock *MBB = I->getParent();
780   MachineInstrBuilder MIB;
781   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
782             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
783             .add(BaseRegOp)
784             .addImm(OffsetImm)
785             .cloneMergedMemRefs({&*I, &*MergeMI})
786             .setMIFlags(I->mergeFlagsWith(*MergeMI));
787   (void)MIB;
788 
789   LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
790   LLVM_DEBUG(I->print(dbgs()));
791   LLVM_DEBUG(dbgs() << "    ");
792   LLVM_DEBUG(MergeMI->print(dbgs()));
793   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
794   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
795   LLVM_DEBUG(dbgs() << "\n");
796 
797   // Erase the old instructions.
798   I->eraseFromParent();
799   MergeMI->eraseFromParent();
800   return NextI;
801 }
802 
803 // Apply Fn to all instructions between MI and the beginning of the block, until
804 // a def for DefReg is reached. Returns true iff Fn returns true for all
805 // visited instructions. Stops after visiting Limit instructions.
806 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
807                               const TargetRegisterInfo *TRI, unsigned Limit,
808                               std::function<bool(MachineInstr &, bool)> &Fn) {
809   auto MBB = MI.getParent();
810   for (MachineInstr &I :
811        instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
812     if (!Limit)
813       return false;
814     --Limit;
815 
816     bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
817       return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
818              TRI->regsOverlap(MOP.getReg(), DefReg);
819     });
820     if (!Fn(I, isDef))
821       return false;
822     if (isDef)
823       break;
824   }
825   return true;
826 }
827 
828 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
829                                    const TargetRegisterInfo *TRI) {
830 
831   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
832     if (MOP.isReg() && MOP.isKill())
833       Units.removeReg(MOP.getReg());
834 
835   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
836     if (MOP.isReg() && !MOP.isKill())
837       Units.addReg(MOP.getReg());
838 }
839 
840 MachineBasicBlock::iterator
841 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
842                                       MachineBasicBlock::iterator Paired,
843                                       const LdStPairFlags &Flags) {
844   MachineBasicBlock::iterator E = I->getParent()->end();
845   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
846   // If NextI is the second of the two instructions to be merged, we need
847   // to skip one further. Either way, the merge will invalidate the iterator,
848   // and we don't need to scan the new instruction, as it's a pairwise
849   // instruction, which we're not considering for further action anyway.
850   if (NextI == Paired)
851     NextI = next_nodbg(NextI, E);
852 
853   int SExtIdx = Flags.getSExtIdx();
854   unsigned Opc =
855       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
856   bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
857   int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
858 
859   bool MergeForward = Flags.getMergeForward();
860 
861   std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
862   if (MergeForward && RenameReg) {
863     MCRegister RegToRename = getLdStRegOp(*I).getReg();
864     DefinedInBB.addReg(*RenameReg);
865 
866     // Return the sub/super register for RenameReg, matching the size of
867     // OriginalReg.
868     auto GetMatchingSubReg = [this,
869                               RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
870       for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
871         if (TRI->getMinimalPhysRegClass(OriginalReg) ==
872             TRI->getMinimalPhysRegClass(SubOrSuper))
873           return SubOrSuper;
874       llvm_unreachable("Should have found matching sub or super register!");
875     };
876 
877     std::function<bool(MachineInstr &, bool)> UpdateMIs =
878         [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
879           if (IsDef) {
880             bool SeenDef = false;
881             for (auto &MOP : MI.operands()) {
882               // Rename the first explicit definition and all implicit
883               // definitions matching RegToRename.
884               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
885                   (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
886                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
887                 assert((MOP.isImplicit() ||
888                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
889                        "Need renamable operands");
890                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
891                 SeenDef = true;
892               }
893             }
894           } else {
895             for (auto &MOP : MI.operands()) {
896               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
897                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
898                 assert((MOP.isImplicit() ||
899                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
900                            "Need renamable operands");
901                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
902               }
903             }
904           }
905           LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
906           return true;
907         };
908     forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
909 
910 #if !defined(NDEBUG)
911     // Make sure the register used for renaming is not used between the paired
912     // instructions. That would trash the content before the new paired
913     // instruction.
914     for (auto &MI :
915          iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
916              std::next(I), std::next(Paired)))
917       assert(all_of(MI.operands(),
918                     [this, &RenameReg](const MachineOperand &MOP) {
919                       return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
920                              MOP.isUndef() ||
921                              !TRI->regsOverlap(MOP.getReg(), *RenameReg);
922                     }) &&
923              "Rename register used between paired instruction, trashing the "
924              "content");
925 #endif
926   }
927 
928   // Insert our new paired instruction after whichever of the paired
929   // instructions MergeForward indicates.
930   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
931   // MergeForward also determines which instruction we copy the base register
932   // operand from, so that its flags are compatible with the input code.
933   const MachineOperand &BaseRegOp =
934       MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
935                    : AArch64InstrInfo::getLdStBaseOp(*I);
936 
937   int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
938   int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
939   bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
940   if (IsUnscaled != PairedIsUnscaled) {
941     // We're trying to pair instructions that differ in how they are scaled.  If
942     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
943     // the opposite (i.e., make Paired's offset unscaled).
944     int MemSize = TII->getMemScale(*Paired);
945     if (PairedIsUnscaled) {
946       // If the unscaled offset isn't a multiple of the MemSize, we can't
947       // pair the operations together.
948       assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
949              "Offset should be a multiple of the stride!");
950       PairedOffset /= MemSize;
951     } else {
952       PairedOffset *= MemSize;
953     }
954   }
955 
956   // Which register is Rt and which is Rt2 depends on the offset order.
957   // However, for pre-indexed load/stores, Rt should be the register of the
958   // pre-indexed load/store.
959   MachineInstr *RtMI, *Rt2MI;
960   if (Offset == PairedOffset + OffsetStride &&
961       !AArch64InstrInfo::isPreLdSt(*I)) {
962     RtMI = &*Paired;
963     Rt2MI = &*I;
964     // Here we swapped the assumption made for SExtIdx.
965     // I.e., we turn ldp I, Paired into ldp Paired, I.
966     // Update the index accordingly.
967     if (SExtIdx != -1)
968       SExtIdx = (SExtIdx + 1) % 2;
969   } else {
970     RtMI = &*I;
971     Rt2MI = &*Paired;
972   }
973   int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
974   // Scale the immediate offset, if necessary.
975   if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
976     assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
977            "Unscaled offset cannot be scaled.");
978     OffsetImm /= TII->getMemScale(*RtMI);
979   }
980 
981   // Construct the new instruction.
982   MachineInstrBuilder MIB;
983   DebugLoc DL = I->getDebugLoc();
984   MachineBasicBlock *MBB = I->getParent();
985   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
986   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
987   // Kill flags may become invalid when moving stores for pairing.
988   if (RegOp0.isUse()) {
989     if (!MergeForward) {
990       // Clear kill flags on store if moving upwards. Example:
991       //   STRWui %w0, ...
992       //   USE %w1
993       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
994       RegOp0.setIsKill(false);
995       RegOp1.setIsKill(false);
996     } else {
997       // Clear kill flags of the first store's register. Example:
998       //   STRWui %w1, ...
999       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
1000       //   STRW %w0
1001       Register Reg = getLdStRegOp(*I).getReg();
1002       for (MachineInstr &MI : make_range(std::next(I), Paired))
1003         MI.clearRegisterKills(Reg, TRI);
1004     }
1005   }
1006 
1007   unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1008   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
1009 
1010   // Adds the pre-index operand for pre-indexed ld/st pairs.
1011   if (AArch64InstrInfo::isPreLdSt(*RtMI))
1012     MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1013 
1014   MIB.add(RegOp0)
1015       .add(RegOp1)
1016       .add(BaseRegOp)
1017       .addImm(OffsetImm)
1018       .cloneMergedMemRefs({&*I, &*Paired})
1019       .setMIFlags(I->mergeFlagsWith(*Paired));
1020 
1021   (void)MIB;
1022 
1023   LLVM_DEBUG(
1024       dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
1025   LLVM_DEBUG(I->print(dbgs()));
1026   LLVM_DEBUG(dbgs() << "    ");
1027   LLVM_DEBUG(Paired->print(dbgs()));
1028   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1029   if (SExtIdx != -1) {
1030     // Generate the sign extension for the proper result of the ldp.
1031     // I.e., with X1, that would be:
1032     // %w1 = KILL %w1, implicit-def %x1
1033     // %x1 = SBFMXri killed %x1, 0, 31
1034     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1035     // Right now, DstMO has the extended register, since it comes from an
1036     // extended opcode.
1037     Register DstRegX = DstMO.getReg();
1038     // Get the W variant of that register.
1039     Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1040     // Update the result of LDP to use the W instead of the X variant.
1041     DstMO.setReg(DstRegW);
1042     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1043     LLVM_DEBUG(dbgs() << "\n");
1044     // Make the machine verifier happy by providing a definition for
1045     // the X register.
1046     // Insert this definition right after the generated LDP, i.e., before
1047     // InsertionPoint.
1048     MachineInstrBuilder MIBKill =
1049         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1050             .addReg(DstRegW)
1051             .addReg(DstRegX, RegState::Define);
1052     MIBKill->getOperand(2).setImplicit();
1053     // Create the sign extension.
1054     MachineInstrBuilder MIBSXTW =
1055         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1056             .addReg(DstRegX)
1057             .addImm(0)
1058             .addImm(31);
1059     (void)MIBSXTW;
1060     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
1061     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1062   } else {
1063     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1064   }
1065   LLVM_DEBUG(dbgs() << "\n");
1066 
1067   if (MergeForward)
1068     for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1069       if (MOP.isReg() && MOP.isKill())
1070         DefinedInBB.addReg(MOP.getReg());
1071 
1072   // Erase the old instructions.
1073   I->eraseFromParent();
1074   Paired->eraseFromParent();
1075 
1076   return NextI;
1077 }
1078 
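// Forward a stored value to a load that reads it back. For example
// (illustrative), when the load and store have the same size:
//   str w1, [x0, #4]
//   ldr w2, [x0, #4]
// the load is replaced by a register move (ORRWrs):
//   mov w2, w1
// A narrower load contained in the stored bytes is instead rewritten as a
// bitfield extract (UBFM/AND) of the stored register.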
1079 MachineBasicBlock::iterator
1080 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1081                                           MachineBasicBlock::iterator StoreI) {
1082   MachineBasicBlock::iterator NextI =
1083       next_nodbg(LoadI, LoadI->getParent()->end());
1084 
1085   int LoadSize = TII->getMemScale(*LoadI);
1086   int StoreSize = TII->getMemScale(*StoreI);
1087   Register LdRt = getLdStRegOp(*LoadI).getReg();
1088   const MachineOperand &StMO = getLdStRegOp(*StoreI);
1089   Register StRt = getLdStRegOp(*StoreI).getReg();
1090   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1091 
1092   assert((IsStoreXReg ||
1093           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1094          "Unexpected RegClass");
1095 
1096   MachineInstr *BitExtMI;
1097   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1098     // Remove the load if the destination register of the load is the same
1099     // register as the stored value.
1100     if (StRt == LdRt && LoadSize == 8) {
1101       for (MachineInstr &MI : make_range(StoreI->getIterator(),
1102                                          LoadI->getIterator())) {
1103         if (MI.killsRegister(StRt, TRI)) {
1104           MI.clearRegisterKills(StRt, TRI);
1105           break;
1106         }
1107       }
1108       LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
1109       LLVM_DEBUG(LoadI->print(dbgs()));
1110       LLVM_DEBUG(dbgs() << "\n");
1111       LoadI->eraseFromParent();
1112       return NextI;
1113     }
1114     // Replace the load with a mov if the load and store are the same size.
1115     BitExtMI =
1116         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1117                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1118             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1119             .add(StMO)
1120             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1121             .setMIFlags(LoadI->getFlags());
1122   } else {
1123     // FIXME: Currently we disable this transformation in big-endian targets as
1124     // performance and correctness are verified only in little-endian.
1125     if (!Subtarget->isLittleEndian())
1126       return NextI;
1127     bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1128     assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1129            "Unsupported ld/st match");
1130     assert(LoadSize <= StoreSize && "Invalid load size");
1131     int UnscaledLdOffset =
1132         IsUnscaled
1133             ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1134             : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1135     int UnscaledStOffset =
1136         IsUnscaled
1137             ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1138             : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1139     int Width = LoadSize * 8;
1140     Register DestReg =
1141         IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1142                           LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1143                     : LdRt;
1144 
1145     assert((UnscaledLdOffset >= UnscaledStOffset &&
1146             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1147            "Invalid offset");
1148 
1149     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1150     int Imms = Immr + Width - 1;
1151     if (UnscaledLdOffset == UnscaledStOffset) {
1152       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1153                                 | ((Immr) << 6)               // immr
1154                                 | ((Imms) << 0)               // imms
1155           ;
1156 
1157       BitExtMI =
1158           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1159                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1160                   DestReg)
1161               .add(StMO)
1162               .addImm(AndMaskEncoded)
1163               .setMIFlags(LoadI->getFlags());
1164     } else {
1165       BitExtMI =
1166           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1167                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1168                   DestReg)
1169               .add(StMO)
1170               .addImm(Immr)
1171               .addImm(Imms)
1172               .setMIFlags(LoadI->getFlags());
1173     }
1174   }
1175 
1176   // Clear kill flags between store and load.
1177   for (MachineInstr &MI : make_range(StoreI->getIterator(),
1178                                      BitExtMI->getIterator()))
1179     if (MI.killsRegister(StRt, TRI)) {
1180       MI.clearRegisterKills(StRt, TRI);
1181       break;
1182     }
1183 
1184   LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1185   LLVM_DEBUG(StoreI->print(dbgs()));
1186   LLVM_DEBUG(dbgs() << "    ");
1187   LLVM_DEBUG(LoadI->print(dbgs()));
1188   LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
1189   LLVM_DEBUG(StoreI->print(dbgs()));
1190   LLVM_DEBUG(dbgs() << "    ");
1191   LLVM_DEBUG((BitExtMI)->print(dbgs()));
1192   LLVM_DEBUG(dbgs() << "\n");
1193 
1194   // Erase the old instructions.
1195   LoadI->eraseFromParent();
1196   return NextI;
1197 }
1198 
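// Returns true if the (possibly unscaled) offset fits the signed 7-bit
// element-offset field of a paired load/store. For example (illustrative): an
// unscaled byte offset of 24 with an 8-byte stride becomes element offset 3,
// which is within [-64, 63]; a byte offset of 20 would be rejected because it
// is not a multiple of the stride.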
1199 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1200   // Convert the byte-offset used by unscaled instructions into an "element"
1201   // offset used by the scaled pair load/store instructions.
1202   if (IsUnscaled) {
1203     // If the byte-offset isn't a multiple of the stride, there's no point
1204     // trying to match it.
1205     if (Offset % OffsetStride)
1206       return false;
1207     Offset /= OffsetStride;
1208   }
1209   return Offset <= 63 && Offset >= -64;
1210 }
1211 
1212 // Do alignment, specialized to power of 2 and for signed ints,
1213 // avoiding having to do a C-style cast from uint_64t to int when
1214 // using alignTo from include/llvm/Support/MathExtras.h.
1215 // FIXME: Move this function to include/MathExtras.h?
1216 static int alignTo(int Num, int PowOf2) {
1217   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1218 }
1219 
1220 static bool mayAlias(MachineInstr &MIa,
1221                      SmallVectorImpl<MachineInstr *> &MemInsns,
1222                      AliasAnalysis *AA) {
1223   for (MachineInstr *MIb : MemInsns)
1224     if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
1225       return true;
1226 
1227   return false;
1228 }
1229 
1230 bool AArch64LoadStoreOpt::findMatchingStore(
1231     MachineBasicBlock::iterator I, unsigned Limit,
1232     MachineBasicBlock::iterator &StoreI) {
1233   MachineBasicBlock::iterator B = I->getParent()->begin();
1234   MachineBasicBlock::iterator MBBI = I;
1235   MachineInstr &LoadMI = *I;
1236   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1237 
1238   // If the load is the first instruction in the block, there's obviously
1239   // not any matching store.
1240   if (MBBI == B)
1241     return false;
1242 
1243   // Track which register units have been modified and used between the first
1244   // insn and the second insn.
1245   ModifiedRegUnits.clear();
1246   UsedRegUnits.clear();
1247 
1248   unsigned Count = 0;
1249   do {
1250     MBBI = prev_nodbg(MBBI, B);
1251     MachineInstr &MI = *MBBI;
1252 
1253     // Don't count transient instructions towards the search limit since there
1254     // may be different numbers of them if e.g. debug information is present.
1255     if (!MI.isTransient())
1256       ++Count;
1257 
1258     // If the load instruction reads directly from the address to which the
1259     // store instruction writes and the stored value is not modified, we can
1260     // promote the load. Since we do not handle stores with pre-/post-index,
1261     // it's unnecessary to check if BaseReg is modified by the store itself.
1262     // Also, we can't handle stores without an immediate offset operand,
1263     // since the operand might be the address of a global variable.
1264     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1265         BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1266         AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1267         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1268         ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1269       StoreI = MBBI;
1270       return true;
1271     }
1272 
1273     if (MI.isCall())
1274       return false;
1275 
1276     // Update modified / uses register units.
1277     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1278 
1279     // Otherwise, if the base register is modified, we have no match, so
1280     // return early.
1281     if (!ModifiedRegUnits.available(BaseReg))
1282       return false;
1283 
1284     // If we encounter a store aliased with the load, return early.
1285     if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1286       return false;
1287   } while (MBBI != B && Count < Limit);
1288   return false;
1289 }
1290 
1291 static bool needsWinCFI(const MachineFunction *MF) {
1292   return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1293          MF->getFunction().needsUnwindTableEntry();
1294 }
1295 
1296 // Returns true if FirstMI and MI are candidates for merging or pairing.
1297 // Otherwise, returns false.
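// For example (illustrative): a scaled STRXui and an unscaled STURXi are
// candidates (both map to STPXi), as are LDRSWui and LDRWui (with SExtIdx
// recording which result needs sign extension); volatile accesses and mixed
// narrow stores are rejected.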
1298 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1299                                        LdStPairFlags &Flags,
1300                                        const AArch64InstrInfo *TII) {
1301   // If this is volatile or if pairing is suppressed, not a candidate.
1302   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1303     return false;
1304 
1305   // We should have already checked FirstMI for pair suppression and volatility.
1306   assert(!FirstMI.hasOrderedMemoryRef() &&
1307          !TII->isLdStPairSuppressed(FirstMI) &&
1308          "FirstMI shouldn't get here if either of these checks are true.");
1309 
1310   if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1311                                   MI.getFlag(MachineInstr::FrameDestroy)))
1312     return false;
1313 
1314   unsigned OpcA = FirstMI.getOpcode();
1315   unsigned OpcB = MI.getOpcode();
1316 
1317   // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1318   if (OpcA == OpcB)
1319     return !AArch64InstrInfo::isPreLdSt(FirstMI);
1320 
1321   // Try to match a sign-extended load/store with a zero-extended load/store.
1322   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1323   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1324   assert(IsValidLdStrOpc &&
1325          "Given Opc should be a Load or Store with an immediate");
1326   // OpcA will be the first instruction in the pair.
1327   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1328     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1329     return true;
1330   }
1331 
1332   // If the second instruction isn't even a mergeable/pairable load/store, bail
1333   // out.
1334   if (!PairIsValidLdStrOpc)
1335     return false;
1336 
1337   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1338   // offsets.
1339   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1340     return false;
1341 
1342   // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1343   // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
1344   // are candidate pairs that can be merged.
1345   if (isPreLdStPairCandidate(FirstMI, MI))
1346     return true;
1347 
1348   // Try to match an unscaled load/store with a scaled load/store.
1349   return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1350          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1351 
1352   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1353 }
1354 
1355 static bool
1356 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1357                  SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1358                  const TargetRegisterInfo *TRI) {
1359   if (!FirstMI.mayStore())
1360     return false;
1361 
1362   // Check if we can find an unused register which we can use to rename
1363   // the register used by the first load/store.
1364   auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1365   MachineFunction &MF = *FirstMI.getParent()->getParent();
1366   if (!RegClass || !MF.getRegInfo().tracksLiveness())
1367     return false;
1368 
1369   auto RegToRename = getLdStRegOp(FirstMI).getReg();
1370   // For now, we only rename if the store operand gets killed at the store.
1371   if (!getLdStRegOp(FirstMI).isKill() &&
1372       !any_of(FirstMI.operands(),
1373               [TRI, RegToRename](const MachineOperand &MOP) {
1374                 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1375                        MOP.isImplicit() && MOP.isKill() &&
1376                        TRI->regsOverlap(RegToRename, MOP.getReg());
1377               })) {
1378     LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
1379     return false;
1380   }
1381   auto canRenameMOP = [TRI](const MachineOperand &MOP) {
1382     if (MOP.isReg()) {
1383       auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1384       // Renaming registers with multiple disjunct sub-registers (e.g. the
1385       // result of a LD3) means that all sub-registers are renamed, potentially
1386       // impacting other instructions we did not check. Bail out.
1387       // Note that this relies on the structure of the AArch64 register file. In
1388       // particular, a subregister cannot be written without overwriting the
1389       // whole register.
1390       if (RegClass->HasDisjunctSubRegs) {
1391         LLVM_DEBUG(
1392             dbgs()
1393             << "  Cannot rename operands with multiple disjunct subregisters ("
1394             << MOP << ")\n");
1395         return false;
1396       }
1397     }
1398     return MOP.isImplicit() ||
1399            (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1400   };
1401 
1402   bool FoundDef = false;
1403 
1404   // For each instruction between FirstMI and the previous def for RegToRename,
1405   // we
1406   // * check if we can rename RegToRename in this instruction
1407   // * collect the registers used and required register classes for RegToRename.
1408   std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1409                                                            bool IsDef) {
1410     LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
1411     // Currently we do not try to rename across frame-setup instructions.
1412     if (MI.getFlag(MachineInstr::FrameSetup)) {
1413       LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions currently ("
1414                         << MI << ")\n");
1415       return false;
1416     }
1417 
1418     UsedInBetween.accumulate(MI);
1419 
1420     // For a definition, check that we can rename the definition and exit the
1421     // loop.
1422     FoundDef = IsDef;
1423 
1424     // For defs, check if we can rename the first def of RegToRename.
1425     if (FoundDef) {
1426       // For some pseudo instructions, we might not generate code in the end
1427       // (e.g. KILL) and we would end up without a correct def for the rename
1428       // register.
1429       // TODO: This might be overly conservative and we could handle those cases
1430       // in multiple ways:
1431       //       1. Insert an extra copy, to materialize the def.
1432   //       2. Skip pseudo-defs until we find a non-pseudo def.
1433       if (MI.isPseudo()) {
1434         LLVM_DEBUG(dbgs() << "  Cannot rename pseudo instruction " << MI
1435                           << "\n");
1436         return false;
1437       }
1438 
1439       for (auto &MOP : MI.operands()) {
1440         if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1441             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1442           continue;
1443         if (!canRenameMOP(MOP)) {
1444           LLVM_DEBUG(dbgs()
1445                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1446           return false;
1447         }
1448         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1449       }
1450       return true;
1451     } else {
1452       for (auto &MOP : MI.operands()) {
1453         if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1454             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1455           continue;
1456 
1457         if (!canRenameMOP(MOP)) {
1458           LLVM_DEBUG(dbgs()
1459                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1460           return false;
1461         }
1462         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1463       }
1464     }
1465     return true;
1466   };
1467 
1468   if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1469     return false;
1470 
1471   if (!FoundDef) {
1472     LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
1473     return false;
1474   }
1475   return true;
1476 }
1477 
1478 // Check if we can find a physical register for renaming \p Reg. This register
1479 // must:
1480 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1481 //   defined registers up to the point where the renamed register will be used,
1482   // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1483   //   registers in the range in which the rename register will be used,
1484   // * be available in all required register classes (checked using RequiredClasses).
1485 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1486     const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1487     LiveRegUnits &UsedInBetween,
1488     SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1489     const TargetRegisterInfo *TRI) {
1490   const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1491 
1492   // Checks if any sub- or super-register of PR is callee saved.
1493   auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1494     return any_of(TRI->sub_and_superregs_inclusive(PR),
1495                   [&MF, TRI](MCPhysReg SubOrSuper) {
1496                     return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1497                   });
1498   };
1499 
1500   // Check if PR or one of its sub- or super-registers can be used for all
1501   // required register classes.
1502   auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1503     return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1504       return any_of(TRI->sub_and_superregs_inclusive(PR),
1505                     [C, TRI](MCPhysReg SubOrSuper) {
1506                       return C == TRI->getMinimalPhysRegClass(SubOrSuper);
1507                     });
1508     });
1509   };
1510 
1511   auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1512   for (const MCPhysReg &PR : *RegClass) {
1513     if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1514         !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1515         CanBeUsedForAllClasses(PR)) {
1516       DefinedInBB.addReg(PR);
1517       LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1518                         << "\n");
1519       return {PR};
1520     }
1521   }
1522   LLVM_DEBUG(dbgs() << "No rename register found from "
1523                     << TRI->getRegClassName(RegClass) << "\n");
1524   return std::nullopt;
1525 }
1526 
1527 /// Scan the instructions looking for a load/store that can be combined with the
1528 /// current instruction into a wider equivalent or a load/store pair.
1529 MachineBasicBlock::iterator
1530 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1531                                       LdStPairFlags &Flags, unsigned Limit,
1532                                       bool FindNarrowMerge) {
1533   MachineBasicBlock::iterator E = I->getParent()->end();
1534   MachineBasicBlock::iterator MBBI = I;
1535   MachineBasicBlock::iterator MBBIWithRenameReg;
1536   MachineInstr &FirstMI = *I;
1537   MBBI = next_nodbg(MBBI, E);
1538 
1539   bool MayLoad = FirstMI.mayLoad();
1540   bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1541   Register Reg = getLdStRegOp(FirstMI).getReg();
1542   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
1543   int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
1544   int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1545   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1546 
1547   std::optional<bool> MaybeCanRename;
1548   if (!EnableRenaming)
1549     MaybeCanRename = {false};
1550 
1551   SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1552   LiveRegUnits UsedInBetween;
1553   UsedInBetween.init(*TRI);
1554 
1555   Flags.clearRenameReg();
1556 
1557   // Track which register units have been modified and used between the first
1558   // insn (inclusive) and the second insn.
1559   ModifiedRegUnits.clear();
1560   UsedRegUnits.clear();
1561 
1562   // Remember any instructions that read/write memory between FirstMI and MI.
1563   SmallVector<MachineInstr *, 4> MemInsns;
1564 
1565   for (unsigned Count = 0; MBBI != E && Count < Limit;
1566        MBBI = next_nodbg(MBBI, E)) {
1567     MachineInstr &MI = *MBBI;
1568 
1569     UsedInBetween.accumulate(MI);
1570 
1571     // Don't count transient instructions towards the search limit since there
1572     // may be different numbers of them if e.g. debug information is present.
1573     if (!MI.isTransient())
1574       ++Count;
1575 
1576     Flags.setSExtIdx(-1);
1577     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1578         AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1579       assert(MI.mayLoadOrStore() && "Expected memory operation.");
1580       // If we've found another instruction with the same opcode, check to see
1581       // if the base and offset are compatible with our starting instruction.
1582       // These instructions all have scaled immediate operands, so we just
1583       // check for +1/-1. Make sure to check the new instruction offset is
1584       // actually an immediate and not a symbolic reference destined for
1585       // a relocation.
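           // For instance (illustrative): "ldr x0, [x2, #8]" and
           // "ldr x1, [x2, #16]" carry the scaled immediates 1 and 2, which
           // differ by exactly one unit, so they are adjacent candidates for an
           // ldp.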
1586       Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1587       int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1588       bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1589       if (IsUnscaled != MIIsUnscaled) {
1590         // We're trying to pair instructions that differ in how they are scaled.
1591         // If FirstMI is scaled then scale the offset of MI accordingly.
1592         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
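             // Illustrative example: pairing the scaled "ldr x0, [x2, #8]"
             // (immediate 1) with the unscaled "ldur x1, [x2, #16]": 16 is a
             // multiple of the 8-byte access size, so it rescales to 2 and the
             // offsets 1 and 2 can be compared directly.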
1593         int MemSize = TII->getMemScale(MI);
1594         if (MIIsUnscaled) {
1595           // If the unscaled offset isn't a multiple of the MemSize, we can't
1596           // pair the operations together: bail and keep looking.
1597           if (MIOffset % MemSize) {
1598             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1599                                               UsedRegUnits, TRI);
1600             MemInsns.push_back(&MI);
1601             continue;
1602           }
1603           MIOffset /= MemSize;
1604         } else {
1605           MIOffset *= MemSize;
1606         }
1607       }
1608 
1609       bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1610 
1611       if (BaseReg == MIBaseReg) {
1612         // If the offset of the second ld/st is not equal to the size of the
1613         // destination register, it can't form a pre-indexed ld/st pair with
1614         // the first. Additionally, if the base reg is used or modified, the
1615         // operations can't be paired: bail and keep looking.
1616         if (IsPreLdSt) {
1617           bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1618           bool IsBaseRegUsed = !UsedRegUnits.available(
1619               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1620           bool IsBaseRegModified = !ModifiedRegUnits.available(
1621               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1622           // If the stored value and the base register of the second
1623           // instruction are the same, it needs to be using the updated
1624           // register and therefore it must not be folded.
1625           bool IsMIRegTheSame =
1626               TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1627                                AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1628           if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1629               IsMIRegTheSame) {
1630             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1631                                               UsedRegUnits, TRI);
1632             MemInsns.push_back(&MI);
1633             continue;
1634           }
1635         } else {
1636           if ((Offset != MIOffset + OffsetStride) &&
1637               (Offset + OffsetStride != MIOffset)) {
1638             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1639                                               UsedRegUnits, TRI);
1640             MemInsns.push_back(&MI);
1641             continue;
1642           }
1643         }
1644 
1645         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1646         if (FindNarrowMerge) {
1647           // If the alignment requirements of the scaled wide load/store
1648           // instruction can't express the offset of the scaled narrow input,
1649           // bail and keep looking. For promotable zero stores, allow only when
1650           // the stored value is the same (i.e., WZR).
1651           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1652               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1653             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1654                                               UsedRegUnits, TRI);
1655             MemInsns.push_back(&MI);
1656             continue;
1657           }
1658         } else {
1659           // Pairwise instructions have a 7-bit signed offset field. Single
1660           // insns have a 12-bit unsigned offset field.  If the resultant
1661           // immediate offset of merging these instructions is out of range for
1662           // a pairwise instruction, bail and keep looking.
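               // For example (illustrative numbers): 64-bit LDP/STP immediates
               // cover [-64, 63] doublewords, i.e. [-512, 504] bytes, while a
               // single LDRXui reaches up to #32760, so two LDRXui at, say,
               // #1024 and #1032 cannot be merged into an ldp.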
1663           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1664             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1665                                               UsedRegUnits, TRI);
1666             MemInsns.push_back(&MI);
1667             continue;
1668           }
1669           // If the alignment requirements of the paired (scaled) instruction
1670           // can't express the offset of the unscaled input, bail and keep
1671           // looking.
1672           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1673             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1674                                               UsedRegUnits, TRI);
1675             MemInsns.push_back(&MI);
1676             continue;
1677           }
1678         }
1679         // If the destination register of one load is the same register or a
1680         // sub/super register of the other load, bail and keep looking. A
1681         // load-pair instruction with both destination registers the same is
1682         // UNPREDICTABLE and may result in an exception.
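             // For instance, we must not form "ldp x0, x0, [x1]" from two loads
             // that both target x0.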
1683         if (MayLoad &&
1684             TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
1685           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1686                                             TRI);
1687           MemInsns.push_back(&MI);
1688           continue;
1689         }
1690 
1691         // If the BaseReg has been modified, then we cannot do the optimization.
1692         // For example, in the following pattern
1693         //   ldr x1, [x2]
1694         //   ldr x2, [x3]
1695         //   ldr x4, [x2, #8],
1696         // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1697         if (!ModifiedRegUnits.available(BaseReg))
1698           return E;
1699 
1700         // If the Rt of the second instruction was not modified or used between
1701         // the two instructions and none of the instructions between the second
1702         // and first alias with the second, we can combine the second into the
1703         // first.
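             // Sketch (illustrative): "ldr x0, [x2]" ... "ldr x1, [x2, #8]" with
             // no intervening def or use of x1 and no aliasing memory operation
             // becomes "ldp x0, x1, [x2]" at the position of the first load.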
1704         if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1705             !(MI.mayLoad() &&
1706               !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1707             !mayAlias(MI, MemInsns, AA)) {
1708 
1709           Flags.setMergeForward(false);
1710           Flags.clearRenameReg();
1711           return MBBI;
1712         }
1713 
1714         // Likewise, if the Rt of the first instruction is not modified or used
1715         // between the two instructions and none of the instructions between the
1716         // first and the second alias with the first, we can combine the first
1717         // into the second.
1718         if (!(MayLoad &&
1719               !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1720             !mayAlias(FirstMI, MemInsns, AA)) {
1721 
1722           if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1723             Flags.setMergeForward(true);
1724             Flags.clearRenameReg();
1725             return MBBI;
1726           }
1727 
1728           if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1729             if (!MaybeCanRename)
1730               MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1731                                                  RequiredClasses, TRI)};
1732 
1733             if (*MaybeCanRename) {
1734               std::optional<MCPhysReg> MaybeRenameReg =
1735                   tryToFindRegisterToRename(*FirstMI.getParent()->getParent(),
1736                                             Reg, DefinedInBB, UsedInBetween,
1737                                             RequiredClasses, TRI);
1738               if (MaybeRenameReg) {
1739                 Flags.setRenameReg(*MaybeRenameReg);
1740                 Flags.setMergeForward(true);
1741                 MBBIWithRenameReg = MBBI;
1742               }
1743             }
1744           }
1745         }
1746         // Unable to combine these instructions due to interference in between.
1747         // Keep looking.
1748       }
1749     }
1750 
1751     if (Flags.getRenameReg())
1752       return MBBIWithRenameReg;
1753 
1754     // The instruction wasn't a matching load or store. Stop searching if we
1755     // encounter a call instruction that might modify memory.
1756     if (MI.isCall())
1757       return E;
1758 
1759     // Update modified / uses register units.
1760     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1761 
1762     // Otherwise, if the base register is modified, we have no match, so
1763     // return early.
1764     if (!ModifiedRegUnits.available(BaseReg))
1765       return E;
1766 
1767     // Update list of instructions that read/write memory.
1768     if (MI.mayLoadOrStore())
1769       MemInsns.push_back(&MI);
1770   }
1771   return E;
1772 }
1773 
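     // Illustrative sketch (hypothetical frame-setup sequence): when an SP
     // update is folded into a pre-indexed store, a CFA-defining CFI
     // instruction sitting between the update and the store is moved after the
     // merged instruction so the unwind info still follows the SP adjustment:
     //   sub sp, sp, #16
     //   .cfi_def_cfa_offset 16
     //   stp x29, x30, [sp]
     //   ; becomes
     //   stp x29, x30, [sp, #-16]!
     //   .cfi_def_cfa_offset 16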
1774 static MachineBasicBlock::iterator
1775 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1776   auto End = MI.getParent()->end();
1777   if (MaybeCFI == End ||
1778       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1779       !(MI.getFlag(MachineInstr::FrameSetup) ||
1780         MI.getFlag(MachineInstr::FrameDestroy)) ||
1781       AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1782     return End;
1783 
1784   const MachineFunction &MF = *MI.getParent()->getParent();
1785   unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1786   const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1787   switch (CFI.getOperation()) {
1788   case MCCFIInstruction::OpDefCfa:
1789   case MCCFIInstruction::OpDefCfaOffset:
1790     return MaybeCFI;
1791   default:
1792     return End;
1793   }
1794 }
1795 
1796 MachineBasicBlock::iterator
1797 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1798                                      MachineBasicBlock::iterator Update,
1799                                      bool IsPreIdx) {
1800   assert((Update->getOpcode() == AArch64::ADDXri ||
1801           Update->getOpcode() == AArch64::SUBXri) &&
1802          "Unexpected base register update instruction to merge!");
1803   MachineBasicBlock::iterator E = I->getParent()->end();
1804   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1805 
1806   // If we are updating the SP and the following instruction is a CFA
1807   // offset-related CFI instruction, move it after the merged instruction.
1808   MachineBasicBlock::iterator CFI =
1809       IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1810 
1811   // Return the instruction following the merged instruction, which is
1812   // the instruction following our unmerged load, unless that's the add/sub
1813   // instruction we're merging, in which case it's the one after that.
1814   if (NextI == Update)
1815     NextI = next_nodbg(NextI, E);
1816 
1817   int Value = Update->getOperand(2).getImm();
1818   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1819          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1820   if (Update->getOpcode() == AArch64::SUBXri)
1821     Value = -Value;
1822 
1823   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1824                              : getPostIndexedOpcode(I->getOpcode());
1825   MachineInstrBuilder MIB;
1826   int Scale, MinOffset, MaxOffset;
1827   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
1828   if (!AArch64InstrInfo::isPairedLdSt(*I)) {
1829     // Non-paired instruction.
1830     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1831               .add(getLdStRegOp(*Update))
1832               .add(getLdStRegOp(*I))
1833               .add(AArch64InstrInfo::getLdStBaseOp(*I))
1834               .addImm(Value / Scale)
1835               .setMemRefs(I->memoperands())
1836               .setMIFlags(I->mergeFlagsWith(*Update));
1837   } else {
1838     // Paired instruction.
1839     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1840               .add(getLdStRegOp(*Update))
1841               .add(getLdStRegOp(*I, 0))
1842               .add(getLdStRegOp(*I, 1))
1843               .add(AArch64InstrInfo::getLdStBaseOp(*I))
1844               .addImm(Value / Scale)
1845               .setMemRefs(I->memoperands())
1846               .setMIFlags(I->mergeFlagsWith(*Update));
1847   }
1848   if (CFI != E) {
1849     MachineBasicBlock *MBB = I->getParent();
1850     MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
1851   }
1852 
1853   if (IsPreIdx) {
1854     ++NumPreFolded;
1855     LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1856   } else {
1857     ++NumPostFolded;
1858     LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1859   }
1860   LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
1861   LLVM_DEBUG(I->print(dbgs()));
1862   LLVM_DEBUG(dbgs() << "    ");
1863   LLVM_DEBUG(Update->print(dbgs()));
1864   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1865   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1866   LLVM_DEBUG(dbgs() << "\n");
1867 
1868   // Erase the old instructions from the block.
1869   I->eraseFromParent();
1870   Update->eraseFromParent();
1871 
1872   return NextI;
1873 }
1874 
1875 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1876                                                MachineInstr &MI,
1877                                                unsigned BaseReg, int Offset) {
1878   switch (MI.getOpcode()) {
1879   default:
1880     break;
1881   case AArch64::SUBXri:
1882   case AArch64::ADDXri:
1883     // Make sure it's a vanilla immediate operand, not a relocation or
1884     // anything else we can't handle.
1885     if (!MI.getOperand(2).isImm())
1886       break;
1887     // Watch out for 1 << 12 shifted value.
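         // For example, "add x0, x0, #1, lsl #12" adds 4096; the pre-/post-indexed
         // forms have no shifted-immediate variant, so such an update cannot be
         // folded (illustrative example, not an exhaustive rule).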
1888     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1889       break;
1890 
1891     // The update instruction source and destination register must be the
1892     // same as the load/store base register.
1893     if (MI.getOperand(0).getReg() != BaseReg ||
1894         MI.getOperand(1).getReg() != BaseReg)
1895       break;
1896 
1897     int UpdateOffset = MI.getOperand(2).getImm();
1898     if (MI.getOpcode() == AArch64::SUBXri)
1899       UpdateOffset = -UpdateOffset;
1900 
1901     // The immediate must be a multiple of the scaling factor of the pre/post
1902     // indexed instruction.
1903     int Scale, MinOffset, MaxOffset;
1904     getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
1905     if (UpdateOffset % Scale != 0)
1906       break;
1907 
1908     // Scaled offset must fit in the instruction immediate.
1909     int ScaledOffset = UpdateOffset / Scale;
1910     if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
1911       break;
1912 
1913     // If we have a non-zero Offset, we check that it matches the amount
1914     // we're adding to the register.
1915     if (!Offset || Offset == UpdateOffset)
1916       return true;
1917     break;
1918   }
1919   return false;
1920 }
1921 
1922 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1923     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1924   MachineBasicBlock::iterator E = I->getParent()->end();
1925   MachineInstr &MemMI = *I;
1926   MachineBasicBlock::iterator MBBI = I;
1927 
1928   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
1929   int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
1930                          TII->getMemScale(MemMI);
1931 
1932   // Scan forward looking for post-index opportunities.  Updating instructions
1933   // can't be formed if the memory instruction doesn't have the offset we're
1934   // looking for.
1935   if (MIUnscaledOffset != UnscaledOffset)
1936     return E;
1937 
1938   // If the base register overlaps a source/destination register, we can't
1939   // merge the update. This does not apply to tag store instructions, which
1940   // ignore the address part of the source register.
1941   // This also does not apply to STGPi, which, unlike normal stores, has no
1942   // unpredictable behavior in this case and always performs writeback after
1943   // reading the source register value.
1944   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
1945     bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
1946     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1947       Register DestReg = getLdStRegOp(MemMI, i).getReg();
1948       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1949         return E;
1950     }
1951   }
1952 
1953   // Track which register units have been modified and used between the first
1954   // insn (inclusive) and the second insn.
1955   ModifiedRegUnits.clear();
1956   UsedRegUnits.clear();
1957   MBBI = next_nodbg(MBBI, E);
1958 
1959   // We can't post-increment the stack pointer if any instruction between
1960   // the memory access (I) and the increment (MBBI) can access the memory
1961   // region defined by [SP, MBBI].
1962   const bool BaseRegSP = BaseReg == AArch64::SP;
1963   if (BaseRegSP && needsWinCFI(I->getMF())) {
1964     // FIXME: For now, we always block the optimization over SP in Windows
1965     // targets as it requires adjusting the unwind/debug info; messing up
1966     // the unwind info can actually cause a miscompile.
1967     return E;
1968   }
1969 
1970   for (unsigned Count = 0; MBBI != E && Count < Limit;
1971        MBBI = next_nodbg(MBBI, E)) {
1972     MachineInstr &MI = *MBBI;
1973 
1974     // Don't count transient instructions towards the search limit since there
1975     // may be different numbers of them if e.g. debug information is present.
1976     if (!MI.isTransient())
1977       ++Count;
1978 
1979     // If we found a match, return it.
1980     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1981       return MBBI;
1982 
1983     // Update the status of what the instruction clobbered and used.
1984     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1985 
1986     // Otherwise, if the base register is used or modified, we have no match, so
1987     // return early.
1988     // If we are optimizing SP, do not allow instructions that may load or store
1989     // in between the load and the optimized value update.
1990     if (!ModifiedRegUnits.available(BaseReg) ||
1991         !UsedRegUnits.available(BaseReg) ||
1992         (BaseRegSP && MBBI->mayLoadOrStore()))
1993       return E;
1994   }
1995   return E;
1996 }
1997 
1998 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1999     MachineBasicBlock::iterator I, unsigned Limit) {
2000   MachineBasicBlock::iterator B = I->getParent()->begin();
2001   MachineBasicBlock::iterator E = I->getParent()->end();
2002   MachineInstr &MemMI = *I;
2003   MachineBasicBlock::iterator MBBI = I;
2004   MachineFunction &MF = *MemMI.getMF();
2005 
2006   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2007   int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2008 
2009   // If the load/store is the first instruction in the block, there's obviously
2010   // not any matching update. Ditto if the memory offset isn't zero.
2011   if (MBBI == B || Offset != 0)
2012     return E;
2013   // If the base register overlaps a destination register, we can't
2014   // merge the update.
2015   if (!isTagStore(MemMI)) {
2016     bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2017     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2018       Register DestReg = getLdStRegOp(MemMI, i).getReg();
2019       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2020         return E;
2021     }
2022   }
2023 
2024   const bool BaseRegSP = BaseReg == AArch64::SP;
2025   if (BaseRegSP && needsWinCFI(I->getMF())) {
2026     // FIXME: For now, we always block the optimization over SP in Windows
2027     // targets as it requires adjusting the unwind/debug info; messing up
2028     // the unwind info can actually cause a miscompile.
2029     return E;
2030   }
2031 
2032   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2033   unsigned RedZoneSize =
2034       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2035 
2036   // Track which register units have been modified and used between the first
2037   // insn (inclusive) and the second insn.
2038   ModifiedRegUnits.clear();
2039   UsedRegUnits.clear();
2040   unsigned Count = 0;
2041   bool MemAccessBeforeSPPreInc = false;
2042   do {
2043     MBBI = prev_nodbg(MBBI, B);
2044     MachineInstr &MI = *MBBI;
2045 
2046     // Don't count transient instructions towards the search limit since there
2047     // may be different numbers of them if e.g. debug information is present.
2048     if (!MI.isTransient())
2049       ++Count;
2050 
2051     // If we found a match, return it.
2052     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2053       // Check that the update value is within our red zone limit (which may be
2054       // zero).
2055       if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2056         return E;
2057       return MBBI;
2058     }
2059 
2060     // Update the status of what the instruction clobbered and used.
2061     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2062 
2063     // Otherwise, if the base register is used or modified, we have no match, so
2064     // return early.
2065     if (!ModifiedRegUnits.available(BaseReg) ||
2066         !UsedRegUnits.available(BaseReg))
2067       return E;
2068     // Keep track of whether we have a memory access before an SP pre-increment;
2069     // in this case we need to validate later that the update amount respects the
2070     // red zone.
2071     if (BaseRegSP && MBBI->mayLoadOrStore())
2072       MemAccessBeforeSPPreInc = true;
2073   } while (MBBI != B && Count < Limit);
2074   return E;
2075 }
2076 
2077 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2078     MachineBasicBlock::iterator &MBBI) {
2079   MachineInstr &MI = *MBBI;
2080   // If this is a volatile load, don't mess with it.
2081   if (MI.hasOrderedMemoryRef())
2082     return false;
2083 
2084   if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
2085     return false;
2086 
2087   // Make sure this is a reg+imm.
2088   // FIXME: It is possible to extend it to handle reg+reg cases.
2089   if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2090     return false;
2091 
2092   // Look backward up to LdStLimit instructions.
2093   MachineBasicBlock::iterator StoreI;
2094   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2095     ++NumLoadsFromStoresPromoted;
2096     // Promote the load. Keeping the iterator straight is a
2097     // pain, so we let the merge routine tell us what the next instruction
2098     // is after it's done mucking about.
2099     MBBI = promoteLoadFromStore(MBBI, StoreI);
2100     return true;
2101   }
2102   return false;
2103 }
2104 
2105 // Merge adjacent zero stores into a wider store.
2106 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2107     MachineBasicBlock::iterator &MBBI) {
2108   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2109   MachineInstr &MI = *MBBI;
2110   MachineBasicBlock::iterator E = MI.getParent()->end();
2111 
2112   if (!TII->isCandidateToMergeOrPair(MI))
2113     return false;
2114 
2115   // Look ahead up to LdStLimit instructions for a mergeable instruction.
2116   LdStPairFlags Flags;
2117   MachineBasicBlock::iterator MergeMI =
2118       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2119   if (MergeMI != E) {
2120     ++NumZeroStoresPromoted;
2121 
2122     // Keeping the iterator straight is a pain, so we let the merge routine tell
2123     // us what the next instruction is after it's done mucking about.
2124     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2125     return true;
2126   }
2127   return false;
2128 }
2129 
2130 // Find loads and stores that can be merged into a single load or store pair
2131 // instruction.
2132 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2133   MachineInstr &MI = *MBBI;
2134   MachineBasicBlock::iterator E = MI.getParent()->end();
2135 
2136   if (!TII->isCandidateToMergeOrPair(MI))
2137     return false;
2138 
2139   // Early exit if the offset is not possible to match. (6 bits of positive
2140   // range, plus allow an extra one in case we find a later insn that matches
2141   // with Offset-1)
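       // For illustration (hypothetical values): a scaled 64-bit store at
       // immediate offset 100 can never find a partner, since even 100 - 1 = 99
       // is outside the signed 7-bit pair range of [-64, 63].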
2142   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2143   int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2144   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2145   // Allow one more for offset.
2146   if (Offset > 0)
2147     Offset -= OffsetStride;
2148   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2149     return false;
2150 
2151   // Look ahead up to LdStLimit instructions for a pairable instruction.
2152   LdStPairFlags Flags;
2153   MachineBasicBlock::iterator Paired =
2154       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2155   if (Paired != E) {
2156     ++NumPairCreated;
2157     if (TII->hasUnscaledLdStOffset(MI))
2158       ++NumUnscaledPairCreated;
2159     // Keeping the iterator straight is a pain, so we let the merge routine tell
2160     // us what the next instruction is after it's done mucking about.
2161     auto Prev = std::prev(MBBI);
2162     MBBI = mergePairedInsns(MBBI, Paired, Flags);
2163     // Collect liveness info for instructions between Prev and the new position
2164     // MBBI.
2165     for (auto I = std::next(Prev); I != MBBI; I++)
2166       updateDefinedRegisters(*I, DefinedInBB, TRI);
2167 
2168     return true;
2169   }
2170   return false;
2171 }
2172 
2173 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
2174     MachineBasicBlock::iterator &MBBI) {
2175   MachineInstr &MI = *MBBI;
2176   MachineBasicBlock::iterator E = MI.getParent()->end();
2177   MachineBasicBlock::iterator Update;
2178 
2179   // Look forward to try to form a post-index instruction. For example,
2180   // ldr x0, [x20]
2181   // add x20, x20, #32
2182   //   merged into:
2183   // ldr x0, [x20], #32
2184   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2185   if (Update != E) {
2186     // Merge the update into the ld/st.
2187     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2188     return true;
2189   }
2190 
2191   // Don't know how to handle unscaled pre/post-index versions below, so bail.
2192   if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2193     return false;
2194 
2195   // Look back to try to find a pre-index instruction. For example,
2196   // add x0, x0, #8
2197   // ldr x1, [x0]
2198   //   merged into:
2199   // ldr x1, [x0, #8]!
2200   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2201   if (Update != E) {
2202     // Merge the update into the ld/st.
2203     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2204     return true;
2205   }
2206 
2207   // The immediate in the load/store is scaled by the size of the memory
2208   // operation. The immediate in the add we're looking for,
2209   // however, is not, so adjust here.
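       // For example (illustrative): "ldr x1, [x0, #64]" encodes the scaled
       // immediate 8 (byte offset 64 divided by the 8-byte access size), so to
       // match "add x0, x0, #64" we compare against 8 * 8 = 64 unscaled bytes.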
2210   int UnscaledOffset =
2211       AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2212 
2213   // Look forward to try to find a pre-index instruction. For example,
2214   // ldr x1, [x0, #64]
2215   // add x0, x0, #64
2216   //   merged into:
2217   // ldr x1, [x0, #64]!
2218   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2219   if (Update != E) {
2220     // Merge the update into the ld/st.
2221     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2222     return true;
2223   }
2224 
2225   return false;
2226 }
2227 
2228 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2229                                         bool EnableNarrowZeroStOpt) {
2230 
2231   bool Modified = false;
2232   // Four transformations to do here:
2233   // 1) Find loads that directly read from stores and promote them by
2234   //    replacing with mov instructions. If the store is wider than the load,
2235   //    the load will be replaced with a bitfield extract.
2236   //      e.g.,
2237   //        str w1, [x0, #4]
2238   //        ldrh w2, [x0, #6]
2239   //        ; becomes
2240   //        str w1, [x0, #4]
2241   //        lsr w2, w1, #16
2242   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2243        MBBI != E;) {
2244     if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2245       Modified = true;
2246     else
2247       ++MBBI;
2248   }
2249   // 2) Merge adjacent zero stores into a wider store.
2250   //      e.g.,
2251   //        strh wzr, [x0]
2252   //        strh wzr, [x0, #2]
2253   //        ; becomes
2254   //        str wzr, [x0]
2255   //      e.g.,
2256   //        str wzr, [x0]
2257   //        str wzr, [x0, #4]
2258   //        ; becomes
2259   //        str xzr, [x0]
2260   if (EnableNarrowZeroStOpt)
2261     for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2262          MBBI != E;) {
2263       if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2264         Modified = true;
2265       else
2266         ++MBBI;
2267     }
2268   // 3) Find loads and stores that can be merged into a single load or store
2269   //    pair instruction.
2270   //      e.g.,
2271   //        ldr x0, [x2]
2272   //        ldr x1, [x2, #8]
2273   //        ; becomes
2274   //        ldp x0, x1, [x2]
2275 
2276   if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2277     DefinedInBB.clear();
2278     DefinedInBB.addLiveIns(MBB);
2279   }
2280 
2281   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2282        MBBI != E;) {
2283     // Track currently live registers up to this point, to help with
2284     // searching for a rename register on demand.
2285     updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2286     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2287       Modified = true;
2288     else
2289       ++MBBI;
2290   }
2291   // 4) Find base register updates that can be merged into the load or store
2292   //    as a base-reg writeback.
2293   //      e.g.,
2294   //        ldr x0, [x2]
2295   //        add x2, x2, #4
2296   //        ; becomes
2297   //        ldr x0, [x2], #4
2298   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2299        MBBI != E;) {
2300     if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2301       Modified = true;
2302     else
2303       ++MBBI;
2304   }
2305 
2306   return Modified;
2307 }
2308 
2309 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2310   if (skipFunction(Fn.getFunction()))
2311     return false;
2312 
2313   Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2314   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2315   TRI = Subtarget->getRegisterInfo();
2316   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2317 
2318   // Resize the modified and used register unit trackers.  We do this once
2319   // per function and then clear the register units each time we optimize a load
2320   // or store.
2321   ModifiedRegUnits.init(*TRI);
2322   UsedRegUnits.init(*TRI);
2323   DefinedInBB.init(*TRI);
2324 
2325   bool Modified = false;
2326   bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2327   for (auto &MBB : Fn) {
2328     auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2329     Modified |= M;
2330   }
2331 
2332   return Modified;
2333 }
2334 
2335 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2336 // stores near one another?  Note: The pre-RA instruction scheduler already has
2337 // hooks to try and schedule pairable loads/stores together to improve pairing
2338 // opportunities.  Thus, a pre-RA pairing pass may not be worth the effort.
2339 
2340 // FIXME: When pairing store instructions it's very possible for this pass to
2341 // hoist a store with a KILL marker above another use (without a KILL marker).
2342 // The resulting IR is invalid, but nothing uses the KILL markers after this
2343 // pass, so it's never caused a problem in practice.
2344 
2345 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2346 /// load / store optimization pass.
2347 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2348   return new AArch64LoadStoreOpt();
2349 }
2350