xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (revision 63f537551380d2dab29fa402ad1269feae17e594)
1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
13 // instructions. In order to preserve the correctness of the unwind informaiton,
14 // the pass should not change the order of any two instructions, one of which
15 // has the FrameSetup/FrameDestroy flag or, alternatively, apply an add-hoc fix
16 // to unwind information.
17 //
18 //===----------------------------------------------------------------------===//
19 
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/BitVector.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/ADT/iterator_range.h"
29 #include "llvm/Analysis/AliasAnalysis.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineFunctionPass.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/MC/MCAsmInfo.h"
40 #include "llvm/MC/MCDwarf.h"
41 #include "llvm/MC/MCRegisterInfo.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/DebugCounter.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <functional>
51 #include <iterator>
52 #include <limits>
53 #include <optional>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "aarch64-ldst-opt"
58 
59 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
60 STATISTIC(NumPostFolded, "Number of post-index updates folded");
61 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
62 STATISTIC(NumUnscaledPairCreated,
63           "Number of load/store from unscaled generated");
64 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
65 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
66 
67 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
68               "Controls which pairs are considered for renaming");
69 
70 // The LdStLimit limits how far we search for load/store pairs.
71 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
72                                    cl::init(20), cl::Hidden);
73 
74 // The UpdateLimit limits how far we search for update instructions when we form
75 // pre-/post-index instructions.
76 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
77                                      cl::Hidden);
78 
79 // Enable register renaming to find additional store pairing opportunities.
80 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
81                                     cl::init(true), cl::Hidden);
82 
83 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
84 
85 namespace {
86 
87 using LdStPairFlags = struct LdStPairFlags {
88   // If a matching instruction is found, MergeForward is set to true if the
89   // merge is to remove the first instruction and replace the second with
90   // a pair-wise insn, and false if the reverse is true.
91   bool MergeForward = false;
92 
93   // SExtIdx gives the index of the result of the load pair that must be
94   // extended. The value of SExtIdx assumes that the paired load produces the
95   // value in this order: (I, returned iterator), i.e., -1 means no value has
96   // to be extended, 0 means I, and 1 means the returned iterator.
97   int SExtIdx = -1;
98 
99   // If not none, RenameReg can be used to rename the result register of the
100   // first store in a pair. Currently this only works when merging stores
101   // forward.
102   std::optional<MCPhysReg> RenameReg;
103 
104   LdStPairFlags() = default;
105 
106   void setMergeForward(bool V = true) { MergeForward = V; }
107   bool getMergeForward() const { return MergeForward; }
108 
109   void setSExtIdx(int V) { SExtIdx = V; }
110   int getSExtIdx() const { return SExtIdx; }
111 
112   void setRenameReg(MCPhysReg R) { RenameReg = R; }
113   void clearRenameReg() { RenameReg = std::nullopt; }
114   std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
115 };
116 
117 struct AArch64LoadStoreOpt : public MachineFunctionPass {
118   static char ID;
119 
120   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
121     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
122   }
123 
124   AliasAnalysis *AA;
125   const AArch64InstrInfo *TII;
126   const TargetRegisterInfo *TRI;
127   const AArch64Subtarget *Subtarget;
128 
129   // Track which register units have been modified and used.
130   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
131   LiveRegUnits DefinedInBB;
132 
133   void getAnalysisUsage(AnalysisUsage &AU) const override {
134     AU.addRequired<AAResultsWrapperPass>();
135     MachineFunctionPass::getAnalysisUsage(AU);
136   }
137 
138   // Scan the instructions looking for a load/store that can be combined
139   // with the current instruction into a load/store pair.
140   // Return the matching instruction if one is found, else MBB->end().
141   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
142                                                LdStPairFlags &Flags,
143                                                unsigned Limit,
144                                                bool FindNarrowMerge);
145 
146   // Scan the instructions looking for a store that writes to the address from
147   // which the current load instruction reads. Return true if one is found.
148   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
149                          MachineBasicBlock::iterator &StoreI);
150 
151   // Merge the two instructions indicated into a wider narrow store instruction.
152   MachineBasicBlock::iterator
153   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
154                         MachineBasicBlock::iterator MergeMI,
155                         const LdStPairFlags &Flags);
156 
157   // Merge the two instructions indicated into a single pair-wise instruction.
158   MachineBasicBlock::iterator
159   mergePairedInsns(MachineBasicBlock::iterator I,
160                    MachineBasicBlock::iterator Paired,
161                    const LdStPairFlags &Flags);
162 
163   // Promote the load that reads directly from the address stored to.
164   MachineBasicBlock::iterator
165   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
166                        MachineBasicBlock::iterator StoreI);
167 
168   // Scan the instruction list to find a base register update that can
169   // be combined with the current instruction (a load or store) using
170   // pre or post indexed addressing with writeback. Scan forwards.
171   MachineBasicBlock::iterator
172   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
173                                 int UnscaledOffset, unsigned Limit);
174 
175   // Scan the instruction list to find a base register update that can
176   // be combined with the current instruction (a load or store) using
177   // pre or post indexed addressing with writeback. Scan backwards.
178   MachineBasicBlock::iterator
179   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
180 
181   // Find an instruction that updates the base register of the ld/st
182   // instruction.
183   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
184                             unsigned BaseReg, int Offset);
185 
186   // Merge a pre- or post-index base register update into a ld/st instruction.
187   MachineBasicBlock::iterator
188   mergeUpdateInsn(MachineBasicBlock::iterator I,
189                   MachineBasicBlock::iterator Update, bool IsPreIdx);
190 
191   // Find and merge zero store instructions.
192   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
193 
194   // Find and pair ldr/str instructions.
195   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
196 
197   // Find and promote load instructions which read directly from store.
198   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
199 
200   // Find and merge a base register updates before or after a ld/st instruction.
201   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
202 
203   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
204 
205   bool runOnMachineFunction(MachineFunction &Fn) override;
206 
207   MachineFunctionProperties getRequiredProperties() const override {
208     return MachineFunctionProperties().set(
209         MachineFunctionProperties::Property::NoVRegs);
210   }
211 
212   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
213 };
214 
215 char AArch64LoadStoreOpt::ID = 0;
216 
217 } // end anonymous namespace
218 
219 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
220                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
221 
222 static bool isNarrowStore(unsigned Opc) {
223   switch (Opc) {
224   default:
225     return false;
226   case AArch64::STRBBui:
227   case AArch64::STURBBi:
228   case AArch64::STRHHui:
229   case AArch64::STURHHi:
230     return true;
231   }
232 }
233 
234 // These instruction set memory tag and either keep memory contents unchanged or
235 // set it to zero, ignoring the address part of the source register.
236 static bool isTagStore(const MachineInstr &MI) {
237   switch (MI.getOpcode()) {
238   default:
239     return false;
240   case AArch64::STGOffset:
241   case AArch64::STZGOffset:
242   case AArch64::ST2GOffset:
243   case AArch64::STZ2GOffset:
244     return true;
245   }
246 }
247 
248 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
249                                          bool *IsValidLdStrOpc = nullptr) {
250   if (IsValidLdStrOpc)
251     *IsValidLdStrOpc = true;
252   switch (Opc) {
253   default:
254     if (IsValidLdStrOpc)
255       *IsValidLdStrOpc = false;
256     return std::numeric_limits<unsigned>::max();
257   case AArch64::STRDui:
258   case AArch64::STURDi:
259   case AArch64::STRDpre:
260   case AArch64::STRQui:
261   case AArch64::STURQi:
262   case AArch64::STRQpre:
263   case AArch64::STRBBui:
264   case AArch64::STURBBi:
265   case AArch64::STRHHui:
266   case AArch64::STURHHi:
267   case AArch64::STRWui:
268   case AArch64::STRWpre:
269   case AArch64::STURWi:
270   case AArch64::STRXui:
271   case AArch64::STRXpre:
272   case AArch64::STURXi:
273   case AArch64::LDRDui:
274   case AArch64::LDURDi:
275   case AArch64::LDRDpre:
276   case AArch64::LDRQui:
277   case AArch64::LDURQi:
278   case AArch64::LDRQpre:
279   case AArch64::LDRWui:
280   case AArch64::LDURWi:
281   case AArch64::LDRWpre:
282   case AArch64::LDRXui:
283   case AArch64::LDURXi:
284   case AArch64::LDRXpre:
285   case AArch64::STRSui:
286   case AArch64::STURSi:
287   case AArch64::STRSpre:
288   case AArch64::LDRSui:
289   case AArch64::LDURSi:
290   case AArch64::LDRSpre:
291     return Opc;
292   case AArch64::LDRSWui:
293     return AArch64::LDRWui;
294   case AArch64::LDURSWi:
295     return AArch64::LDURWi;
296   }
297 }
298 
299 static unsigned getMatchingWideOpcode(unsigned Opc) {
300   switch (Opc) {
301   default:
302     llvm_unreachable("Opcode has no wide equivalent!");
303   case AArch64::STRBBui:
304     return AArch64::STRHHui;
305   case AArch64::STRHHui:
306     return AArch64::STRWui;
307   case AArch64::STURBBi:
308     return AArch64::STURHHi;
309   case AArch64::STURHHi:
310     return AArch64::STURWi;
311   case AArch64::STURWi:
312     return AArch64::STURXi;
313   case AArch64::STRWui:
314     return AArch64::STRXui;
315   }
316 }
317 
318 static unsigned getMatchingPairOpcode(unsigned Opc) {
319   switch (Opc) {
320   default:
321     llvm_unreachable("Opcode has no pairwise equivalent!");
322   case AArch64::STRSui:
323   case AArch64::STURSi:
324     return AArch64::STPSi;
325   case AArch64::STRSpre:
326     return AArch64::STPSpre;
327   case AArch64::STRDui:
328   case AArch64::STURDi:
329     return AArch64::STPDi;
330   case AArch64::STRDpre:
331     return AArch64::STPDpre;
332   case AArch64::STRQui:
333   case AArch64::STURQi:
334     return AArch64::STPQi;
335   case AArch64::STRQpre:
336     return AArch64::STPQpre;
337   case AArch64::STRWui:
338   case AArch64::STURWi:
339     return AArch64::STPWi;
340   case AArch64::STRWpre:
341     return AArch64::STPWpre;
342   case AArch64::STRXui:
343   case AArch64::STURXi:
344     return AArch64::STPXi;
345   case AArch64::STRXpre:
346     return AArch64::STPXpre;
347   case AArch64::LDRSui:
348   case AArch64::LDURSi:
349     return AArch64::LDPSi;
350   case AArch64::LDRSpre:
351     return AArch64::LDPSpre;
352   case AArch64::LDRDui:
353   case AArch64::LDURDi:
354     return AArch64::LDPDi;
355   case AArch64::LDRDpre:
356     return AArch64::LDPDpre;
357   case AArch64::LDRQui:
358   case AArch64::LDURQi:
359     return AArch64::LDPQi;
360   case AArch64::LDRQpre:
361     return AArch64::LDPQpre;
362   case AArch64::LDRWui:
363   case AArch64::LDURWi:
364     return AArch64::LDPWi;
365   case AArch64::LDRWpre:
366     return AArch64::LDPWpre;
367   case AArch64::LDRXui:
368   case AArch64::LDURXi:
369     return AArch64::LDPXi;
370   case AArch64::LDRXpre:
371     return AArch64::LDPXpre;
372   case AArch64::LDRSWui:
373   case AArch64::LDURSWi:
374     return AArch64::LDPSWi;
375   }
376 }
377 
378 static unsigned isMatchingStore(MachineInstr &LoadInst,
379                                 MachineInstr &StoreInst) {
380   unsigned LdOpc = LoadInst.getOpcode();
381   unsigned StOpc = StoreInst.getOpcode();
382   switch (LdOpc) {
383   default:
384     llvm_unreachable("Unsupported load instruction!");
385   case AArch64::LDRBBui:
386     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
387            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
388   case AArch64::LDURBBi:
389     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
390            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
391   case AArch64::LDRHHui:
392     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
393            StOpc == AArch64::STRXui;
394   case AArch64::LDURHHi:
395     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
396            StOpc == AArch64::STURXi;
397   case AArch64::LDRWui:
398     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
399   case AArch64::LDURWi:
400     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
401   case AArch64::LDRXui:
402     return StOpc == AArch64::STRXui;
403   case AArch64::LDURXi:
404     return StOpc == AArch64::STURXi;
405   }
406 }
407 
408 static unsigned getPreIndexedOpcode(unsigned Opc) {
409   // FIXME: We don't currently support creating pre-indexed loads/stores when
410   // the load or store is the unscaled version.  If we decide to perform such an
411   // optimization in the future the cases for the unscaled loads/stores will
412   // need to be added here.
413   switch (Opc) {
414   default:
415     llvm_unreachable("Opcode has no pre-indexed equivalent!");
416   case AArch64::STRSui:
417     return AArch64::STRSpre;
418   case AArch64::STRDui:
419     return AArch64::STRDpre;
420   case AArch64::STRQui:
421     return AArch64::STRQpre;
422   case AArch64::STRBBui:
423     return AArch64::STRBBpre;
424   case AArch64::STRHHui:
425     return AArch64::STRHHpre;
426   case AArch64::STRWui:
427     return AArch64::STRWpre;
428   case AArch64::STRXui:
429     return AArch64::STRXpre;
430   case AArch64::LDRSui:
431     return AArch64::LDRSpre;
432   case AArch64::LDRDui:
433     return AArch64::LDRDpre;
434   case AArch64::LDRQui:
435     return AArch64::LDRQpre;
436   case AArch64::LDRBBui:
437     return AArch64::LDRBBpre;
438   case AArch64::LDRHHui:
439     return AArch64::LDRHHpre;
440   case AArch64::LDRWui:
441     return AArch64::LDRWpre;
442   case AArch64::LDRXui:
443     return AArch64::LDRXpre;
444   case AArch64::LDRSWui:
445     return AArch64::LDRSWpre;
446   case AArch64::LDPSi:
447     return AArch64::LDPSpre;
448   case AArch64::LDPSWi:
449     return AArch64::LDPSWpre;
450   case AArch64::LDPDi:
451     return AArch64::LDPDpre;
452   case AArch64::LDPQi:
453     return AArch64::LDPQpre;
454   case AArch64::LDPWi:
455     return AArch64::LDPWpre;
456   case AArch64::LDPXi:
457     return AArch64::LDPXpre;
458   case AArch64::STPSi:
459     return AArch64::STPSpre;
460   case AArch64::STPDi:
461     return AArch64::STPDpre;
462   case AArch64::STPQi:
463     return AArch64::STPQpre;
464   case AArch64::STPWi:
465     return AArch64::STPWpre;
466   case AArch64::STPXi:
467     return AArch64::STPXpre;
468   case AArch64::STGOffset:
469     return AArch64::STGPreIndex;
470   case AArch64::STZGOffset:
471     return AArch64::STZGPreIndex;
472   case AArch64::ST2GOffset:
473     return AArch64::ST2GPreIndex;
474   case AArch64::STZ2GOffset:
475     return AArch64::STZ2GPreIndex;
476   case AArch64::STGPi:
477     return AArch64::STGPpre;
478   }
479 }
480 
481 static unsigned getPostIndexedOpcode(unsigned Opc) {
482   switch (Opc) {
483   default:
484     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
485   case AArch64::STRSui:
486   case AArch64::STURSi:
487     return AArch64::STRSpost;
488   case AArch64::STRDui:
489   case AArch64::STURDi:
490     return AArch64::STRDpost;
491   case AArch64::STRQui:
492   case AArch64::STURQi:
493     return AArch64::STRQpost;
494   case AArch64::STRBBui:
495     return AArch64::STRBBpost;
496   case AArch64::STRHHui:
497     return AArch64::STRHHpost;
498   case AArch64::STRWui:
499   case AArch64::STURWi:
500     return AArch64::STRWpost;
501   case AArch64::STRXui:
502   case AArch64::STURXi:
503     return AArch64::STRXpost;
504   case AArch64::LDRSui:
505   case AArch64::LDURSi:
506     return AArch64::LDRSpost;
507   case AArch64::LDRDui:
508   case AArch64::LDURDi:
509     return AArch64::LDRDpost;
510   case AArch64::LDRQui:
511   case AArch64::LDURQi:
512     return AArch64::LDRQpost;
513   case AArch64::LDRBBui:
514     return AArch64::LDRBBpost;
515   case AArch64::LDRHHui:
516     return AArch64::LDRHHpost;
517   case AArch64::LDRWui:
518   case AArch64::LDURWi:
519     return AArch64::LDRWpost;
520   case AArch64::LDRXui:
521   case AArch64::LDURXi:
522     return AArch64::LDRXpost;
523   case AArch64::LDRSWui:
524     return AArch64::LDRSWpost;
525   case AArch64::LDPSi:
526     return AArch64::LDPSpost;
527   case AArch64::LDPSWi:
528     return AArch64::LDPSWpost;
529   case AArch64::LDPDi:
530     return AArch64::LDPDpost;
531   case AArch64::LDPQi:
532     return AArch64::LDPQpost;
533   case AArch64::LDPWi:
534     return AArch64::LDPWpost;
535   case AArch64::LDPXi:
536     return AArch64::LDPXpost;
537   case AArch64::STPSi:
538     return AArch64::STPSpost;
539   case AArch64::STPDi:
540     return AArch64::STPDpost;
541   case AArch64::STPQi:
542     return AArch64::STPQpost;
543   case AArch64::STPWi:
544     return AArch64::STPWpost;
545   case AArch64::STPXi:
546     return AArch64::STPXpost;
547   case AArch64::STGOffset:
548     return AArch64::STGPostIndex;
549   case AArch64::STZGOffset:
550     return AArch64::STZGPostIndex;
551   case AArch64::ST2GOffset:
552     return AArch64::ST2GPostIndex;
553   case AArch64::STZ2GOffset:
554     return AArch64::STZ2GPostIndex;
555   case AArch64::STGPi:
556     return AArch64::STGPpost;
557   }
558 }
559 
560 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
561 
562   unsigned OpcA = FirstMI.getOpcode();
563   unsigned OpcB = MI.getOpcode();
564 
565   switch (OpcA) {
566   default:
567     return false;
568   case AArch64::STRSpre:
569     return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
570   case AArch64::STRDpre:
571     return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
572   case AArch64::STRQpre:
573     return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
574   case AArch64::STRWpre:
575     return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
576   case AArch64::STRXpre:
577     return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
578   case AArch64::LDRSpre:
579     return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
580   case AArch64::LDRDpre:
581     return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
582   case AArch64::LDRQpre:
583     return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
584   case AArch64::LDRWpre:
585     return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
586   case AArch64::LDRXpre:
587     return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
588   }
589 }
590 
591 // Returns the scale and offset range of pre/post indexed variants of MI.
592 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
593                                        int &MinOffset, int &MaxOffset) {
594   bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
595   bool IsTagStore = isTagStore(MI);
596   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
597   // as in the "unsigned offset" variant.
598   // All other pre/post indexed ldst instructions are unscaled.
599   Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
600 
601   if (IsPaired) {
602     MinOffset = -64;
603     MaxOffset = 63;
604   } else {
605     MinOffset = -256;
606     MaxOffset = 255;
607   }
608 }
609 
610 static MachineOperand &getLdStRegOp(MachineInstr &MI,
611                                     unsigned PairedRegOp = 0) {
612   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
613   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
614   if (IsPreLdSt)
615     PairedRegOp += 1;
616   unsigned Idx =
617       AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
618   return MI.getOperand(Idx);
619 }
620 
621 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
622                                   MachineInstr &StoreInst,
623                                   const AArch64InstrInfo *TII) {
624   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
625   int LoadSize = TII->getMemScale(LoadInst);
626   int StoreSize = TII->getMemScale(StoreInst);
627   int UnscaledStOffset =
628       TII->hasUnscaledLdStOffset(StoreInst)
629           ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
630           : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
631   int UnscaledLdOffset =
632       TII->hasUnscaledLdStOffset(LoadInst)
633           ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
634           : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
635   return (UnscaledStOffset <= UnscaledLdOffset) &&
636          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
637 }
638 
639 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
640   unsigned Opc = MI.getOpcode();
641   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
642           isNarrowStore(Opc)) &&
643          getLdStRegOp(MI).getReg() == AArch64::WZR;
644 }
645 
646 static bool isPromotableLoadFromStore(MachineInstr &MI) {
647   switch (MI.getOpcode()) {
648   default:
649     return false;
650   // Scaled instructions.
651   case AArch64::LDRBBui:
652   case AArch64::LDRHHui:
653   case AArch64::LDRWui:
654   case AArch64::LDRXui:
655   // Unscaled instructions.
656   case AArch64::LDURBBi:
657   case AArch64::LDURHHi:
658   case AArch64::LDURWi:
659   case AArch64::LDURXi:
660     return true;
661   }
662 }
663 
664 static bool isMergeableLdStUpdate(MachineInstr &MI) {
665   unsigned Opc = MI.getOpcode();
666   switch (Opc) {
667   default:
668     return false;
669   // Scaled instructions.
670   case AArch64::STRSui:
671   case AArch64::STRDui:
672   case AArch64::STRQui:
673   case AArch64::STRXui:
674   case AArch64::STRWui:
675   case AArch64::STRHHui:
676   case AArch64::STRBBui:
677   case AArch64::LDRSui:
678   case AArch64::LDRDui:
679   case AArch64::LDRQui:
680   case AArch64::LDRXui:
681   case AArch64::LDRWui:
682   case AArch64::LDRHHui:
683   case AArch64::LDRBBui:
684   case AArch64::STGOffset:
685   case AArch64::STZGOffset:
686   case AArch64::ST2GOffset:
687   case AArch64::STZ2GOffset:
688   case AArch64::STGPi:
689   // Unscaled instructions.
690   case AArch64::STURSi:
691   case AArch64::STURDi:
692   case AArch64::STURQi:
693   case AArch64::STURWi:
694   case AArch64::STURXi:
695   case AArch64::LDURSi:
696   case AArch64::LDURDi:
697   case AArch64::LDURQi:
698   case AArch64::LDURWi:
699   case AArch64::LDURXi:
700   // Paired instructions.
701   case AArch64::LDPSi:
702   case AArch64::LDPSWi:
703   case AArch64::LDPDi:
704   case AArch64::LDPQi:
705   case AArch64::LDPWi:
706   case AArch64::LDPXi:
707   case AArch64::STPSi:
708   case AArch64::STPDi:
709   case AArch64::STPQi:
710   case AArch64::STPWi:
711   case AArch64::STPXi:
712     // Make sure this is a reg+imm (as opposed to an address reloc).
713     if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
714       return false;
715 
716     return true;
717   }
718 }
719 
720 MachineBasicBlock::iterator
721 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
722                                            MachineBasicBlock::iterator MergeMI,
723                                            const LdStPairFlags &Flags) {
724   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
725          "Expected promotable zero stores.");
726 
727   MachineBasicBlock::iterator E = I->getParent()->end();
728   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
729   // If NextI is the second of the two instructions to be merged, we need
730   // to skip one further. Either way we merge will invalidate the iterator,
731   // and we don't need to scan the new instruction, as it's a pairwise
732   // instruction, which we're not considering for further action anyway.
733   if (NextI == MergeMI)
734     NextI = next_nodbg(NextI, E);
735 
736   unsigned Opc = I->getOpcode();
737   bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
738   int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
739 
740   bool MergeForward = Flags.getMergeForward();
741   // Insert our new paired instruction after whichever of the paired
742   // instructions MergeForward indicates.
743   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
744   // Also based on MergeForward is from where we copy the base register operand
745   // so we get the flags compatible with the input code.
746   const MachineOperand &BaseRegOp =
747       MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
748                    : AArch64InstrInfo::getLdStBaseOp(*I);
749 
750   // Which register is Rt and which is Rt2 depends on the offset order.
751   MachineInstr *RtMI;
752   if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
753       AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
754     RtMI = &*MergeMI;
755   else
756     RtMI = &*I;
757 
758   int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
759   // Change the scaled offset from small to large type.
760   if (IsScaled) {
761     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
762     OffsetImm /= 2;
763   }
764 
765   // Construct the new instruction.
766   DebugLoc DL = I->getDebugLoc();
767   MachineBasicBlock *MBB = I->getParent();
768   MachineInstrBuilder MIB;
769   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
770             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
771             .add(BaseRegOp)
772             .addImm(OffsetImm)
773             .cloneMergedMemRefs({&*I, &*MergeMI})
774             .setMIFlags(I->mergeFlagsWith(*MergeMI));
775   (void)MIB;
776 
777   LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
778   LLVM_DEBUG(I->print(dbgs()));
779   LLVM_DEBUG(dbgs() << "    ");
780   LLVM_DEBUG(MergeMI->print(dbgs()));
781   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
782   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
783   LLVM_DEBUG(dbgs() << "\n");
784 
785   // Erase the old instructions.
786   I->eraseFromParent();
787   MergeMI->eraseFromParent();
788   return NextI;
789 }
790 
791 // Apply Fn to all instructions between MI and the beginning of the block, until
792 // a def for DefReg is reached. Returns true, iff Fn returns true for all
793 // visited instructions. Stop after visiting Limit iterations.
794 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
795                               const TargetRegisterInfo *TRI, unsigned Limit,
796                               std::function<bool(MachineInstr &, bool)> &Fn) {
797   auto MBB = MI.getParent();
798   for (MachineInstr &I :
799        instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
800     if (!Limit)
801       return false;
802     --Limit;
803 
804     bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
805       return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
806              TRI->regsOverlap(MOP.getReg(), DefReg);
807     });
808     if (!Fn(I, isDef))
809       return false;
810     if (isDef)
811       break;
812   }
813   return true;
814 }
815 
816 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
817                                    const TargetRegisterInfo *TRI) {
818 
819   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
820     if (MOP.isReg() && MOP.isKill())
821       Units.removeReg(MOP.getReg());
822 
823   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
824     if (MOP.isReg() && !MOP.isKill())
825       Units.addReg(MOP.getReg());
826 }
827 
828 MachineBasicBlock::iterator
829 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
830                                       MachineBasicBlock::iterator Paired,
831                                       const LdStPairFlags &Flags) {
832   MachineBasicBlock::iterator E = I->getParent()->end();
833   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
834   // If NextI is the second of the two instructions to be merged, we need
835   // to skip one further. Either way we merge will invalidate the iterator,
836   // and we don't need to scan the new instruction, as it's a pairwise
837   // instruction, which we're not considering for further action anyway.
838   if (NextI == Paired)
839     NextI = next_nodbg(NextI, E);
840 
841   int SExtIdx = Flags.getSExtIdx();
842   unsigned Opc =
843       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
844   bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
845   int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
846 
847   bool MergeForward = Flags.getMergeForward();
848 
849   std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
850   if (MergeForward && RenameReg) {
851     MCRegister RegToRename = getLdStRegOp(*I).getReg();
852     DefinedInBB.addReg(*RenameReg);
853 
854     // Return the sub/super register for RenameReg, matching the size of
855     // OriginalReg.
856     auto GetMatchingSubReg = [this,
857                               RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
858       for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
859         if (TRI->getMinimalPhysRegClass(OriginalReg) ==
860             TRI->getMinimalPhysRegClass(SubOrSuper))
861           return SubOrSuper;
862       llvm_unreachable("Should have found matching sub or super register!");
863     };
864 
865     std::function<bool(MachineInstr &, bool)> UpdateMIs =
866         [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
867           if (IsDef) {
868             bool SeenDef = false;
869             for (auto &MOP : MI.operands()) {
870               // Rename the first explicit definition and all implicit
871               // definitions matching RegToRename.
872               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
873                   (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
874                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
875                 assert((MOP.isImplicit() ||
876                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
877                        "Need renamable operands");
878                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
879                 SeenDef = true;
880               }
881             }
882           } else {
883             for (auto &MOP : MI.operands()) {
884               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
885                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
886                 assert((MOP.isImplicit() ||
887                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
888                            "Need renamable operands");
889                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
890               }
891             }
892           }
893           LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
894           return true;
895         };
896     forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
897 
898 #if !defined(NDEBUG)
899     // Make sure the register used for renaming is not used between the paired
900     // instructions. That would trash the content before the new paired
901     // instruction.
902     for (auto &MI :
903          iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
904              std::next(I), std::next(Paired)))
905       assert(all_of(MI.operands(),
906                     [this, &RenameReg](const MachineOperand &MOP) {
907                       return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
908                              MOP.isUndef() ||
909                              !TRI->regsOverlap(MOP.getReg(), *RenameReg);
910                     }) &&
911              "Rename register used between paired instruction, trashing the "
912              "content");
913 #endif
914   }
915 
916   // Insert our new paired instruction after whichever of the paired
917   // instructions MergeForward indicates.
918   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
919   // Also based on MergeForward is from where we copy the base register operand
920   // so we get the flags compatible with the input code.
921   const MachineOperand &BaseRegOp =
922       MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
923                    : AArch64InstrInfo::getLdStBaseOp(*I);
924 
925   int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
926   int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
927   bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
928   if (IsUnscaled != PairedIsUnscaled) {
929     // We're trying to pair instructions that differ in how they are scaled.  If
930     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
931     // the opposite (i.e., make Paired's offset unscaled).
932     int MemSize = TII->getMemScale(*Paired);
933     if (PairedIsUnscaled) {
934       // If the unscaled offset isn't a multiple of the MemSize, we can't
935       // pair the operations together.
936       assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
937              "Offset should be a multiple of the stride!");
938       PairedOffset /= MemSize;
939     } else {
940       PairedOffset *= MemSize;
941     }
942   }
943 
944   // Which register is Rt and which is Rt2 depends on the offset order.
945   // However, for pre load/stores the Rt should be the one of the pre
946   // load/store.
947   MachineInstr *RtMI, *Rt2MI;
948   if (Offset == PairedOffset + OffsetStride &&
949       !AArch64InstrInfo::isPreLdSt(*I)) {
950     RtMI = &*Paired;
951     Rt2MI = &*I;
952     // Here we swapped the assumption made for SExtIdx.
953     // I.e., we turn ldp I, Paired into ldp Paired, I.
954     // Update the index accordingly.
955     if (SExtIdx != -1)
956       SExtIdx = (SExtIdx + 1) % 2;
957   } else {
958     RtMI = &*I;
959     Rt2MI = &*Paired;
960   }
961   int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
962   // Scale the immediate offset, if necessary.
963   if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
964     assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
965            "Unscaled offset cannot be scaled.");
966     OffsetImm /= TII->getMemScale(*RtMI);
967   }
968 
969   // Construct the new instruction.
970   MachineInstrBuilder MIB;
971   DebugLoc DL = I->getDebugLoc();
972   MachineBasicBlock *MBB = I->getParent();
973   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
974   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
975   // Kill flags may become invalid when moving stores for pairing.
976   if (RegOp0.isUse()) {
977     if (!MergeForward) {
978       // Clear kill flags on store if moving upwards. Example:
979       //   STRWui %w0, ...
980       //   USE %w1
981       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
982       RegOp0.setIsKill(false);
983       RegOp1.setIsKill(false);
984     } else {
985       // Clear kill flags of the first stores register. Example:
986       //   STRWui %w1, ...
987       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
988       //   STRW %w0
989       Register Reg = getLdStRegOp(*I).getReg();
990       for (MachineInstr &MI : make_range(std::next(I), Paired))
991         MI.clearRegisterKills(Reg, TRI);
992     }
993   }
994 
995   unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
996   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
997 
998   // Adds the pre-index operand for pre-indexed ld/st pairs.
999   if (AArch64InstrInfo::isPreLdSt(*RtMI))
1000     MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1001 
1002   MIB.add(RegOp0)
1003       .add(RegOp1)
1004       .add(BaseRegOp)
1005       .addImm(OffsetImm)
1006       .cloneMergedMemRefs({&*I, &*Paired})
1007       .setMIFlags(I->mergeFlagsWith(*Paired));
1008 
1009   (void)MIB;
1010 
1011   LLVM_DEBUG(
1012       dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
1013   LLVM_DEBUG(I->print(dbgs()));
1014   LLVM_DEBUG(dbgs() << "    ");
1015   LLVM_DEBUG(Paired->print(dbgs()));
1016   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1017   if (SExtIdx != -1) {
1018     // Generate the sign extension for the proper result of the ldp.
1019     // I.e., with X1, that would be:
1020     // %w1 = KILL %w1, implicit-def %x1
1021     // %x1 = SBFMXri killed %x1, 0, 31
1022     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1023     // Right now, DstMO has the extended register, since it comes from an
1024     // extended opcode.
1025     Register DstRegX = DstMO.getReg();
1026     // Get the W variant of that register.
1027     Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1028     // Update the result of LDP to use the W instead of the X variant.
1029     DstMO.setReg(DstRegW);
1030     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1031     LLVM_DEBUG(dbgs() << "\n");
1032     // Make the machine verifier happy by providing a definition for
1033     // the X register.
1034     // Insert this definition right after the generated LDP, i.e., before
1035     // InsertionPoint.
1036     MachineInstrBuilder MIBKill =
1037         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1038             .addReg(DstRegW)
1039             .addReg(DstRegX, RegState::Define);
1040     MIBKill->getOperand(2).setImplicit();
1041     // Create the sign extension.
1042     MachineInstrBuilder MIBSXTW =
1043         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1044             .addReg(DstRegX)
1045             .addImm(0)
1046             .addImm(31);
1047     (void)MIBSXTW;
1048     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
1049     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1050   } else {
1051     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1052   }
1053   LLVM_DEBUG(dbgs() << "\n");
1054 
1055   if (MergeForward)
1056     for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1057       if (MOP.isReg() && MOP.isKill())
1058         DefinedInBB.addReg(MOP.getReg());
1059 
1060   // Erase the old instructions.
1061   I->eraseFromParent();
1062   Paired->eraseFromParent();
1063 
1064   return NextI;
1065 }
1066 
1067 MachineBasicBlock::iterator
1068 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1069                                           MachineBasicBlock::iterator StoreI) {
1070   MachineBasicBlock::iterator NextI =
1071       next_nodbg(LoadI, LoadI->getParent()->end());
1072 
1073   int LoadSize = TII->getMemScale(*LoadI);
1074   int StoreSize = TII->getMemScale(*StoreI);
1075   Register LdRt = getLdStRegOp(*LoadI).getReg();
1076   const MachineOperand &StMO = getLdStRegOp(*StoreI);
1077   Register StRt = getLdStRegOp(*StoreI).getReg();
1078   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1079 
1080   assert((IsStoreXReg ||
1081           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1082          "Unexpected RegClass");
1083 
1084   MachineInstr *BitExtMI;
1085   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1086     // Remove the load, if the destination register of the loads is the same
1087     // register for stored value.
1088     if (StRt == LdRt && LoadSize == 8) {
1089       for (MachineInstr &MI : make_range(StoreI->getIterator(),
1090                                          LoadI->getIterator())) {
1091         if (MI.killsRegister(StRt, TRI)) {
1092           MI.clearRegisterKills(StRt, TRI);
1093           break;
1094         }
1095       }
1096       LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
1097       LLVM_DEBUG(LoadI->print(dbgs()));
1098       LLVM_DEBUG(dbgs() << "\n");
1099       LoadI->eraseFromParent();
1100       return NextI;
1101     }
1102     // Replace the load with a mov if the load and store are in the same size.
1103     BitExtMI =
1104         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1105                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1106             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1107             .add(StMO)
1108             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1109             .setMIFlags(LoadI->getFlags());
1110   } else {
1111     // FIXME: Currently we disable this transformation in big-endian targets as
1112     // performance and correctness are verified only in little-endian.
1113     if (!Subtarget->isLittleEndian())
1114       return NextI;
1115     bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1116     assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1117            "Unsupported ld/st match");
1118     assert(LoadSize <= StoreSize && "Invalid load size");
1119     int UnscaledLdOffset =
1120         IsUnscaled
1121             ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1122             : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1123     int UnscaledStOffset =
1124         IsUnscaled
1125             ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1126             : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1127     int Width = LoadSize * 8;
1128     Register DestReg =
1129         IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1130                           LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1131                     : LdRt;
1132 
1133     assert((UnscaledLdOffset >= UnscaledStOffset &&
1134             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1135            "Invalid offset");
1136 
1137     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1138     int Imms = Immr + Width - 1;
1139     if (UnscaledLdOffset == UnscaledStOffset) {
1140       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1141                                 | ((Immr) << 6)               // immr
1142                                 | ((Imms) << 0)               // imms
1143           ;
1144 
1145       BitExtMI =
1146           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1147                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1148                   DestReg)
1149               .add(StMO)
1150               .addImm(AndMaskEncoded)
1151               .setMIFlags(LoadI->getFlags());
1152     } else {
1153       BitExtMI =
1154           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1155                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1156                   DestReg)
1157               .add(StMO)
1158               .addImm(Immr)
1159               .addImm(Imms)
1160               .setMIFlags(LoadI->getFlags());
1161     }
1162   }
1163 
1164   // Clear kill flags between store and load.
1165   for (MachineInstr &MI : make_range(StoreI->getIterator(),
1166                                      BitExtMI->getIterator()))
1167     if (MI.killsRegister(StRt, TRI)) {
1168       MI.clearRegisterKills(StRt, TRI);
1169       break;
1170     }
1171 
1172   LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1173   LLVM_DEBUG(StoreI->print(dbgs()));
1174   LLVM_DEBUG(dbgs() << "    ");
1175   LLVM_DEBUG(LoadI->print(dbgs()));
1176   LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
1177   LLVM_DEBUG(StoreI->print(dbgs()));
1178   LLVM_DEBUG(dbgs() << "    ");
1179   LLVM_DEBUG((BitExtMI)->print(dbgs()));
1180   LLVM_DEBUG(dbgs() << "\n");
1181 
1182   // Erase the old instructions.
1183   LoadI->eraseFromParent();
1184   return NextI;
1185 }
1186 
1187 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1188   // Convert the byte-offset used by unscaled into an "element" offset used
1189   // by the scaled pair load/store instructions.
1190   if (IsUnscaled) {
1191     // If the byte-offset isn't a multiple of the stride, there's no point
1192     // trying to match it.
1193     if (Offset % OffsetStride)
1194       return false;
1195     Offset /= OffsetStride;
1196   }
1197   return Offset <= 63 && Offset >= -64;
1198 }
1199 
1200 // Do alignment, specialized to power of 2 and for signed ints,
1201 // avoiding having to do a C-style cast from uint_64t to int when
1202 // using alignTo from include/llvm/Support/MathExtras.h.
1203 // FIXME: Move this function to include/MathExtras.h?
1204 static int alignTo(int Num, int PowOf2) {
1205   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1206 }
1207 
1208 static bool mayAlias(MachineInstr &MIa,
1209                      SmallVectorImpl<MachineInstr *> &MemInsns,
1210                      AliasAnalysis *AA) {
1211   for (MachineInstr *MIb : MemInsns)
1212     if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
1213       return true;
1214 
1215   return false;
1216 }
1217 
1218 bool AArch64LoadStoreOpt::findMatchingStore(
1219     MachineBasicBlock::iterator I, unsigned Limit,
1220     MachineBasicBlock::iterator &StoreI) {
1221   MachineBasicBlock::iterator B = I->getParent()->begin();
1222   MachineBasicBlock::iterator MBBI = I;
1223   MachineInstr &LoadMI = *I;
1224   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1225 
1226   // If the load is the first instruction in the block, there's obviously
1227   // not any matching store.
1228   if (MBBI == B)
1229     return false;
1230 
1231   // Track which register units have been modified and used between the first
1232   // insn and the second insn.
1233   ModifiedRegUnits.clear();
1234   UsedRegUnits.clear();
1235 
1236   unsigned Count = 0;
1237   do {
1238     MBBI = prev_nodbg(MBBI, B);
1239     MachineInstr &MI = *MBBI;
1240 
1241     // Don't count transient instructions towards the search limit since there
1242     // may be different numbers of them if e.g. debug information is present.
1243     if (!MI.isTransient())
1244       ++Count;
1245 
1246     // If the load instruction reads directly from the address to which the
1247     // store instruction writes and the stored value is not modified, we can
1248     // promote the load. Since we do not handle stores with pre-/post-index,
1249     // it's unnecessary to check if BaseReg is modified by the store itself.
1250     // Also we can't handle stores without an immediate offset operand,
1251     // while the operand might be the address for a global variable.
1252     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1253         BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1254         AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1255         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1256         ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1257       StoreI = MBBI;
1258       return true;
1259     }
1260 
1261     if (MI.isCall())
1262       return false;
1263 
1264     // Update modified / uses register units.
1265     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1266 
1267     // Otherwise, if the base register is modified, we have no match, so
1268     // return early.
1269     if (!ModifiedRegUnits.available(BaseReg))
1270       return false;
1271 
1272     // If we encounter a store aliased with the load, return early.
1273     if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1274       return false;
1275   } while (MBBI != B && Count < Limit);
1276   return false;
1277 }
1278 
1279 static bool needsWinCFI(const MachineFunction *MF) {
1280   return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1281          MF->getFunction().needsUnwindTableEntry();
1282 }
1283 
1284 // Returns true if FirstMI and MI are candidates for merging or pairing.
1285 // Otherwise, returns false.
1286 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1287                                        LdStPairFlags &Flags,
1288                                        const AArch64InstrInfo *TII) {
1289   // If this is volatile or if pairing is suppressed, not a candidate.
1290   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1291     return false;
1292 
1293   // We should have already checked FirstMI for pair suppression and volatility.
1294   assert(!FirstMI.hasOrderedMemoryRef() &&
1295          !TII->isLdStPairSuppressed(FirstMI) &&
1296          "FirstMI shouldn't get here if either of these checks are true.");
1297 
1298   if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1299                                   MI.getFlag(MachineInstr::FrameDestroy)))
1300     return false;
1301 
1302   unsigned OpcA = FirstMI.getOpcode();
1303   unsigned OpcB = MI.getOpcode();
1304 
1305   // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1306   if (OpcA == OpcB)
1307     return !AArch64InstrInfo::isPreLdSt(FirstMI);
1308 
1309   // Try to match a sign-extended load/store with a zero-extended load/store.
1310   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1311   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1312   assert(IsValidLdStrOpc &&
1313          "Given Opc should be a Load or Store with an immediate");
1314   // OpcA will be the first instruction in the pair.
1315   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1316     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1317     return true;
1318   }
1319 
1320   // If the second instruction isn't even a mergable/pairable load/store, bail
1321   // out.
1322   if (!PairIsValidLdStrOpc)
1323     return false;
1324 
1325   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1326   // offsets.
1327   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1328     return false;
1329 
1330   // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1331   // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
1332   // are candidate pairs that can be merged.
1333   if (isPreLdStPairCandidate(FirstMI, MI))
1334     return true;
1335 
1336   // Try to match an unscaled load/store with a scaled load/store.
1337   return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1338          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1339 
1340   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1341 }
1342 
1343 static bool
1344 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1345                  SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1346                  const TargetRegisterInfo *TRI) {
1347   if (!FirstMI.mayStore())
1348     return false;
1349 
1350   // Check if we can find an unused register which we can use to rename
1351   // the register used by the first load/store.
1352   auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1353   MachineFunction &MF = *FirstMI.getParent()->getParent();
1354   if (!RegClass || !MF.getRegInfo().tracksLiveness())
1355     return false;
1356 
1357   auto RegToRename = getLdStRegOp(FirstMI).getReg();
1358   // For now, we only rename if the store operand gets killed at the store.
1359   if (!getLdStRegOp(FirstMI).isKill() &&
1360       !any_of(FirstMI.operands(),
1361               [TRI, RegToRename](const MachineOperand &MOP) {
1362                 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1363                        MOP.isImplicit() && MOP.isKill() &&
1364                        TRI->regsOverlap(RegToRename, MOP.getReg());
1365               })) {
1366     LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
1367     return false;
1368   }
1369   auto canRenameMOP = [TRI](const MachineOperand &MOP) {
1370     if (MOP.isReg()) {
1371       auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1372       // Renaming registers with multiple disjunct sub-registers (e.g. the
1373       // result of a LD3) means that all sub-registers are renamed, potentially
1374       // impacting other instructions we did not check. Bail out.
1375       // Note that this relies on the structure of the AArch64 register file. In
1376       // particular, a subregister cannot be written without overwriting the
1377       // whole register.
1378       if (RegClass->HasDisjunctSubRegs) {
1379         LLVM_DEBUG(
1380             dbgs()
1381             << "  Cannot rename operands with multiple disjunct subregisters ("
1382             << MOP << ")\n");
1383         return false;
1384       }
1385     }
1386     return MOP.isImplicit() ||
1387            (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1388   };
1389 
1390   bool FoundDef = false;
1391 
1392   // For each instruction between FirstMI and the previous def for RegToRename,
1393   // we
1394   // * check if we can rename RegToRename in this instruction
1395   // * collect the registers used and required register classes for RegToRename.
1396   std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1397                                                            bool IsDef) {
1398     LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
1399     // Currently we do not try to rename across frame-setup instructions.
1400     if (MI.getFlag(MachineInstr::FrameSetup)) {
1401       LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions currently ("
1402                         << MI << ")\n");
1403       return false;
1404     }
1405 
1406     UsedInBetween.accumulate(MI);
1407 
1408     // For a definition, check that we can rename the definition and exit the
1409     // loop.
1410     FoundDef = IsDef;
1411 
1412     // For defs, check if we can rename the first def of RegToRename.
1413     if (FoundDef) {
1414       // For some pseudo instructions, we might not generate code in the end
1415       // (e.g. KILL) and we would end up without a correct def for the rename
1416       // register.
1417       // TODO: This might be overly conservative and we could handle those cases
1418       // in multiple ways:
1419       //       1. Insert an extra copy, to materialize the def.
1420       //       2. Skip pseudo-defs until we find an non-pseudo def.
1421       if (MI.isPseudo()) {
1422         LLVM_DEBUG(dbgs() << "  Cannot rename pseudo instruction " << MI
1423                           << "\n");
1424         return false;
1425       }
1426 
1427       for (auto &MOP : MI.operands()) {
1428         if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1429             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1430           continue;
1431         if (!canRenameMOP(MOP)) {
1432           LLVM_DEBUG(dbgs()
1433                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1434           return false;
1435         }
1436         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1437       }
1438       return true;
1439     } else {
1440       for (auto &MOP : MI.operands()) {
1441         if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1442             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1443           continue;
1444 
1445         if (!canRenameMOP(MOP)) {
1446           LLVM_DEBUG(dbgs()
1447                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1448           return false;
1449         }
1450         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1451       }
1452     }
1453     return true;
1454   };
1455 
1456   if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1457     return false;
1458 
1459   if (!FoundDef) {
1460     LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
1461     return false;
1462   }
1463   return true;
1464 }
1465 
1466 // Check if we can find a physical register for renaming \p Reg. This register
1467 // must:
1468 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1469 //   defined registers up to the point where the renamed register will be used,
1470 // * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1471 //   registers in the range the rename register will be used,
1472 // * is available in all used register classes (checked using RequiredClasses).
1473 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1474     const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1475     LiveRegUnits &UsedInBetween,
1476     SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1477     const TargetRegisterInfo *TRI) {
1478   const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1479 
1480   // Checks if any sub- or super-register of PR is callee saved.
1481   auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1482     return any_of(TRI->sub_and_superregs_inclusive(PR),
1483                   [&MF, TRI](MCPhysReg SubOrSuper) {
1484                     return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1485                   });
1486   };
1487 
1488   // Check if PR or one of its sub- or super-registers can be used for all
1489   // required register classes.
1490   auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1491     return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1492       return any_of(TRI->sub_and_superregs_inclusive(PR),
1493                     [C, TRI](MCPhysReg SubOrSuper) {
1494                       return C == TRI->getMinimalPhysRegClass(SubOrSuper);
1495                     });
1496     });
1497   };
1498 
1499   auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1500   for (const MCPhysReg &PR : *RegClass) {
1501     if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1502         !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1503         CanBeUsedForAllClasses(PR)) {
1504       DefinedInBB.addReg(PR);
1505       LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1506                         << "\n");
1507       return {PR};
1508     }
1509   }
1510   LLVM_DEBUG(dbgs() << "No rename register found from "
1511                     << TRI->getRegClassName(RegClass) << "\n");
1512   return std::nullopt;
1513 }
1514 
1515 /// Scan the instructions looking for a load/store that can be combined with the
1516 /// current instruction into a wider equivalent or a load/store pair.
1517 MachineBasicBlock::iterator
1518 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1519                                       LdStPairFlags &Flags, unsigned Limit,
1520                                       bool FindNarrowMerge) {
1521   MachineBasicBlock::iterator E = I->getParent()->end();
1522   MachineBasicBlock::iterator MBBI = I;
1523   MachineBasicBlock::iterator MBBIWithRenameReg;
1524   MachineInstr &FirstMI = *I;
1525   MBBI = next_nodbg(MBBI, E);
1526 
1527   bool MayLoad = FirstMI.mayLoad();
1528   bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1529   Register Reg = getLdStRegOp(FirstMI).getReg();
1530   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
1531   int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
1532   int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1533   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1534 
1535   std::optional<bool> MaybeCanRename;
1536   if (!EnableRenaming)
1537     MaybeCanRename = {false};
1538 
1539   SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1540   LiveRegUnits UsedInBetween;
1541   UsedInBetween.init(*TRI);
1542 
1543   Flags.clearRenameReg();
1544 
1545   // Track which register units have been modified and used between the first
1546   // insn (inclusive) and the second insn.
1547   ModifiedRegUnits.clear();
1548   UsedRegUnits.clear();
1549 
1550   // Remember any instructions that read/write memory between FirstMI and MI.
1551   SmallVector<MachineInstr *, 4> MemInsns;
1552 
1553   for (unsigned Count = 0; MBBI != E && Count < Limit;
1554        MBBI = next_nodbg(MBBI, E)) {
1555     MachineInstr &MI = *MBBI;
1556 
1557     UsedInBetween.accumulate(MI);
1558 
1559     // Don't count transient instructions towards the search limit since there
1560     // may be different numbers of them if e.g. debug information is present.
1561     if (!MI.isTransient())
1562       ++Count;
1563 
1564     Flags.setSExtIdx(-1);
1565     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1566         AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1567       assert(MI.mayLoadOrStore() && "Expected memory operation.");
1568       // If we've found another instruction with the same opcode, check to see
1569       // if the base and offset are compatible with our starting instruction.
1570       // These instructions all have scaled immediate operands, so we just
1571       // check for +1/-1. Make sure to check the new instruction offset is
1572       // actually an immediate and not a symbolic reference destined for
1573       // a relocation.
1574       Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1575       int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1576       bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1577       if (IsUnscaled != MIIsUnscaled) {
1578         // We're trying to pair instructions that differ in how they are scaled.
1579         // If FirstMI is scaled then scale the offset of MI accordingly.
1580         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1581         int MemSize = TII->getMemScale(MI);
1582         if (MIIsUnscaled) {
1583           // If the unscaled offset isn't a multiple of the MemSize, we can't
1584           // pair the operations together: bail and keep looking.
1585           if (MIOffset % MemSize) {
1586             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1587                                               UsedRegUnits, TRI);
1588             MemInsns.push_back(&MI);
1589             continue;
1590           }
1591           MIOffset /= MemSize;
1592         } else {
1593           MIOffset *= MemSize;
1594         }
1595       }
1596 
1597       bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1598 
1599       if (BaseReg == MIBaseReg) {
1600         // If the offset of the second ld/st is not equal to the size of the
1601         // destination register it can’t be paired with a pre-index ld/st
1602         // pair. Additionally if the base reg is used or modified the operations
1603         // can't be paired: bail and keep looking.
1604         if (IsPreLdSt) {
1605           bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1606           bool IsBaseRegUsed = !UsedRegUnits.available(
1607               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1608           bool IsBaseRegModified = !ModifiedRegUnits.available(
1609               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1610           // If the stored value and the address of the second instruction is
1611           // the same, it needs to be using the updated register and therefore
1612           // it must not be folded.
1613           bool IsMIRegTheSame =
1614               TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1615                                AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1616           if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1617               IsMIRegTheSame) {
1618             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1619                                               UsedRegUnits, TRI);
1620             MemInsns.push_back(&MI);
1621             continue;
1622           }
1623         } else {
1624           if ((Offset != MIOffset + OffsetStride) &&
1625               (Offset + OffsetStride != MIOffset)) {
1626             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1627                                               UsedRegUnits, TRI);
1628             MemInsns.push_back(&MI);
1629             continue;
1630           }
1631         }
1632 
1633         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1634         if (FindNarrowMerge) {
1635           // If the alignment requirements of the scaled wide load/store
1636           // instruction can't express the offset of the scaled narrow input,
1637           // bail and keep looking. For promotable zero stores, allow only when
1638           // the stored value is the same (i.e., WZR).
1639           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1640               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1641             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1642                                               UsedRegUnits, TRI);
1643             MemInsns.push_back(&MI);
1644             continue;
1645           }
1646         } else {
1647           // Pairwise instructions have a 7-bit signed offset field. Single
1648           // insns have a 12-bit unsigned offset field.  If the resultant
1649           // immediate offset of merging these instructions is out of range for
1650           // a pairwise instruction, bail and keep looking.
1651           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1652             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1653                                               UsedRegUnits, TRI);
1654             MemInsns.push_back(&MI);
1655             continue;
1656           }
1657           // If the alignment requirements of the paired (scaled) instruction
1658           // can't express the offset of the unscaled input, bail and keep
1659           // looking.
1660           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1661             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1662                                               UsedRegUnits, TRI);
1663             MemInsns.push_back(&MI);
1664             continue;
1665           }
1666         }
1667         // If the destination register of one load is the same register or a
1668         // sub/super register of the other load, bail and keep looking. A
1669         // load-pair instruction with both destination registers the same is
1670         // UNPREDICTABLE and will result in an exception.
1671         if (MayLoad &&
1672             TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
1673           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1674                                             TRI);
1675           MemInsns.push_back(&MI);
1676           continue;
1677         }
1678 
1679         // If the BaseReg has been modified, then we cannot do the optimization.
1680         // For example, in the following pattern
1681         //   ldr x1 [x2]
1682         //   ldr x2 [x3]
1683         //   ldr x4 [x2, #8],
1684         // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1685         if (!ModifiedRegUnits.available(BaseReg))
1686           return E;
1687 
1688         // If the Rt of the second instruction was not modified or used between
1689         // the two instructions and none of the instructions between the second
1690         // and first alias with the second, we can combine the second into the
1691         // first.
1692         if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1693             !(MI.mayLoad() &&
1694               !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1695             !mayAlias(MI, MemInsns, AA)) {
1696 
1697           Flags.setMergeForward(false);
1698           Flags.clearRenameReg();
1699           return MBBI;
1700         }
1701 
1702         // Likewise, if the Rt of the first instruction is not modified or used
1703         // between the two instructions and none of the instructions between the
1704         // first and the second alias with the first, we can combine the first
1705         // into the second.
1706         if (!(MayLoad &&
1707               !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1708             !mayAlias(FirstMI, MemInsns, AA)) {
1709 
1710           if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1711             Flags.setMergeForward(true);
1712             Flags.clearRenameReg();
1713             return MBBI;
1714           }
1715 
1716           if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1717             if (!MaybeCanRename)
1718               MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1719                                                  RequiredClasses, TRI)};
1720 
1721             if (*MaybeCanRename) {
1722               std::optional<MCPhysReg> MaybeRenameReg =
1723                   tryToFindRegisterToRename(*FirstMI.getParent()->getParent(),
1724                                             Reg, DefinedInBB, UsedInBetween,
1725                                             RequiredClasses, TRI);
1726               if (MaybeRenameReg) {
1727                 Flags.setRenameReg(*MaybeRenameReg);
1728                 Flags.setMergeForward(true);
1729                 MBBIWithRenameReg = MBBI;
1730               }
1731             }
1732           }
1733         }
1734         // Unable to combine these instructions due to interference in between.
1735         // Keep looking.
1736       }
1737     }
1738 
1739     if (Flags.getRenameReg())
1740       return MBBIWithRenameReg;
1741 
1742     // If the instruction wasn't a matching load or store.  Stop searching if we
1743     // encounter a call instruction that might modify memory.
1744     if (MI.isCall())
1745       return E;
1746 
1747     // Update modified / uses register units.
1748     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1749 
1750     // Otherwise, if the base register is modified, we have no match, so
1751     // return early.
1752     if (!ModifiedRegUnits.available(BaseReg))
1753       return E;
1754 
1755     // Update list of instructions that read/write memory.
1756     if (MI.mayLoadOrStore())
1757       MemInsns.push_back(&MI);
1758   }
1759   return E;
1760 }
1761 
1762 static MachineBasicBlock::iterator
1763 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1764   auto End = MI.getParent()->end();
1765   if (MaybeCFI == End ||
1766       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1767       !(MI.getFlag(MachineInstr::FrameSetup) ||
1768         MI.getFlag(MachineInstr::FrameDestroy)) ||
1769       AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1770     return End;
1771 
1772   const MachineFunction &MF = *MI.getParent()->getParent();
1773   unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1774   const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1775   switch (CFI.getOperation()) {
1776   case MCCFIInstruction::OpDefCfa:
1777   case MCCFIInstruction::OpDefCfaOffset:
1778     return MaybeCFI;
1779   default:
1780     return End;
1781   }
1782 }
1783 
1784 MachineBasicBlock::iterator
1785 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1786                                      MachineBasicBlock::iterator Update,
1787                                      bool IsPreIdx) {
1788   assert((Update->getOpcode() == AArch64::ADDXri ||
1789           Update->getOpcode() == AArch64::SUBXri) &&
1790          "Unexpected base register update instruction to merge!");
1791   MachineBasicBlock::iterator E = I->getParent()->end();
1792   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1793 
1794   // If updating the SP and the following instruction is CFA offset related CFI
1795   // instruction move it after the merged instruction.
1796   MachineBasicBlock::iterator CFI =
1797       IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1798 
1799   // Return the instruction following the merged instruction, which is
1800   // the instruction following our unmerged load. Unless that's the add/sub
1801   // instruction we're merging, in which case it's the one after that.
1802   if (NextI == Update)
1803     NextI = next_nodbg(NextI, E);
1804 
1805   int Value = Update->getOperand(2).getImm();
1806   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1807          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1808   if (Update->getOpcode() == AArch64::SUBXri)
1809     Value = -Value;
1810 
1811   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1812                              : getPostIndexedOpcode(I->getOpcode());
1813   MachineInstrBuilder MIB;
1814   int Scale, MinOffset, MaxOffset;
1815   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
1816   if (!AArch64InstrInfo::isPairedLdSt(*I)) {
1817     // Non-paired instruction.
1818     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1819               .add(getLdStRegOp(*Update))
1820               .add(getLdStRegOp(*I))
1821               .add(AArch64InstrInfo::getLdStBaseOp(*I))
1822               .addImm(Value / Scale)
1823               .setMemRefs(I->memoperands())
1824               .setMIFlags(I->mergeFlagsWith(*Update));
1825   } else {
1826     // Paired instruction.
1827     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1828               .add(getLdStRegOp(*Update))
1829               .add(getLdStRegOp(*I, 0))
1830               .add(getLdStRegOp(*I, 1))
1831               .add(AArch64InstrInfo::getLdStBaseOp(*I))
1832               .addImm(Value / Scale)
1833               .setMemRefs(I->memoperands())
1834               .setMIFlags(I->mergeFlagsWith(*Update));
1835   }
1836   if (CFI != E) {
1837     MachineBasicBlock *MBB = I->getParent();
1838     MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
1839   }
1840 
1841   if (IsPreIdx) {
1842     ++NumPreFolded;
1843     LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1844   } else {
1845     ++NumPostFolded;
1846     LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1847   }
1848   LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
1849   LLVM_DEBUG(I->print(dbgs()));
1850   LLVM_DEBUG(dbgs() << "    ");
1851   LLVM_DEBUG(Update->print(dbgs()));
1852   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1853   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1854   LLVM_DEBUG(dbgs() << "\n");
1855 
1856   // Erase the old instructions for the block.
1857   I->eraseFromParent();
1858   Update->eraseFromParent();
1859 
1860   return NextI;
1861 }
1862 
1863 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1864                                                MachineInstr &MI,
1865                                                unsigned BaseReg, int Offset) {
1866   switch (MI.getOpcode()) {
1867   default:
1868     break;
1869   case AArch64::SUBXri:
1870   case AArch64::ADDXri:
1871     // Make sure it's a vanilla immediate operand, not a relocation or
1872     // anything else we can't handle.
1873     if (!MI.getOperand(2).isImm())
1874       break;
1875     // Watch out for 1 << 12 shifted value.
1876     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1877       break;
1878 
1879     // The update instruction source and destination register must be the
1880     // same as the load/store base register.
1881     if (MI.getOperand(0).getReg() != BaseReg ||
1882         MI.getOperand(1).getReg() != BaseReg)
1883       break;
1884 
1885     int UpdateOffset = MI.getOperand(2).getImm();
1886     if (MI.getOpcode() == AArch64::SUBXri)
1887       UpdateOffset = -UpdateOffset;
1888 
1889     // The immediate must be a multiple of the scaling factor of the pre/post
1890     // indexed instruction.
1891     int Scale, MinOffset, MaxOffset;
1892     getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
1893     if (UpdateOffset % Scale != 0)
1894       break;
1895 
1896     // Scaled offset must fit in the instruction immediate.
1897     int ScaledOffset = UpdateOffset / Scale;
1898     if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
1899       break;
1900 
1901     // If we have a non-zero Offset, we check that it matches the amount
1902     // we're adding to the register.
1903     if (!Offset || Offset == UpdateOffset)
1904       return true;
1905     break;
1906   }
1907   return false;
1908 }
1909 
1910 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1911     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1912   MachineBasicBlock::iterator E = I->getParent()->end();
1913   MachineInstr &MemMI = *I;
1914   MachineBasicBlock::iterator MBBI = I;
1915 
1916   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
1917   int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
1918                          TII->getMemScale(MemMI);
1919 
1920   // Scan forward looking for post-index opportunities.  Updating instructions
1921   // can't be formed if the memory instruction doesn't have the offset we're
1922   // looking for.
1923   if (MIUnscaledOffset != UnscaledOffset)
1924     return E;
1925 
1926   // If the base register overlaps a source/destination register, we can't
1927   // merge the update. This does not apply to tag store instructions which
1928   // ignore the address part of the source register.
1929   // This does not apply to STGPi as well, which does not have unpredictable
1930   // behavior in this case unlike normal stores, and always performs writeback
1931   // after reading the source register value.
1932   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
1933     bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
1934     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1935       Register DestReg = getLdStRegOp(MemMI, i).getReg();
1936       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1937         return E;
1938     }
1939   }
1940 
1941   // Track which register units have been modified and used between the first
1942   // insn (inclusive) and the second insn.
1943   ModifiedRegUnits.clear();
1944   UsedRegUnits.clear();
1945   MBBI = next_nodbg(MBBI, E);
1946 
1947   // We can't post-increment the stack pointer if any instruction between
1948   // the memory access (I) and the increment (MBBI) can access the memory
1949   // region defined by [SP, MBBI].
1950   const bool BaseRegSP = BaseReg == AArch64::SP;
1951   if (BaseRegSP && needsWinCFI(I->getMF())) {
1952     // FIXME: For now, we always block the optimization over SP in windows
1953     // targets as it requires to adjust the unwind/debug info, messing up
1954     // the unwind info can actually cause a miscompile.
1955     return E;
1956   }
1957 
1958   for (unsigned Count = 0; MBBI != E && Count < Limit;
1959        MBBI = next_nodbg(MBBI, E)) {
1960     MachineInstr &MI = *MBBI;
1961 
1962     // Don't count transient instructions towards the search limit since there
1963     // may be different numbers of them if e.g. debug information is present.
1964     if (!MI.isTransient())
1965       ++Count;
1966 
1967     // If we found a match, return it.
1968     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1969       return MBBI;
1970 
1971     // Update the status of what the instruction clobbered and used.
1972     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1973 
1974     // Otherwise, if the base register is used or modified, we have no match, so
1975     // return early.
1976     // If we are optimizing SP, do not allow instructions that may load or store
1977     // in between the load and the optimized value update.
1978     if (!ModifiedRegUnits.available(BaseReg) ||
1979         !UsedRegUnits.available(BaseReg) ||
1980         (BaseRegSP && MBBI->mayLoadOrStore()))
1981       return E;
1982   }
1983   return E;
1984 }
1985 
1986 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1987     MachineBasicBlock::iterator I, unsigned Limit) {
1988   MachineBasicBlock::iterator B = I->getParent()->begin();
1989   MachineBasicBlock::iterator E = I->getParent()->end();
1990   MachineInstr &MemMI = *I;
1991   MachineBasicBlock::iterator MBBI = I;
1992   MachineFunction &MF = *MemMI.getMF();
1993 
1994   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
1995   int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
1996 
1997   // If the load/store is the first instruction in the block, there's obviously
1998   // not any matching update. Ditto if the memory offset isn't zero.
1999   if (MBBI == B || Offset != 0)
2000     return E;
2001   // If the base register overlaps a destination register, we can't
2002   // merge the update.
2003   if (!isTagStore(MemMI)) {
2004     bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2005     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2006       Register DestReg = getLdStRegOp(MemMI, i).getReg();
2007       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2008         return E;
2009     }
2010   }
2011 
2012   const bool BaseRegSP = BaseReg == AArch64::SP;
2013   if (BaseRegSP && needsWinCFI(I->getMF())) {
2014     // FIXME: For now, we always block the optimization over SP in windows
2015     // targets as it requires to adjust the unwind/debug info, messing up
2016     // the unwind info can actually cause a miscompile.
2017     return E;
2018   }
2019 
2020   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2021   unsigned RedZoneSize =
2022       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2023 
2024   // Track which register units have been modified and used between the first
2025   // insn (inclusive) and the second insn.
2026   ModifiedRegUnits.clear();
2027   UsedRegUnits.clear();
2028   unsigned Count = 0;
2029   bool MemAcessBeforeSPPreInc = false;
2030   do {
2031     MBBI = prev_nodbg(MBBI, B);
2032     MachineInstr &MI = *MBBI;
2033 
2034     // Don't count transient instructions towards the search limit since there
2035     // may be different numbers of them if e.g. debug information is present.
2036     if (!MI.isTransient())
2037       ++Count;
2038 
2039     // If we found a match, return it.
2040     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2041       // Check that the update value is within our red zone limit (which may be
2042       // zero).
2043       if (MemAcessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2044         return E;
2045       return MBBI;
2046     }
2047 
2048     // Update the status of what the instruction clobbered and used.
2049     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2050 
2051     // Otherwise, if the base register is used or modified, we have no match, so
2052     // return early.
2053     if (!ModifiedRegUnits.available(BaseReg) ||
2054         !UsedRegUnits.available(BaseReg))
2055       return E;
2056     // Keep track if we have a memory access before an SP pre-increment, in this
2057     // case we need to validate later that the update amount respects the red
2058     // zone.
2059     if (BaseRegSP && MBBI->mayLoadOrStore())
2060       MemAcessBeforeSPPreInc = true;
2061   } while (MBBI != B && Count < Limit);
2062   return E;
2063 }
2064 
2065 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2066     MachineBasicBlock::iterator &MBBI) {
2067   MachineInstr &MI = *MBBI;
2068   // If this is a volatile load, don't mess with it.
2069   if (MI.hasOrderedMemoryRef())
2070     return false;
2071 
2072   if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
2073     return false;
2074 
2075   // Make sure this is a reg+imm.
2076   // FIXME: It is possible to extend it to handle reg+reg cases.
2077   if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2078     return false;
2079 
2080   // Look backward up to LdStLimit instructions.
2081   MachineBasicBlock::iterator StoreI;
2082   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2083     ++NumLoadsFromStoresPromoted;
2084     // Promote the load. Keeping the iterator straight is a
2085     // pain, so we let the merge routine tell us what the next instruction
2086     // is after it's done mucking about.
2087     MBBI = promoteLoadFromStore(MBBI, StoreI);
2088     return true;
2089   }
2090   return false;
2091 }
2092 
2093 // Merge adjacent zero stores into a wider store.
2094 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2095     MachineBasicBlock::iterator &MBBI) {
2096   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2097   MachineInstr &MI = *MBBI;
2098   MachineBasicBlock::iterator E = MI.getParent()->end();
2099 
2100   if (!TII->isCandidateToMergeOrPair(MI))
2101     return false;
2102 
2103   // Look ahead up to LdStLimit instructions for a mergable instruction.
2104   LdStPairFlags Flags;
2105   MachineBasicBlock::iterator MergeMI =
2106       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2107   if (MergeMI != E) {
2108     ++NumZeroStoresPromoted;
2109 
2110     // Keeping the iterator straight is a pain, so we let the merge routine tell
2111     // us what the next instruction is after it's done mucking about.
2112     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2113     return true;
2114   }
2115   return false;
2116 }
2117 
2118 // Find loads and stores that can be merged into a single load or store pair
2119 // instruction.
2120 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2121   MachineInstr &MI = *MBBI;
2122   MachineBasicBlock::iterator E = MI.getParent()->end();
2123 
2124   if (!TII->isCandidateToMergeOrPair(MI))
2125     return false;
2126 
2127   // Early exit if the offset is not possible to match. (6 bits of positive
2128   // range, plus allow an extra one in case we find a later insn that matches
2129   // with Offset-1)
2130   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2131   int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2132   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2133   // Allow one more for offset.
2134   if (Offset > 0)
2135     Offset -= OffsetStride;
2136   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2137     return false;
2138 
2139   // Look ahead up to LdStLimit instructions for a pairable instruction.
2140   LdStPairFlags Flags;
2141   MachineBasicBlock::iterator Paired =
2142       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2143   if (Paired != E) {
2144     ++NumPairCreated;
2145     if (TII->hasUnscaledLdStOffset(MI))
2146       ++NumUnscaledPairCreated;
2147     // Keeping the iterator straight is a pain, so we let the merge routine tell
2148     // us what the next instruction is after it's done mucking about.
2149     auto Prev = std::prev(MBBI);
2150     MBBI = mergePairedInsns(MBBI, Paired, Flags);
2151     // Collect liveness info for instructions between Prev and the new position
2152     // MBBI.
2153     for (auto I = std::next(Prev); I != MBBI; I++)
2154       updateDefinedRegisters(*I, DefinedInBB, TRI);
2155 
2156     return true;
2157   }
2158   return false;
2159 }
2160 
2161 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2162     (MachineBasicBlock::iterator &MBBI) {
2163   MachineInstr &MI = *MBBI;
2164   MachineBasicBlock::iterator E = MI.getParent()->end();
2165   MachineBasicBlock::iterator Update;
2166 
2167   // Look forward to try to form a post-index instruction. For example,
2168   // ldr x0, [x20]
2169   // add x20, x20, #32
2170   //   merged into:
2171   // ldr x0, [x20], #32
2172   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2173   if (Update != E) {
2174     // Merge the update into the ld/st.
2175     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2176     return true;
2177   }
2178 
2179   // Don't know how to handle unscaled pre/post-index versions below, so bail.
2180   if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2181     return false;
2182 
2183   // Look back to try to find a pre-index instruction. For example,
2184   // add x0, x0, #8
2185   // ldr x1, [x0]
2186   //   merged into:
2187   // ldr x1, [x0, #8]!
2188   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2189   if (Update != E) {
2190     // Merge the update into the ld/st.
2191     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2192     return true;
2193   }
2194 
2195   // The immediate in the load/store is scaled by the size of the memory
2196   // operation. The immediate in the add we're looking for,
2197   // however, is not, so adjust here.
2198   int UnscaledOffset =
2199       AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2200 
2201   // Look forward to try to find a pre-index instruction. For example,
2202   // ldr x1, [x0, #64]
2203   // add x0, x0, #64
2204   //   merged into:
2205   // ldr x1, [x0, #64]!
2206   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2207   if (Update != E) {
2208     // Merge the update into the ld/st.
2209     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2210     return true;
2211   }
2212 
2213   return false;
2214 }
2215 
2216 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2217                                         bool EnableNarrowZeroStOpt) {
2218 
2219   bool Modified = false;
2220   // Four tranformations to do here:
2221   // 1) Find loads that directly read from stores and promote them by
2222   //    replacing with mov instructions. If the store is wider than the load,
2223   //    the load will be replaced with a bitfield extract.
2224   //      e.g.,
2225   //        str w1, [x0, #4]
2226   //        ldrh w2, [x0, #6]
2227   //        ; becomes
2228   //        str w1, [x0, #4]
2229   //        lsr w2, w1, #16
2230   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2231        MBBI != E;) {
2232     if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2233       Modified = true;
2234     else
2235       ++MBBI;
2236   }
2237   // 2) Merge adjacent zero stores into a wider store.
2238   //      e.g.,
2239   //        strh wzr, [x0]
2240   //        strh wzr, [x0, #2]
2241   //        ; becomes
2242   //        str wzr, [x0]
2243   //      e.g.,
2244   //        str wzr, [x0]
2245   //        str wzr, [x0, #4]
2246   //        ; becomes
2247   //        str xzr, [x0]
2248   if (EnableNarrowZeroStOpt)
2249     for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2250          MBBI != E;) {
2251       if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2252         Modified = true;
2253       else
2254         ++MBBI;
2255     }
2256   // 3) Find loads and stores that can be merged into a single load or store
2257   //    pair instruction.
2258   //      e.g.,
2259   //        ldr x0, [x2]
2260   //        ldr x1, [x2, #8]
2261   //        ; becomes
2262   //        ldp x0, x1, [x2]
2263 
2264   if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2265     DefinedInBB.clear();
2266     DefinedInBB.addLiveIns(MBB);
2267   }
2268 
2269   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2270        MBBI != E;) {
2271     // Track currently live registers up to this point, to help with
2272     // searching for a rename register on demand.
2273     updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2274     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2275       Modified = true;
2276     else
2277       ++MBBI;
2278   }
2279   // 4) Find base register updates that can be merged into the load or store
2280   //    as a base-reg writeback.
2281   //      e.g.,
2282   //        ldr x0, [x2]
2283   //        add x2, x2, #4
2284   //        ; becomes
2285   //        ldr x0, [x2], #4
2286   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2287        MBBI != E;) {
2288     if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2289       Modified = true;
2290     else
2291       ++MBBI;
2292   }
2293 
2294   return Modified;
2295 }
2296 
2297 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2298   if (skipFunction(Fn.getFunction()))
2299     return false;
2300 
2301   Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2302   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2303   TRI = Subtarget->getRegisterInfo();
2304   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2305 
2306   // Resize the modified and used register unit trackers.  We do this once
2307   // per function and then clear the register units each time we optimize a load
2308   // or store.
2309   ModifiedRegUnits.init(*TRI);
2310   UsedRegUnits.init(*TRI);
2311   DefinedInBB.init(*TRI);
2312 
2313   bool Modified = false;
2314   bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2315   for (auto &MBB : Fn) {
2316     auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2317     Modified |= M;
2318   }
2319 
2320   return Modified;
2321 }
2322 
2323 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2324 // stores near one another?  Note: The pre-RA instruction scheduler already has
2325 // hooks to try and schedule pairable loads/stores together to improve pairing
2326 // opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
2327 
2328 // FIXME: When pairing store instructions it's very possible for this pass to
2329 // hoist a store with a KILL marker above another use (without a KILL marker).
2330 // The resulting IR is invalid, but nothing uses the KILL markers after this
2331 // pass, so it's never caused a problem in practice.
2332 
2333 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2334 /// load / store optimization pass.
2335 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2336   return new AArch64LoadStoreOpt();
2337 }
2338