//===- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a pass that performs load / store related peephole
/// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "arm-ldst-opt"

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");

/// This switch disables formation of double/multi instructions that could
/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
/// disabled. This can be used to create libraries that are robust even when
/// users provoke undefined behaviour by supplying misaligned pointers.
/// \see mayCombineMisaligned()
static cl::opt<bool>
AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
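// This is a regular backend option; a hypothetical llc invocation to harden a
// build against merges that could introduce new alignment traps might look
// like:
//   llc -mtriple=armv7a-none-eabi -arm-assume-misaligned-load-store foo.ll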

#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"

namespace {

  /// Post-register allocation pass that combines load / store instructions to
  /// form ldm / stm instructions.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;

    const MachineFunction *MF;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    LivePhysRegs LiveRegs;
    RegisterClassInfo RegClassInfo;
    MachineBasicBlock::const_iterator LiveRegPos;
    bool LiveRegsValid;
    bool RegClassInfoValid;
    bool isThumb1, isThumb2;

    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    bool runOnMachineFunction(MachineFunction &Fn) override;

    MachineFunctionProperties getRequiredProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::NoVRegs);
    }

    StringRef getPassName() const override { return ARM_LOAD_STORE_OPT_NAME; }

  private:
    /// A set of load/store MachineInstrs with the same base register, sorted
    /// by offset.
    struct MemOpQueueEntry {
      MachineInstr *MI;
      int Offset;        ///< Load/Store offset.
      unsigned Position; ///< Position as counted from end of basic block.

      MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Position)
          : MI(&MI), Offset(Offset), Position(Position) {}
    };
    using MemOpQueue = SmallVector<MemOpQueueEntry, 8>;

    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
    /// merged into a LDM/STM.
    struct MergeCandidate {
      /// List of instructions ordered by load/store offset.
      SmallVector<MachineInstr*, 4> Instrs;

      /// Index in Instrs of the instruction being latest in the schedule.
      unsigned LatestMIIdx;

      /// Index in Instrs of the instruction being earliest in the schedule.
      unsigned EarliestMIIdx;

      /// Index into the basic block where the merged instruction will be
      /// inserted. (See MemOpQueueEntry.Position)
      unsigned InsertPos;

      /// Whether the instructions can be merged into a ldm/stm instruction.
      bool CanMergeToLSMulti;

      /// Whether the instructions can be merged into a ldrd/strd instruction.
      bool CanMergeToLSDouble;
    };
    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
    SmallVector<const MergeCandidate*,4> Candidates;
    SmallVector<MachineInstr*,4> MergeBaseCandidates;

    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator Before);
    unsigned findFreeReg(const TargetRegisterClass &RegClass);
    void UpdateBaseRegUses(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                           unsigned Base, unsigned WordOffset,
                           ARMCC::CondCodes Pred, unsigned PredReg);
    MachineInstr *CreateLoadStoreMulti(
        MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
        int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
        ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
        ArrayRef<std::pair<unsigned, bool>> Regs,
        ArrayRef<MachineInstr*> Instrs);
    MachineInstr *CreateLoadStoreDouble(
        MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
        int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
        ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
        ArrayRef<std::pair<unsigned, bool>> Regs,
        ArrayRef<MachineInstr*> Instrs) const;
    void FormCandidates(const MemOpQueue &MemOps);
    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
    bool CombineMovBx(MachineBasicBlock &MBB);
  };

} // end anonymous namespace

char ARMLoadStoreOpt::ID = 0;

INITIALIZE_PASS(ARMLoadStoreOpt, "arm-ldst-opt", ARM_LOAD_STORE_OPT_NAME, false,
                false)
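// Clients construct this pass via createARMLoadStoreOptimizationPass()
// (declared in ARM.h and defined later in this file) and run it after
// register allocation, as required by the NoVRegs property above.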

static bool definesCPSR(const MachineInstr &MI) {
  for (const auto &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has live CPSR def, then it's not safe to fold it
      // into load / store.
      return true;
  }

  return false;
}

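/// Return the signed byte offset encoded in load/store instruction \p MI.
/// For example (illustrative), a tLDRi whose immediate field is 3 addresses
/// [base, #12], since Thumb1 word-sized immediates are scaled by 4.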
static int getMemoryOpOffset(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI.getDesc().getNumOperands();
  unsigned OffField = MI.getOperand(NumOperands - 3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  // Thumb1 immediate offsets are scaled by 4
  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
    return OffField * 4;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
    : ARM_AM::getAM5Offset(OffField) * 4;
  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
    : ARM_AM::getAM5Op(OffField);

  if (Op == ARM_AM::sub)
    return -Offset;

  return Offset;
}

static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
  return MI.getOperand(1);
}

static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
  return MI.getOperand(0);
}

static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
  case ARM::tLDRspi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
  case ARM::tSTRspi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

static bool isT1i32Load(unsigned Opc) {
  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc);
}

static bool isT1i32Store(unsigned Opc) {
  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}

static bool isLoadSingle(unsigned Opc) {
  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
}

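/// Return how many immediate-offset units of \p Opc make up one word (4
/// bytes): word-sized accesses scale their immediate by 4 (one unit per
/// word), halfword accesses by 2, and byte accesses not at all.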
static unsigned getImmScale(unsigned Opc) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
    return 1;
  case ARM::tLDRHi:
  case ARM::tSTRHi:
    return 2;
  case ARM::tLDRBi:
  case ARM::tSTRBi:
    return 4;
  }
}

static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
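/// For example (illustrative), after a writeback LDM advances the base by
/// one word,
///   tLDMIA_UPD r0!, {r1}
///   tLDRi r2, [r0, #4]   ; immediate was #8 before the merge
/// a later use has its immediate reduced accordingly, or, when that is not
/// possible, a SUBS is inserted to reset the base register.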
void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, unsigned Base,
                                        unsigned WordOffset,
                                        ARMCC::CondCodes Pred,
                                        unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB before
  // the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      int Offset;
      bool IsLoad =
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      bool IsStore =
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
          MO.setImm(Offset);
        else
          InsertSub = true;
      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(*MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.
          MO.setImm(Offset);
          // The base register has now been reset, so exit early.
          return;
        } else {
          InsertSub = true;
        }
      } else {
        // Can't update the instruction.
        InsertSub = true;
      }
    } else if (definesCPSR(*MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.
      InsertSub = true;
    }

    if (InsertSub) {
      // An instruction above couldn't be updated, so insert a sub.
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
          .add(t1CondCodeOp(true))
          .addReg(Base)
          .addImm(WordOffset * 4)
          .addImm(Pred)
          .addReg(PredReg);
      return;
    }

    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // Register got killed. Stop updating.
      return;
  }

  // End of block was reached.
  if (!MBB.succ_empty()) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    // See PR21029.
    if (MBBI != MBB.end()) --MBBI;
    BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
        .add(t1CondCodeOp(true))
        .addReg(Base)
        .addImm(WordOffset * 4)
        .addImm(Pred)
        .addReg(PredReg);
  }
}

/// Return the first register of class \p RegClass that is not currently live.
unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
  if (!RegClassInfoValid) {
    RegClassInfo.runOnMachineFunction(*MF);
    RegClassInfoValid = true;
  }

  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
    if (LiveRegs.available(MF->getRegInfo(), Reg))
      return Reg;
  return 0;
}

/// Compute live registers just before instruction \p Before (in normal schedule
/// direction). Computes backwards so multiple queries in the same block must
/// come in reverse order.
void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
    MachineBasicBlock::const_iterator Before) {
  // Initialize if we never queried in this block.
  if (!LiveRegsValid) {
    LiveRegs.init(*TRI);
    LiveRegs.addLiveOuts(MBB);
    LiveRegPos = MBB.end();
    LiveRegsValid = true;
  }
  // Move backward just before the "Before" position.
  while (LiveRegPos != Before) {
    --LiveRegPos;
    LiveRegs.stepBackward(*LiveRegPos);
  }
}

static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
                        unsigned Reg) {
  for (const std::pair<unsigned, bool> &R : Regs)
    if (R.first == Reg)
      return true;
  return false;
}

/// Create and insert an LDM or STM with \p Base as base register and registers
/// in \p Regs as the register operands that would be loaded / stored. Returns
/// the newly created instruction, or nullptr if the merge could not be done.
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
    int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
    ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
    ArrayRef<std::pair<unsigned, bool>> Regs,
    ArrayRef<MachineInstr*> Instrs) {
  unsigned NumRegs = Regs.size();
  assert(NumRegs > 1);

  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
  // Compute liveness information for that register to make the decision.
  bool SafeToClobberCPSR = !isThumb1 ||
    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
     MachineBasicBlock::LQR_Dead);

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback.
  // It's also not possible to merge an STR of the base register in Thumb1.
  if (isThumb1 && ContainsReg(Regs, Base)) {
    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
    if (Opcode == ARM::tLDRi)
      Writeback = false;
    else if (Opcode == ARM::tSTRi)
      return nullptr;
  }

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return nullptr;

    // On Thumb1, it's not worth materializing a new base register without
    // clobbering the CPSR (i.e. not using ADDS/SUBS).
    if (!SafeToClobberCPSR)
      return nullptr;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base. Will no longer be writeback in Thumb1.
      NewBase = Regs[NumRegs-1].first;
      Writeback = false;
    } else {
      // Find a free register that we can use as scratch register.
      moveLiveRegsBefore(MBB, InsertBefore);
      // The merged instruction does not exist yet but will use several Regs if
      // it is a Store.
      if (!isLoadSingle(Opcode))
        for (const std::pair<unsigned, bool> &R : Regs)
          LiveRegs.addReg(R.first);

      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
      if (NewBase == 0)
        return nullptr;
    }

    int BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2ADDspImm
                                                          : ARM::t2ADDri)
                           : (isThumb1 && Base == ARM::SP)
                                 ? ARM::tADDrSPi
                                 : (isThumb1 && Offset < 8)
                                       ? ARM::tADDi3
                                       : isThumb1 ? ARM::tADDi8 : ARM::ADDri;

    if (Offset < 0) {
      // FIXME: There are no Thumb1 load/store instructions with negative
      // offsets. So the Base != ARM::SP might be unnecessary.
      Offset = -Offset;
      BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2SUBspImm
                                                        : ARM::t2SUBri)
                         : (isThumb1 && Offset < 8 && Base != ARM::SP)
                               ? ARM::tSUBi3
                               : isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return nullptr; // Probably not worth it then.

    // We can only append a kill flag to the add/sub input if the value is not
    // used in the register list of the stm as well.
    bool KillOldBase = BaseKill &&
      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

    if (isThumb1) {
      // Thumb1: depending on immediate size, use either
      //   ADDS NewBase, Base, #imm3
      // or
      //   MOV  NewBase, Base
      //   ADDS NewBase, #imm8.
      if (Base != NewBase &&
          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
        // Need to insert a MOV to the new base first.
        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
            !STI->hasV6Ops()) {
          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
          if (Pred != ARMCC::AL)
            return nullptr;
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase));
        } else
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
              .addReg(Base, getKillRegState(KillOldBase))
              .add(predOps(Pred, PredReg));

        // The following ADDS/SUBS becomes an update.
        Base = NewBase;
        KillOldBase = true;
      }
      if (BaseOpc == ARM::tADDrSPi) {
        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Offset / 4)
            .add(predOps(Pred, PredReg));
      } else
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
            .add(t1CondCodeOp(true))
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Offset)
            .add(predOps(Pred, PredReg));
    } else {
      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase))
          .addImm(Offset)
          .add(predOps(Pred, PredReg))
          .add(condCodeOp());
    }
    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = isLoadSingle(Opcode);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode)
    return nullptr;

  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
  // - There is no writeback (LDM of base register),
  // - the base register is killed by the merged instruction,
  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
  //   to reset the base register.
  // Otherwise, don't merge.
  // It's safe to return here since the code to materialize a new base register
  // above is also conditional on SafeToClobberCPSR.
  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
    return nullptr;

  MachineInstrBuilder MIB;

  if (Writeback) {
    assert(isThumb1 && "expected Writeback only in Thumb1");
    if (Opcode == ARM::tLDMIA) {
      assert(!ContainsReg(Regs, Base) &&
             "Thumb1 can't LDM ! with Base in Regs");
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;
    }

    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));

    // The base isn't dead after a merged instruction with writeback.
    // Insert a sub instruction after the newly formed instruction to reset it.
    if (!BaseKill)
      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  for (const std::pair<unsigned, bool> &R : Regs)
    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

  MIB.cloneMergedMemRefs(Instrs);

  return MIB.getInstr();
}

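/// Form a t2LDRD/t2STRD from two adjacent word accesses; e.g. (illustrative)
///   ldr r0, [r2]
///   ldr r1, [r2, #4]
/// becomes
///   ldrd r0, r1, [r2]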
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
    int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
    ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
    ArrayRef<std::pair<unsigned, bool>> Regs,
    ArrayRef<MachineInstr*> Instrs) const {
  bool IsLoad = isi32Load(Opcode);
  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;

  assert(Regs.size() == 2);
  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
                                    TII->get(LoadStoreOpcode));
  if (IsLoad) {
    MIB.addReg(Regs[0].first, RegState::Define)
       .addReg(Regs[1].first, RegState::Define);
  } else {
    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
  }
  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  MIB.cloneMergedMemRefs(Instrs);
  return MIB.getInstr();
}

/// Try to merge the instructions in \p Cand into an LDM/STM/LDRD/STRD.
/// Returns the newly created instruction on success, nullptr otherwise.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
  const MachineInstr *First = Cand.Instrs.front();
  unsigned Opcode = First->getOpcode();
  bool IsLoad = isLoadSingle(Opcode);
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 4> ImpDefs;
  DenseSet<unsigned> KilledRegs;
  DenseSet<unsigned> UsedRegs;
  // Determine list of registers and list of implicit super-register defs.
  for (const MachineInstr *MI : Cand.Instrs) {
    const MachineOperand &MO = getLoadStoreRegOp(*MI);
    Register Reg = MO.getReg();
    bool IsKill = MO.isKill();
    if (IsKill)
      KilledRegs.insert(Reg);
    Regs.push_back(std::make_pair(Reg, IsKill));
    UsedRegs.insert(Reg);

    if (IsLoad) {
      // Collect any implicit defs of super-registers. After merging we can't
      // be sure anymore that we properly preserved these live ranges and
      // must remove these implicit operands.
      for (const MachineOperand &MO : MI->implicit_operands()) {
        if (!MO.isReg() || !MO.isDef() || MO.isDead())
          continue;
        assert(MO.isImplicit());
        Register DefReg = MO.getReg();

        if (is_contained(ImpDefs, DefReg))
          continue;
        // We can ignore cases where the super-reg is read and written.
        if (MI->readsRegister(DefReg))
          continue;
        ImpDefs.push_back(DefReg);
      }
    }
  }

  // Attempt the merge.
  using iterator = MachineBasicBlock::iterator;

  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
  iterator InsertBefore = std::next(iterator(LatestMI));
  MachineBasicBlock &MBB = *LatestMI->getParent();
  unsigned Offset = getMemoryOpOffset(*First);
  Register Base = getLoadStoreBaseOp(*First).getReg();
  bool BaseKill = LatestMI->killsRegister(Base);
  Register PredReg;
  ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
  DebugLoc DL = First->getDebugLoc();
  MachineInstr *Merged = nullptr;
  if (Cand.CanMergeToLSDouble)
    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs,
                                   Cand.Instrs);
  if (!Merged && Cand.CanMergeToLSMulti)
    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                  Opcode, Pred, PredReg, DL, Regs, Cand.Instrs);
  if (!Merged)
    return nullptr;

  // Determine the earliest instruction that will get removed. We then keep an
  // iterator just above it so the following erases don't invalidate it.
  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
  bool EarliestAtBegin = false;
  if (EarliestI == MBB.begin()) {
    EarliestAtBegin = true;
  } else {
    EarliestI = std::prev(EarliestI);
  }

  // Remove instructions which have been merged.
  for (MachineInstr *MI : Cand.Instrs)
    MBB.erase(MI);

  // Determine range between the earliest removed instruction and the new one.
  if (EarliestAtBegin)
    EarliestI = MBB.begin();
  else
    EarliestI = std::next(EarliestI);
  auto FixupRange = make_range(EarliestI, iterator(Merged));

  if (isLoadSingle(Opcode)) {
    // If the previous loads defined a super-reg, then we have to mark earlier
    // operands as undef and replicate the super-reg def on the merged
    // instruction.
    for (MachineInstr &MI : FixupRange) {
      for (unsigned &ImpDefReg : ImpDefs) {
        for (MachineOperand &MO : MI.implicit_operands()) {
          if (!MO.isReg() || MO.getReg() != ImpDefReg)
            continue;
          if (MO.readsReg())
            MO.setIsUndef();
          else if (MO.isDef())
            ImpDefReg = 0;
        }
      }
    }

    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
    for (unsigned ImpDef : ImpDefs)
      MIB.addReg(ImpDef, RegState::ImplicitDefine);
  } else {
    // Remove kill flags: We are possibly storing the values later now.
    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
    for (MachineInstr &MI : FixupRange) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.isKill())
          continue;
        if (UsedRegs.count(MO.getReg()))
          MO.setIsKill(false);
      }
    }
    assert(ImpDefs.empty());
  }

  return Merged;
}

static bool isValidLSDoubleOffset(int Offset) {
  unsigned Value = abs(Offset);
  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
  // multiplied by 4, i.e. offsets -1020..1020 in steps of 4 are valid.
  return (Value % 4) == 0 && Value < 1024;
}

/// Return true for loads/stores that can be combined to a double/multi
/// operation without increasing the requirements for alignment.
static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
                                 const MachineInstr &MI) {
  // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
  // difference.
  unsigned Opcode = MI.getOpcode();
  if (!isi32Load(Opcode) && !isi32Store(Opcode))
    return true;

  // Stack pointer alignment is out of the programmer's control, so we can
  // trust SP-relative loads/stores.
  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
      STI.getFrameLowering()->getTransientStackAlign() >= Align(4))
    return true;
  return false;
}

/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
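/// For example (illustrative register choice), a queue holding
///   ldr r0, [r4]
///   ldr r1, [r4, #4]
///   ldr r2, [r4, #8]
/// forms a single candidate that can later be merged into
///   ldmia r4, {r0, r1, r2}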
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);

  unsigned SIndex = 0;
  unsigned EIndex = MemOps.size();
  do {
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    Register PReg = PMO.getReg();
    unsigned PRegNum = PMO.isUndef() ? std::numeric_limits<unsigned>::max()
                                     : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;
    unsigned Count = 1;
    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM erratum 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On Swift, avoid vldm/vstm that start with an odd register number, as
    // that needs more uops than single vldrs.
    if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // LDRD/STRD do not allow SP/PC. LDM/STM either do not support it or have
    // it deprecated; LDM to PC is fine but cannot happen here.
    if (PReg == ARM::SP || PReg == ARM::PC)
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Should we be conservative?
    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // vldm / vstm limits are 32 for S variants, 16 for D variants.
    unsigned Limit;
    switch (Opcode) {
    default:
      Limit = UINT_MAX;
      break;
    case ARM::VLDRD:
    case ARM::VSTRD:
      Limit = 16;
      break;
    }

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      if (NewOffset != Offset + (int)Size)
        break;
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      Register Reg = MO.getReg();
      if (Reg == ARM::SP || Reg == ARM::PC)
        break;
      if (Count == Limit)
        break;

      // See if the current load/store may be part of a multi load/store.
      unsigned RegNum = MO.isUndef() ? std::numeric_limits<unsigned>::max()
                                     : TRI->getEncodingValue(Reg);
      bool PartOfLSMulti = CanMergeToLSMulti;
      if (PartOfLSMulti) {
        // Register numbers must be in ascending order.
        if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;
      }
      // See if the current load/store may be part of a double load/store.
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
        break;
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
        Latest = I;
      else if (Position > MemOps[Earliest].Position)
        Earliest = I;
      // Prepare for next MemOp.
      Offset += Size;
      PRegNum = RegNum;
    }

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
    if (Count == 1)
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
    SIndex += Count;
  } while (SIndex < EIndex);
}

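/// Map a non-writeback load/store-multiple opcode and addressing submode to
/// the corresponding base-updating form, e.g. LDMIA + ia -> LDMIA_UPD.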
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// Check if the given instruction increments or decrements a register and
/// return the amount it is incremented/decremented. Returns 0 if the CPSR
/// flags generated by the instruction are possibly read as well.
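/// For example, a matching "sub rN, rN, #8" (SUBri) yields -8.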
static int isIncrementOrDecrement(const MachineInstr &MI, Register Reg,
                                  ARMCC::CondCodes Pred, Register PredReg) {
  bool CheckCPSRDef;
  int Scale;
  switch (MI.getOpcode()) {
  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
  case ARM::t2SUBri:
  case ARM::t2SUBspImm:
  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
  case ARM::t2ADDri:
  case ARM::t2ADDspImm:
  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
  default: return 0;
  }

  Register MIPredReg;
  if (MI.getOperand(0).getReg() != Reg ||
      MI.getOperand(1).getReg() != Reg ||
      getInstrPredicate(MI, MIPredReg) != Pred ||
      MIPredReg != PredReg)
    return 0;

  if (CheckCPSRDef && definesCPSR(MI))
    return 0;
  return MI.getOperand(2).getImm() * Scale;
}

/// Searches for an increment or decrement of \p Reg before \p MBBI.
static MachineBasicBlock::iterator
findIncDecBefore(MachineBasicBlock::iterator MBBI, Register Reg,
                 ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
  Offset = 0;
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (MBBI == BeginMBBI)
    return EndMBBI;

  // Skip debug values.
  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
  while (PrevMBBI->isDebugInstr() && PrevMBBI != BeginMBBI)
    --PrevMBBI;

  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
  return Offset == 0 ? EndMBBI : PrevMBBI;
}

/// Searches for an increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
                ARMCC::CondCodes Pred, Register PredReg, int &Offset,
                const TargetRegisterInfo *TRI) {
  Offset = 0;
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
  while (NextMBBI != EndMBBI) {
    // Skip debug values.
    while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr())
      ++NextMBBI;
    if (NextMBBI == EndMBBI)
      return EndMBBI;

    // Keep the result signed: decrements are reported as negative offsets.
    int Off = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
    if (Off) {
      Offset = Off;
      return NextMBBI;
    }

    // SP can only be combined if it is the next instruction after the original
    // MBBI, otherwise we may be incrementing the stack pointer (invalidating
    // anything below the new pointer) when its frame elements are still in
    // use. Other registers can attempt to look further, until a different use
    // or def of the register is found.
    if (Reg == ARM::SP || NextMBBI->readsRegister(Reg, TRI) ||
        NextMBBI->definesRegister(Reg, TRI))
      return EndMBBI;

    ++NextMBBI;
  }
  return EndMBBI;
}

/// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;
  LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI);

  const MachineOperand &BaseOP = MI->getOperand(0);
  Register Base = BaseOP.getReg();
  bool BaseKill = BaseOP.isKill();
  Register PredReg;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
    if (MO.getReg() == Base)
      return false;

  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
  if (Mode == ARM_AM::ia && Offset == -Bytes) {
    Mode = ARM_AM::db;
  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
    Mode = ARM_AM::da;
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {

      // We couldn't find an inc/dec to merge. But if the base is dead, we
      // can still change to a writeback form as that will save us 2 bytes
      // of code size. It can create WAW hazards though, so only do it if
      // we're minimizing code size.
      if (!STI->hasMinSize() || !BaseKill)
        return false;

      bool HighRegsUsed = false;
      for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
        if (MO.getReg() >= ARM::R8) {
          HighRegsUsed = true;
          break;
        }

      if (!HighRegsUsed)
        MergeInstr = MBB.end();
      else
        return false;
    }
  }
  if (MergeInstr != MBB.end()) {
    LLVM_DEBUG(dbgs() << "  Erasing old increment: " << *MergeInstr);
    MBB.erase(MergeInstr);
  }

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 3))
    MIB.add(MO);

  // Transfer memoperands.
  MIB.setMemRefs(MI->memoperands());

  LLVM_DEBUG(dbgs() << "  Added new load/store: " << *MIB);
  MBB.erase(MBBI);
  return true;
}

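/// Map a load/store opcode to its pre-indexed (base updated before the
/// access) form, e.g. LDRi12 -> LDR_PRE_IMM. VLDR/VSTR have no pre-indexed
/// form, so they map to a single-register VLDM/VSTM with base update.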
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

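/// Map a load/store opcode to its post-indexed (base updated after the
/// access) form, e.g. LDRi12 -> LDR_POST_IMM.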
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2LDRBi8:
  case ARM::t2LDRBi12:
    return ARM::t2LDRB_POST;
  case ARM::t2LDRSBi8:
  case ARM::t2LDRSBi12:
    return ARM::t2LDRSB_POST;
  case ARM::t2LDRHi8:
  case ARM::t2LDRHi12:
    return ARM::t2LDRH_POST;
  case ARM::t2LDRSHi8:
  case ARM::t2LDRSHi12:
    return ARM::t2LDRSH_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  case ARM::t2STRBi8:
  case ARM::t2STRBi12:
    return ARM::t2STRB_POST;
  case ARM::t2STRHi8:
  case ARM::t2STRHi12:
    return ARM::t2STRH_POST;

  case ARM::MVE_VLDRBS16:
    return ARM::MVE_VLDRBS16_post;
  case ARM::MVE_VLDRBS32:
    return ARM::MVE_VLDRBS32_post;
  case ARM::MVE_VLDRBU16:
    return ARM::MVE_VLDRBU16_post;
  case ARM::MVE_VLDRBU32:
    return ARM::MVE_VLDRBU32_post;
  case ARM::MVE_VLDRHS32:
    return ARM::MVE_VLDRHS32_post;
  case ARM::MVE_VLDRHU32:
    return ARM::MVE_VLDRHU32_post;
  case ARM::MVE_VLDRBU8:
    return ARM::MVE_VLDRBU8_post;
  case ARM::MVE_VLDRHU16:
    return ARM::MVE_VLDRHU16_post;
  case ARM::MVE_VLDRWU32:
    return ARM::MVE_VLDRWU32_post;
  case ARM::MVE_VSTRB16:
    return ARM::MVE_VSTRB16_post;
  case ARM::MVE_VSTRB32:
    return ARM::MVE_VSTRB32_post;
  case ARM::MVE_VSTRH32:
    return ARM::MVE_VSTRH32_post;
  case ARM::MVE_VSTRBU8:
    return ARM::MVE_VSTRBU8_post;
  case ARM::MVE_VSTRHU16:
    return ARM::MVE_VSTRHU16_post;
  case ARM::MVE_VSTRWU32:
    return ARM::MVE_VSTRWU32_post;

  default: llvm_unreachable("Unhandled opcode!");
  }
}

1466  /// Fold a preceding/trailing inc/dec of the base register into the
1467  /// LDR/STR/FLD{D|S}/FST{D|S} op when possible.
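      /// Illustrative sketch of both directions (assembly level):
      ///   add r0, r0, #4 ; ldr r1, [r0]  =>  ldr r1, [r0, #4]!  (pre-indexed)
      ///   ldr r1, [r0] ; add r0, r0, #4  =>  ldr r1, [r0], #4   (post-indexed)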
1468  bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
1469    // Thumb1 doesn't have updating LDR/STR.
1470    // FIXME: Use LDM/STM with a single register instead.
1471    if (isThumb1) return false;
1472    LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI);
1473  
1474    Register Base = getLoadStoreBaseOp(*MI).getReg();
1475    bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
1476    unsigned Opcode = MI->getOpcode();
1477    DebugLoc DL = MI->getDebugLoc();
1478    bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
1479                  Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
1480    bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
1481    if (isi32Load(Opcode) || isi32Store(Opcode))
1482      if (MI->getOperand(2).getImm() != 0)
1483        return false;
1484    if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
1485      return false;
1486  
1487    // Can't do the merge if the destination register is the same as the would-be
1488    // writeback register.
1489    if (MI->getOperand(0).getReg() == Base)
1490      return false;
1491  
1492    Register PredReg;
1493    ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
1494    int Bytes = getLSMultipleTransferSize(MI);
1495    MachineBasicBlock &MBB = *MI->getParent();
1496    MachineBasicBlock::iterator MBBI(MI);
1497    int Offset;
1498    MachineBasicBlock::iterator MergeInstr
1499      = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
1500    unsigned NewOpc;
1501    if (!isAM5 && Offset == Bytes) {
1502      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
1503    } else if (Offset == -Bytes) {
1504      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
1505    } else {
1506      MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
1507      if (MergeInstr == MBB.end())
1508        return false;
1509  
1510      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
1511      if ((isAM5 && Offset != Bytes) ||
1512          (!isAM5 && !isLegalAddressImm(NewOpc, Offset, TII))) {
1513        NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
1514        if (isAM5 || !isLegalAddressImm(NewOpc, Offset, TII))
1515          return false;
1516      }
1517    }
1518    LLVM_DEBUG(dbgs() << "  Erasing old increment: " << *MergeInstr);
1519    MBB.erase(MergeInstr);
1520  
1521    ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
1522  
1523    bool isLd = isLoadSingle(Opcode);
1524    if (isAM5) {
1525      // VLDM[SD]_UPD, VSTM[SD]_UPD
1526      // (There are no base-updating versions of VLDR/VSTR instructions, but the
1527      // updating load/store-multiple instructions can be used with only one
1528      // register.)
1529      MachineOperand &MO = MI->getOperand(0);
1530      auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
1531                     .addReg(Base, getDefRegState(true)) // WB base register
1532                     .addReg(Base, getKillRegState(isLd ? BaseKill : false))
1533                     .addImm(Pred)
1534                     .addReg(PredReg)
1535                     .addReg(MO.getReg(), (isLd ? getDefRegState(true)
1536                                                : getKillRegState(MO.isKill())))
1537                     .cloneMemRefs(*MI);
1538      (void)MIB;
1539      LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1540    } else if (isLd) {
1541      if (isAM2) {
1542        // LDR_PRE, LDR_POST
1543        if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
1544          auto MIB =
1545              BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1546                  .addReg(Base, RegState::Define)
1547                  .addReg(Base)
1548                  .addImm(Offset)
1549                  .addImm(Pred)
1550                  .addReg(PredReg)
1551                  .cloneMemRefs(*MI);
1552          (void)MIB;
1553          LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1554        } else {
1555          int Imm = ARM_AM::getAM2Opc(AddSub, abs(Offset), ARM_AM::no_shift);
1556          auto MIB =
1557              BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1558                  .addReg(Base, RegState::Define)
1559                  .addReg(Base)
1560                  .addReg(0)
1561                  .addImm(Imm)
1562                  .add(predOps(Pred, PredReg))
1563                  .cloneMemRefs(*MI);
1564          (void)MIB;
1565          LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1566        }
1567      } else {
1568        // t2LDR_PRE, t2LDR_POST
1569        auto MIB =
1570            BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1571                .addReg(Base, RegState::Define)
1572                .addReg(Base)
1573                .addImm(Offset)
1574                .add(predOps(Pred, PredReg))
1575                .cloneMemRefs(*MI);
1576        (void)MIB;
1577        LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1578      }
1579    } else {
1580      MachineOperand &MO = MI->getOperand(0);
1581      // FIXME: post-indexed stores use am2offset_imm, which still encodes
1582    // the vestigial zero-reg offset register. When that's fixed, this clause
1583      // can be removed entirely.
1584      if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
1585        int Imm = ARM_AM::getAM2Opc(AddSub, abs(Offset), ARM_AM::no_shift);
1586        // STR_PRE, STR_POST
1587        auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
1588                       .addReg(MO.getReg(), getKillRegState(MO.isKill()))
1589                       .addReg(Base)
1590                       .addReg(0)
1591                       .addImm(Imm)
1592                       .add(predOps(Pred, PredReg))
1593                       .cloneMemRefs(*MI);
1594        (void)MIB;
1595        LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1596      } else {
1597        // t2STR_PRE, t2STR_POST
1598        auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
1599                       .addReg(MO.getReg(), getKillRegState(MO.isKill()))
1600                       .addReg(Base)
1601                       .addImm(Offset)
1602                       .add(predOps(Pred, PredReg))
1603                       .cloneMemRefs(*MI);
1604        (void)MIB;
1605        LLVM_DEBUG(dbgs() << "  Added new instruction: " << *MIB);
1606      }
1607    }
1608    MBB.erase(MBBI);
1609  
1610    return true;
1611  }
1612  
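      /// Try to fold a preceding or following add/sub of the base register into
      /// a t2LDRDi8/t2STRDi8, producing its pre/post-indexed form. Rough sketch:
      ///   ldrd r1, r2, [r0] ; adds r0, r0, #8  =>  ldrd r1, r2, [r0], #8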
1613  bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
1614    unsigned Opcode = MI.getOpcode();
1615    assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
1616           "Must have t2STRDi8 or t2LDRDi8");
1617    if (MI.getOperand(3).getImm() != 0)
1618      return false;
1619    LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << MI);
1620  
1621    // Behaviour for writeback is undefined if the base register is the same as
1622    // one of the others.
1623    const MachineOperand &BaseOp = MI.getOperand(2);
1624    Register Base = BaseOp.getReg();
1625    const MachineOperand &Reg0Op = MI.getOperand(0);
1626    const MachineOperand &Reg1Op = MI.getOperand(1);
1627    if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
1628      return false;
1629  
1630    Register PredReg;
1631    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
1632    MachineBasicBlock::iterator MBBI(MI);
1633    MachineBasicBlock &MBB = *MI.getParent();
1634    int Offset;
1635    MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
1636                                                              PredReg, Offset);
1637    unsigned NewOpc;
1638    if (Offset == 8 || Offset == -8) {
1639      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
1640    } else {
1641      MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI);
1642      if (MergeInstr == MBB.end())
1643        return false;
1644      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
1645      if (!isLegalAddressImm(NewOpc, Offset, TII))
1646        return false;
1647    }
1648    LLVM_DEBUG(dbgs() << "  Erasing old increment: " << *MergeInstr);
1649    MBB.erase(MergeInstr);
1650  
1651    DebugLoc DL = MI.getDebugLoc();
1652    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1653    if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
1654      MIB.add(Reg0Op).add(Reg1Op).addReg(BaseOp.getReg(), RegState::Define);
1655    } else {
1656      assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
1657      MIB.addReg(BaseOp.getReg(), RegState::Define).add(Reg0Op).add(Reg1Op);
1658    }
1659    MIB.addReg(BaseOp.getReg(), RegState::Kill)
1660       .addImm(Offset).addImm(Pred).addReg(PredReg);
1661    assert(TII->get(Opcode).getNumOperands() == 6 &&
1662           TII->get(NewOpc).getNumOperands() == 7 &&
1663           "Unexpected number of operands in Opcode specification.");
1664  
1665    // Transfer implicit operands.
1666    for (const MachineOperand &MO : MI.implicit_operands())
1667      MIB.add(MO);
1668    MIB.cloneMemRefs(MI);
1669  
1670    LLVM_DEBUG(dbgs() << "  Added new load/store: " << *MIB);
1671    MBB.erase(MBBI);
1672    return true;
1673  }
1674  
1675  /// Returns true if the instruction is a memory operation that this pass is
1676  /// capable of operating on.
1677  static bool isMemoryOp(const MachineInstr &MI) {
1678    unsigned Opcode = MI.getOpcode();
1679    switch (Opcode) {
1680    case ARM::VLDRS:
1681    case ARM::VSTRS:
1682    case ARM::VLDRD:
1683    case ARM::VSTRD:
1684    case ARM::LDRi12:
1685    case ARM::STRi12:
1686    case ARM::tLDRi:
1687    case ARM::tSTRi:
1688    case ARM::tLDRspi:
1689    case ARM::tSTRspi:
1690    case ARM::t2LDRi8:
1691    case ARM::t2LDRi12:
1692    case ARM::t2STRi8:
1693    case ARM::t2STRi12:
1694      break;
1695    default:
1696      return false;
1697    }
1698    if (!MI.getOperand(1).isReg())
1699      return false;
1700  
1701    // When no memory operands are present, conservatively assume unaligned,
1702    // volatile, unfoldable.
1703    if (!MI.hasOneMemOperand())
1704      return false;
1705  
1706    const MachineMemOperand &MMO = **MI.memoperands_begin();
1707  
1708    // Don't touch volatile memory accesses - we may be changing their order.
1709    // TODO: We could allow unordered and monotonic atomics here, but we need to
1710    // make sure the resulting ldm/stm is correctly marked as atomic.
1711    if (MMO.isVolatile() || MMO.isAtomic())
1712      return false;
1713  
1714    // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
1715    // not.
1716    if (MMO.getAlign() < Align(4))
1717      return false;
1718  
1719    // str <undef> could probably be eliminated entirely, but for now we just want
1720    // to avoid making a mess of it.
1721    // FIXME: Use str <undef> as a wildcard to enable better stm folding.
1722    if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
1723      return false;
1724  
1725    // Likewise don't mess with references to undefined addresses.
1726    if (MI.getOperand(1).isUndef())
1727      return false;
1728  
1729    return true;
1730  }
1731  
1732  static void InsertLDR_STR(MachineBasicBlock &MBB,
1733                            MachineBasicBlock::iterator &MBBI, int Offset,
1734                            bool isDef, unsigned NewOpc, unsigned Reg,
1735                            bool RegDeadKill, bool RegUndef, unsigned BaseReg,
1736                            bool BaseKill, bool BaseUndef, ARMCC::CondCodes Pred,
1737                            unsigned PredReg, const TargetInstrInfo *TII,
1738                            MachineInstr *MI) {
1739    if (isDef) {
1740      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1741                                        TII->get(NewOpc))
1742        .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
1743        .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1744      MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1745      // FIXME: This is overly conservative; the new instruction accesses 4
1746      // bytes, not 8.
1747      MIB.cloneMemRefs(*MI);
1748    } else {
1749      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1750                                        TII->get(NewOpc))
1751        .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
1752        .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1753      MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1754      // FIXME: This is overly conservative; the new instruction accesses 4
1755      // bytes, not 8.
1756      MIB.cloneMemRefs(*MI);
1757    }
1758  }
1759  
1760  bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
1761                                            MachineBasicBlock::iterator &MBBI) {
1762    MachineInstr *MI = &*MBBI;
1763    unsigned Opcode = MI->getOpcode();
1764    // FIXME: Code/comments below also handle Opcode == t2STRDi8, but this check
1765    // returns false before that opcode is ever seen.
1766    if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
1767      return false;
1768  
1769    const MachineOperand &BaseOp = MI->getOperand(2);
1770    Register BaseReg = BaseOp.getReg();
1771    Register EvenReg = MI->getOperand(0).getReg();
1772    Register OddReg = MI->getOperand(1).getReg();
1773    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
1774    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
1775  
1776    // ARM errata 602117: LDRD with base in list may result in incorrect base
1777    // register when interrupted or faulted.
1778    bool Errata602117 = EvenReg == BaseReg &&
1779      (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
1780    // ARM LDRD/STRD needs consecutive registers.
1781    bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
1782      (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);
1783  
1784    if (!Errata602117 && !NonConsecutiveRegs)
1785      return false;
1786  
1787    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
1788    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
1789    bool EvenDeadKill = isLd ?
1790      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
1791    bool EvenUndef = MI->getOperand(0).isUndef();
1792    bool OddDeadKill  = isLd ?
1793      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
1794    bool OddUndef = MI->getOperand(1).isUndef();
1795    bool BaseKill = BaseOp.isKill();
1796    bool BaseUndef = BaseOp.isUndef();
1797    assert((isT2 || MI->getOperand(3).getReg() == ARM::NoRegister) &&
1798           "register offset not handled below");
1799    int OffImm = getMemoryOpOffset(*MI);
1800    Register PredReg;
1801    ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
1802  
1803    if (OddRegNum > EvenRegNum && OffImm == 0) {
1804      // Ascending register numbers and no offset. It's safe to change it to an
1805      // ldm or stm.
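          // e.g. (sketch): "ldrd r2, r3, [r0]"  =>  "ldm r0, {r2, r3}".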
1806      unsigned NewOpc = (isLd)
1807        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
1808        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
1809      if (isLd) {
1810        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1811          .addReg(BaseReg, getKillRegState(BaseKill))
1812          .addImm(Pred).addReg(PredReg)
1813          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
1814          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill))
1815          .cloneMemRefs(*MI);
1816        ++NumLDRD2LDM;
1817      } else {
1818        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1819          .addReg(BaseReg, getKillRegState(BaseKill))
1820          .addImm(Pred).addReg(PredReg)
1821          .addReg(EvenReg,
1822                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
1823          .addReg(OddReg,
1824                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef))
1825          .cloneMemRefs(*MI);
1826        ++NumSTRD2STM;
1827      }
1828    } else {
1829      // Split into two instructions.
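          // e.g. (sketch, non-consecutive pair): "ldrd r1, r4, [r0, #8]" becomes
          // "ldr r1, [r0, #8]" followed by "ldr r4, [r0, #12]".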
1830      unsigned NewOpc = (isLd)
1831        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1832        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1833      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
1834      // so adjust and use t2LDRi12 here for that.
1835      unsigned NewOpc2 = (isLd)
1836        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1837        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1838      // If this is a load, make sure the first load does not clobber the base
1839      // register before the second load reads it.
1840      if (isLd && TRI->regsOverlap(EvenReg, BaseReg)) {
1841        assert(!TRI->regsOverlap(OddReg, BaseReg));
1842        InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill,
1843                      false, BaseReg, false, BaseUndef, Pred, PredReg, TII, MI);
1844        InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill,
1845                      false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII,
1846                      MI);
1847      } else {
1848        if (OddReg == EvenReg && EvenDeadKill) {
1849          // If the two source operands are the same, the kill marker is
1850          // probably on the first one. e.g.
1851          // t2STRDi8 killed %r5, %r5, killed %r9, 0, 14, %reg0
1852          EvenDeadKill = false;
1853          OddDeadKill = true;
1854        }
1855        // Never kill the base register in the first instruction.
1856        if (EvenReg == BaseReg)
1857          EvenDeadKill = false;
1858        InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill,
1859                      EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII,
1860                      MI);
1861        InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill,
1862                      OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII,
1863                      MI);
1864      }
1865      if (isLd)
1866        ++NumLDRD2LDR;
1867      else
1868        ++NumSTRD2STR;
1869    }
1870  
1871    MBBI = MBB.erase(MBBI);
1872    return true;
1873  }
1874  
1875  /// An optimization pass to turn multiple LDR / STR ops of the same base and
1876  /// incrementing offset into LDM / STM ops.
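      /// Illustrative sketch:
      ///   ldr r1, [r0] ; ldr r2, [r0, #4] ; ldr r3, [r0, #8]
      ///     =>  ldm r0, {r1, r2, r3}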
1877  bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
1878    MemOpQueue MemOps;
1879    unsigned CurrBase = 0;
1880    unsigned CurrOpc = ~0u;
1881    ARMCC::CondCodes CurrPred = ARMCC::AL;
1882    unsigned Position = 0;
1883    assert(Candidates.size() == 0);
1884    assert(MergeBaseCandidates.size() == 0);
1885    LiveRegsValid = false;
1886  
1887    for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
1888         I = MBBI) {
1889      // The instruction in front of the iterator is the one we look at.
1890      MBBI = std::prev(I);
1891      if (FixInvalidRegPairOp(MBB, MBBI))
1892        continue;
1893      ++Position;
1894  
1895      if (isMemoryOp(*MBBI)) {
1896        unsigned Opcode = MBBI->getOpcode();
1897        const MachineOperand &MO = MBBI->getOperand(0);
1898        Register Reg = MO.getReg();
1899        Register Base = getLoadStoreBaseOp(*MBBI).getReg();
1900        Register PredReg;
1901        ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
1902        int Offset = getMemoryOpOffset(*MBBI);
1903        if (CurrBase == 0) {
1904          // Start of a new chain.
1905          CurrBase = Base;
1906          CurrOpc  = Opcode;
1907          CurrPred = Pred;
1908          MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
1909          continue;
1910        }
1911        // Note: No need to match PredReg in the next if.
1912        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
1913          // Watch out for:
1914          //   r4 := ldr [r0, #8]
1915          //   r4 := ldr [r0, #4]
1916          // or
1917          //   r0 := ldr [r0]
1918          // If a load overrides the base register or a register loaded by
1919          // another load in our chain, we cannot take this instruction.
1920          bool Overlap = false;
1921          if (isLoadSingle(Opcode)) {
1922            Overlap = (Base == Reg);
1923            if (!Overlap) {
1924              for (const MemOpQueueEntry &E : MemOps) {
1925                if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
1926                  Overlap = true;
1927                  break;
1928                }
1929              }
1930            }
1931          }
1932  
1933          if (!Overlap) {
1934            // Check offset and sort memory operation into the current chain.
1935            if (Offset > MemOps.back().Offset) {
1936              MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
1937              continue;
1938            } else {
1939              MemOpQueue::iterator MI, ME;
1940              for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
1941                if (Offset < MI->Offset) {
1942                  // Found a place to insert.
1943                  break;
1944                }
1945                if (Offset == MI->Offset) {
1946                  // Collision, abort.
1947                  MI = ME;
1948                  break;
1949                }
1950              }
1951              if (MI != MemOps.end()) {
1952                MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Position));
1953                continue;
1954              }
1955            }
1956          }
1957        }
1958  
1959      // Don't advance the iterator; the op will start a new chain next.
1960        MBBI = I;
1961        --Position;
1962        // Fallthrough to look into existing chain.
1963      } else if (MBBI->isDebugInstr()) {
1964        continue;
1965      } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
1966                 MBBI->getOpcode() == ARM::t2STRDi8) {
1967      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions;
1968      // remember them because we may still be able to merge add/sub into them.
1969        MergeBaseCandidates.push_back(&*MBBI);
1970      }
1971  
1972    // If we are here then the chain is broken; extract candidates for a merge.
1973      if (MemOps.size() > 0) {
1974        FormCandidates(MemOps);
1975        // Reset for the next chain.
1976        CurrBase = 0;
1977        CurrOpc = ~0u;
1978        CurrPred = ARMCC::AL;
1979        MemOps.clear();
1980      }
1981    }
1982    if (MemOps.size() > 0)
1983      FormCandidates(MemOps);
1984  
1985    // Sort candidates so they get processed from the end to the beginning of
1986    // the basic block later; this is necessary for liveness calculation.
1987    auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
1988      return M0->InsertPos < M1->InsertPos;
1989    };
1990    llvm::sort(Candidates, LessThan);
1991  
1992    // Go through list of candidates and merge.
1993    bool Changed = false;
1994    for (const MergeCandidate *Candidate : Candidates) {
1995      if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
1996        MachineInstr *Merged = MergeOpsUpdate(*Candidate);
1997        // Merge preceding/trailing base inc/dec into the merged op.
1998        if (Merged) {
1999          Changed = true;
2000          unsigned Opcode = Merged->getOpcode();
2001          if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
2002            MergeBaseUpdateLSDouble(*Merged);
2003          else
2004            MergeBaseUpdateLSMultiple(Merged);
2005        } else {
2006          for (MachineInstr *MI : Candidate->Instrs) {
2007            if (MergeBaseUpdateLoadStore(MI))
2008              Changed = true;
2009          }
2010        }
2011      } else {
2012        assert(Candidate->Instrs.size() == 1);
2013        if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
2014          Changed = true;
2015      }
2016    }
2017    Candidates.clear();
2018    // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
2019    for (MachineInstr *MI : MergeBaseCandidates)
2020      MergeBaseUpdateLSDouble(*MI);
2021    MergeBaseCandidates.clear();
2022  
2023    return Changed;
2024  }
2025  
2026  /// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
2027  /// into the preceding stack restore so it directly restores the value of LR
2028  /// into pc.
2029  ///   ldmfd sp!, {..., lr}
2030  ///   bx lr
2031  /// or
2032  ///   ldmfd sp!, {..., lr}
2033  ///   mov pc, lr
2034  /// =>
2035  ///   ldmfd sp!, {..., pc}
2036  bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
2037    // Thumb1 LDM doesn't allow high registers.
2038    if (isThumb1) return false;
2039    if (MBB.empty()) return false;
2040  
2041    MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
2042    if (MBBI != MBB.begin() && MBBI != MBB.end() &&
2043        (MBBI->getOpcode() == ARM::BX_RET ||
2044         MBBI->getOpcode() == ARM::tBX_RET ||
2045         MBBI->getOpcode() == ARM::MOVPCLR)) {
2046      MachineBasicBlock::iterator PrevI = std::prev(MBBI);
2047      // Ignore any debug instructions.
2048      while (PrevI->isDebugInstr() && PrevI != MBB.begin())
2049        --PrevI;
2050      MachineInstr &PrevMI = *PrevI;
2051      unsigned Opcode = PrevMI.getOpcode();
2052      if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
2053          Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
2054          Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
2055        MachineOperand &MO = PrevMI.getOperand(PrevMI.getNumOperands() - 1);
2056        if (MO.getReg() != ARM::LR)
2057          return false;
2058        unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
2059        assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
2060                Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
2061        PrevMI.setDesc(TII->get(NewOpc));
2062        MO.setReg(ARM::PC);
2063        PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI);
2064        MBB.erase(MBBI);
2065        // We now restore LR into PC so it is not live-out of the return block
2066        // anymore: Clear the CSI Restored bit.
2067        MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo();
2068        // CSI should be fixed after PrologEpilog Insertion
2069        assert(MFI.isCalleeSavedInfoValid() && "CSI should be valid");
2070        for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) {
2071          if (Info.getReg() == ARM::LR) {
2072            Info.setRestored(false);
2073            break;
2074          }
2075        }
2076        return true;
2077      }
2078    }
2079    return false;
2080  }
2081  
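      /// Fold a "mov lr, rN" that feeds the following "bx lr" return into a
      /// single "bx rN" (BX requires v4T, as asserted below). Sketch:
      ///   tMOVr lr, r3 ; tBX_RET  =>  tBX r3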
2082  bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
2083    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
2084    if (MBBI == MBB.begin() || MBBI == MBB.end() ||
2085        MBBI->getOpcode() != ARM::tBX_RET)
2086      return false;
2087  
2088    MachineBasicBlock::iterator Prev = MBBI;
2089    --Prev;
2090    if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
2091      return false;
2092  
2093    for (auto Use : Prev->uses())
2094      if (Use.isKill()) {
2095        assert(STI->hasV4TOps());
2096        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
2097            .addReg(Use.getReg(), RegState::Kill)
2098            .add(predOps(ARMCC::AL))
2099            .copyImplicitOps(*MBBI);
2100        MBB.erase(MBBI);
2101        MBB.erase(Prev);
2102        return true;
2103      }
2104  
2105    llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
2106  }
2107  
2108  bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2109    if (skipFunction(Fn.getFunction()))
2110      return false;
2111  
2112    MF = &Fn;
2113    STI = &Fn.getSubtarget<ARMSubtarget>();
2114    TL = STI->getTargetLowering();
2115    AFI = Fn.getInfo<ARMFunctionInfo>();
2116    TII = STI->getInstrInfo();
2117    TRI = STI->getRegisterInfo();
2118  
2119    RegClassInfoValid = false;
2120    isThumb2 = AFI->isThumb2Function();
2121    isThumb1 = AFI->isThumbFunction() && !isThumb2;
2122  
2123    bool Modified = false;
2124    for (MachineBasicBlock &MBB : Fn) {
2125      Modified |= LoadStoreMultipleOpti(MBB);
2126      if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress())
2127        Modified |= MergeReturnIntoLDM(MBB);
2128      if (isThumb1)
2129        Modified |= CombineMovBx(MBB);
2130    }
2131  
2132    Allocator.DestroyAll();
2133    return Modified;
2134  }
2135  
2136  #define ARM_PREALLOC_LOAD_STORE_OPT_NAME                                       \
2137    "ARM pre- register allocation load / store optimization pass"
2138  
2139  namespace {
2140  
2141    /// Pre-register-allocation pass that moves loads / stores from consecutive
2142    /// locations closer together so they are more likely to be combined later.
2143    struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
2144      static char ID;
2145  
2146      AliasAnalysis *AA;
2147      const DataLayout *TD;
2148      const TargetInstrInfo *TII;
2149      const TargetRegisterInfo *TRI;
2150      const ARMSubtarget *STI;
2151      MachineRegisterInfo *MRI;
2152      MachineDominatorTree *DT;
2153      MachineFunction *MF;
2154  
2155      ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
2156  
2157      bool runOnMachineFunction(MachineFunction &Fn) override;
2158  
2159      StringRef getPassName() const override {
2160        return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
2161      }
2162  
2163      void getAnalysisUsage(AnalysisUsage &AU) const override {
2164        AU.addRequired<AAResultsWrapperPass>();
2165        AU.addRequired<MachineDominatorTree>();
2166        AU.addPreserved<MachineDominatorTree>();
2167        MachineFunctionPass::getAnalysisUsage(AU);
2168      }
2169  
2170    private:
2171      bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
2172                            unsigned &NewOpc, Register &EvenReg, Register &OddReg,
2173                            Register &BaseReg, int &Offset, Register &PredReg,
2174                            ARMCC::CondCodes &Pred, bool &isT2);
2175      bool RescheduleOps(MachineBasicBlock *MBB,
2176                         SmallVectorImpl<MachineInstr *> &Ops,
2177                         unsigned Base, bool isLd,
2178                         DenseMap<MachineInstr*, unsigned> &MI2LocMap);
2179      bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
2180      bool DistributeIncrements();
2181      bool DistributeIncrements(Register Base);
2182    };
2183  
2184  } // end anonymous namespace
2185  
2186  char ARMPreAllocLoadStoreOpt::ID = 0;
2187  
2188  INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
2189                        ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
2190  INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
2191  INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
2192                      ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
2193  
2194  // Limit the number of instructions to be rescheduled.
2195  // FIXME: tune this limit, and/or come up with some better heuristics.
2196  static cl::opt<unsigned> InstReorderLimit("arm-prera-ldst-opt-reorder-limit",
2197                                            cl::init(8), cl::Hidden);
2198  
2199  bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2200    if (AssumeMisalignedLoadStores || skipFunction(Fn.getFunction()))
2201      return false;
2202  
2203    TD = &Fn.getDataLayout();
2204    STI = &Fn.getSubtarget<ARMSubtarget>();
2205    TII = STI->getInstrInfo();
2206    TRI = STI->getRegisterInfo();
2207    MRI = &Fn.getRegInfo();
2208    DT = &getAnalysis<MachineDominatorTree>();
2209    MF  = &Fn;
2210    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2211  
2212    bool Modified = DistributeIncrements();
2213    for (MachineBasicBlock &MFI : Fn)
2214      Modified |= RescheduleLoadStoreInstrs(&MFI);
2215  
2216    return Modified;
2217  }
2218  
2219  static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
2220                                        MachineBasicBlock::iterator I,
2221                                        MachineBasicBlock::iterator E,
2222                                        SmallPtrSetImpl<MachineInstr*> &MemOps,
2223                                        SmallSet<unsigned, 4> &MemRegs,
2224                                        const TargetRegisterInfo *TRI,
2225                                        AliasAnalysis *AA) {
2226    // Are there stores / loads / calls between them?
2227    SmallSet<unsigned, 4> AddedRegPressure;
2228    while (++I != E) {
2229      if (I->isDebugInstr() || MemOps.count(&*I))
2230        continue;
2231      if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
2232        return false;
2233      if (I->mayStore() || (!isLd && I->mayLoad()))
2234        for (MachineInstr *MemOp : MemOps)
2235          if (I->mayAlias(AA, *MemOp, /*UseTBAA*/ false))
2236            return false;
2237      for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
2238        MachineOperand &MO = I->getOperand(j);
2239        if (!MO.isReg())
2240          continue;
2241        Register Reg = MO.getReg();
2242        if (MO.isDef() && TRI->regsOverlap(Reg, Base))
2243          return false;
2244        if (Reg != Base && !MemRegs.count(Reg))
2245          AddedRegPressure.insert(Reg);
2246      }
2247    }
2248  
2249    // Estimate register pressure increase due to the transformation.
2250    if (MemRegs.size() <= 4)
2251      // Ok if we are moving a small number of instructions.
2252      return true;
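        // Otherwise only accept the move when the intervening instructions
        // touch at most twice as many other registers as there are memory
        // registers being moved (a rough proxy for register pressure).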
2253    return AddedRegPressure.size() <= MemRegs.size() * 2;
2254  }
2255  
2256  bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(
2257      MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc,
2258      Register &FirstReg, Register &SecondReg, Register &BaseReg, int &Offset,
2259      Register &PredReg, ARMCC::CondCodes &Pred, bool &isT2) {
2260    // Make sure we're allowed to generate LDRD/STRD.
2261    if (!STI->hasV5TEOps())
2262      return false;
2263  
2264    // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
2265    unsigned Scale = 1;
2266    unsigned Opcode = Op0->getOpcode();
2267    if (Opcode == ARM::LDRi12) {
2268      NewOpc = ARM::LDRD;
2269    } else if (Opcode == ARM::STRi12) {
2270      NewOpc = ARM::STRD;
2271    } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
2272      NewOpc = ARM::t2LDRDi8;
2273      Scale = 4;
2274      isT2 = true;
2275    } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
2276      NewOpc = ARM::t2STRDi8;
2277      Scale = 4;
2278      isT2 = true;
2279    } else {
2280      return false;
2281    }
2282  
2283    // Make sure the base address satisfies the i64 ld / st alignment requirement.
2284    // At the moment, we ignore the memory operand's value.
2285    // If we want to use AliasAnalysis, we should check it accordingly.
2286    if (!Op0->hasOneMemOperand() ||
2287        (*Op0->memoperands_begin())->isVolatile() ||
2288        (*Op0->memoperands_begin())->isAtomic())
2289      return false;
2290  
2291    Align Alignment = (*Op0->memoperands_begin())->getAlign();
2292    const Function &Func = MF->getFunction();
2293    Align ReqAlign =
2294        STI->hasV6Ops() ? TD->getABITypeAlign(Type::getInt64Ty(Func.getContext()))
2295                      : Align(8); // Pre-v6 needs 8-byte alignment
2296    if (Alignment < ReqAlign)
2297      return false;
2298  
2299    // Then make sure the immediate offset fits.
2300    int OffImm = getMemoryOpOffset(*Op0);
2301    if (isT2) {
2302      int Limit = (1 << 8) * Scale;
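          // With Scale == 4 this accepts 4-byte-aligned offsets in (-1024, 1024),
          // i.e. the scaled imm8 range of t2LDRDi8/t2STRDi8.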
2303      if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
2304        return false;
2305      Offset = OffImm;
2306    } else {
2307      ARM_AM::AddrOpc AddSub = ARM_AM::add;
2308      if (OffImm < 0) {
2309        AddSub = ARM_AM::sub;
2310        OffImm = - OffImm;
2311      }
2312      int Limit = (1 << 8) * Scale;
2313      if (OffImm >= Limit || (OffImm & (Scale-1)))
2314        return false;
2315      Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
2316    }
2317    FirstReg = Op0->getOperand(0).getReg();
2318    SecondReg = Op1->getOperand(0).getReg();
2319    if (FirstReg == SecondReg)
2320      return false;
2321    BaseReg = Op0->getOperand(1).getReg();
2322    Pred = getInstrPredicate(*Op0, PredReg);
2323    dl = Op0->getDebugLoc();
2324    return true;
2325  }
2326  
2327  bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
2328                                   SmallVectorImpl<MachineInstr *> &Ops,
2329                                   unsigned Base, bool isLd,
2330                                   DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
2331    bool RetVal = false;
2332  
2333    // Sort by offset (in reverse order).
2334    llvm::sort(Ops, [](const MachineInstr *LHS, const MachineInstr *RHS) {
2335      int LOffset = getMemoryOpOffset(*LHS);
2336      int ROffset = getMemoryOpOffset(*RHS);
2337      assert(LHS == RHS || LOffset != ROffset);
2338      return LOffset > ROffset;
2339    });
2340  
2341    // The loads / stores of the same base are in order. Scan them from first to
2342    // last and check for the following:
2343    // 1. Any def of base.
2344    // 2. Any gaps.
2345    while (Ops.size() > 1) {
2346      unsigned FirstLoc = ~0U;
2347      unsigned LastLoc = 0;
2348      MachineInstr *FirstOp = nullptr;
2349      MachineInstr *LastOp = nullptr;
2350      int LastOffset = 0;
2351      unsigned LastOpcode = 0;
2352      unsigned LastBytes = 0;
2353      unsigned NumMove = 0;
2354      for (MachineInstr *Op : llvm::reverse(Ops)) {
2355        // Make sure each operation has the same kind.
2356        unsigned LSMOpcode
2357          = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
2358        if (LastOpcode && LSMOpcode != LastOpcode)
2359          break;
2360  
2361        // Check that we have a contiguous set of offsets.
2362        int Offset = getMemoryOpOffset(*Op);
2363        unsigned Bytes = getLSMultipleTransferSize(Op);
2364        if (LastBytes) {
2365          if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
2366            break;
2367        }
2368  
2369        // Don't try to reschedule too many instructions.
2370        if (NumMove == InstReorderLimit)
2371          break;
2372  
2373        // Found a mergeable instruction; save information about it.
2374        ++NumMove;
2375        LastOffset = Offset;
2376        LastBytes = Bytes;
2377        LastOpcode = LSMOpcode;
2378  
2379        unsigned Loc = MI2LocMap[Op];
2380        if (Loc <= FirstLoc) {
2381          FirstLoc = Loc;
2382          FirstOp = Op;
2383        }
2384        if (Loc >= LastLoc) {
2385          LastLoc = Loc;
2386          LastOp = Op;
2387        }
2388      }
2389  
2390      if (NumMove <= 1)
2391        Ops.pop_back();
2392      else {
2393        SmallPtrSet<MachineInstr*, 4> MemOps;
2394        SmallSet<unsigned, 4> MemRegs;
2395        for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
2396          MemOps.insert(Ops[i]);
2397          MemRegs.insert(Ops[i]->getOperand(0).getReg());
2398        }
2399  
2400        // Be conservative: if the instructions are too far apart, don't
2401        // move them. We want to limit the increase of register pressure.
2402        bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
2403        if (DoMove)
2404          DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
2405                                             MemOps, MemRegs, TRI, AA);
2406        if (!DoMove) {
2407          for (unsigned i = 0; i != NumMove; ++i)
2408            Ops.pop_back();
2409        } else {
2410          // This is the new location for the loads / stores.
2411          MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
2412          while (InsertPos != MBB->end() &&
2413                 (MemOps.count(&*InsertPos) || InsertPos->isDebugInstr()))
2414            ++InsertPos;
2415  
2416        // If we are moving a pair of loads / stores, see if it makes sense
2417        // to try to allocate two registers that can form an LDRD/STRD pair.
2418          MachineInstr *Op0 = Ops.back();
2419          MachineInstr *Op1 = Ops[Ops.size()-2];
2420          Register FirstReg, SecondReg;
2421          Register BaseReg, PredReg;
2422          ARMCC::CondCodes Pred = ARMCC::AL;
2423          bool isT2 = false;
2424          unsigned NewOpc = 0;
2425          int Offset = 0;
2426          DebugLoc dl;
2427          if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
2428                                               FirstReg, SecondReg, BaseReg,
2429                                               Offset, PredReg, Pred, isT2)) {
2430            Ops.pop_back();
2431            Ops.pop_back();
2432  
2433            const MCInstrDesc &MCID = TII->get(NewOpc);
2434            const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
2435            MRI->constrainRegClass(FirstReg, TRC);
2436            MRI->constrainRegClass(SecondReg, TRC);
2437  
2438            // Form the pair instruction.
2439            if (isLd) {
2440              MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2441                .addReg(FirstReg, RegState::Define)
2442                .addReg(SecondReg, RegState::Define)
2443                .addReg(BaseReg);
2444              // FIXME: We're converting from LDRi12 to an insn that still
2445              // uses addrmode2, so we need an explicit offset reg. It should
2446            // always be reg0 since we're transforming LDRi12s.
2447              if (!isT2)
2448                MIB.addReg(0);
2449              MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
2450              MIB.cloneMergedMemRefs({Op0, Op1});
2451              LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
2452              ++NumLDRDFormed;
2453            } else {
2454              MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2455                .addReg(FirstReg)
2456                .addReg(SecondReg)
2457                .addReg(BaseReg);
2458              // FIXME: We're converting from STRi12 to an insn that still
2459              // uses addrmode2, so we need an explicit offset reg. It should
2460              // always be reg0 since we're transforming STRi12s.
2461              if (!isT2)
2462                MIB.addReg(0);
2463              MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
2464              MIB.cloneMergedMemRefs({Op0, Op1});
2465              LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
2466              ++NumSTRDFormed;
2467            }
2468            MBB->erase(Op0);
2469            MBB->erase(Op1);
2470  
2471            if (!isT2) {
2472              // Add register allocation hints to form register pairs.
2473              MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
2474              MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
2475            }
2476          } else {
2477            for (unsigned i = 0; i != NumMove; ++i) {
2478              MachineInstr *Op = Ops.pop_back_val();
2479              MBB->splice(InsertPos, MBB, Op);
2480            }
2481          }
2482  
2483          NumLdStMoved += NumMove;
2484          RetVal = true;
2485        }
2486      }
2487    }
2488  
2489    return RetVal;
2490  }
2491  
2492  bool
2493  ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
2494    bool RetVal = false;
2495  
2496    DenseMap<MachineInstr*, unsigned> MI2LocMap;
2497    using MapIt = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator;
2498    using Base2InstMap = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>;
2499    using BaseVec = SmallVector<unsigned, 4>;
2500    Base2InstMap Base2LdsMap;
2501    Base2InstMap Base2StsMap;
2502    BaseVec LdBases;
2503    BaseVec StBases;
2504  
2505    unsigned Loc = 0;
2506    MachineBasicBlock::iterator MBBI = MBB->begin();
2507    MachineBasicBlock::iterator E = MBB->end();
2508    while (MBBI != E) {
2509      for (; MBBI != E; ++MBBI) {
2510        MachineInstr &MI = *MBBI;
2511        if (MI.isCall() || MI.isTerminator()) {
2512          // Stop at barriers.
2513          ++MBBI;
2514          break;
2515        }
2516  
2517        if (!MI.isDebugInstr())
2518          MI2LocMap[&MI] = ++Loc;
2519  
2520        if (!isMemoryOp(MI))
2521          continue;
2522        Register PredReg;
2523        if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
2524          continue;
2525  
2526        int Opc = MI.getOpcode();
2527        bool isLd = isLoadSingle(Opc);
2528        Register Base = MI.getOperand(1).getReg();
2529        int Offset = getMemoryOpOffset(MI);
2530        bool StopHere = false;
2531        auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) {
2532          MapIt BI = Base2Ops.find(Base);
2533          if (BI == Base2Ops.end()) {
2534            Base2Ops[Base].push_back(&MI);
2535            Bases.push_back(Base);
2536            return;
2537          }
2538          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2539            if (Offset == getMemoryOpOffset(*BI->second[i])) {
2540              StopHere = true;
2541              break;
2542            }
2543          }
2544          if (!StopHere)
2545            BI->second.push_back(&MI);
2546        };
2547  
2548        if (isLd)
2549          FindBases(Base2LdsMap, LdBases);
2550        else
2551          FindBases(Base2StsMap, StBases);
2552  
2553        if (StopHere) {
2554          // Found a duplicate (a base+offset combination that's seen earlier).
2555          // Backtrack.
2556          --Loc;
2557          break;
2558        }
2559      }
2560  
2561      // Re-schedule loads.
2562      for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
2563        unsigned Base = LdBases[i];
2564        SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
2565        if (Lds.size() > 1)
2566          RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
2567      }
2568  
2569      // Re-schedule stores.
2570      for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
2571        unsigned Base = StBases[i];
2572        SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
2573        if (Sts.size() > 1)
2574          RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
2575      }
2576  
2577      if (MBBI != E) {
2578        Base2LdsMap.clear();
2579        Base2StsMap.clear();
2580        LdBases.clear();
2581        StBases.clear();
2582      }
2583    }
2584  
2585    return RetVal;
2586  }
2587  
2588  // Get the Base register operand index from the memory access MachineInstr if
2589  // we should attempt to distribute postinc on it. Return -1 if not a valid
2590  // instruction type. If it returns an index, the instruction is assumed to use
2591  // an r+i addressing mode, and getBaseOperandIndex() + 1 is the Offset index.
2592  static int getBaseOperandIndex(MachineInstr &MI) {
2593    switch (MI.getOpcode()) {
2594    case ARM::MVE_VLDRBS16:
2595    case ARM::MVE_VLDRBS32:
2596    case ARM::MVE_VLDRBU16:
2597    case ARM::MVE_VLDRBU32:
2598    case ARM::MVE_VLDRHS32:
2599    case ARM::MVE_VLDRHU32:
2600    case ARM::MVE_VLDRBU8:
2601    case ARM::MVE_VLDRHU16:
2602    case ARM::MVE_VLDRWU32:
2603    case ARM::MVE_VSTRB16:
2604    case ARM::MVE_VSTRB32:
2605    case ARM::MVE_VSTRH32:
2606    case ARM::MVE_VSTRBU8:
2607    case ARM::MVE_VSTRHU16:
2608    case ARM::MVE_VSTRWU32:
2609    case ARM::t2LDRHi8:
2610    case ARM::t2LDRHi12:
2611    case ARM::t2LDRSHi8:
2612    case ARM::t2LDRSHi12:
2613    case ARM::t2LDRBi8:
2614    case ARM::t2LDRBi12:
2615    case ARM::t2LDRSBi8:
2616    case ARM::t2LDRSBi12:
2617    case ARM::t2STRBi8:
2618    case ARM::t2STRBi12:
2619    case ARM::t2STRHi8:
2620    case ARM::t2STRHi12:
2621      return 1;
2622    case ARM::MVE_VLDRBS16_post:
2623    case ARM::MVE_VLDRBS32_post:
2624    case ARM::MVE_VLDRBU16_post:
2625    case ARM::MVE_VLDRBU32_post:
2626    case ARM::MVE_VLDRHS32_post:
2627    case ARM::MVE_VLDRHU32_post:
2628    case ARM::MVE_VLDRBU8_post:
2629    case ARM::MVE_VLDRHU16_post:
2630    case ARM::MVE_VLDRWU32_post:
2631    case ARM::MVE_VSTRB16_post:
2632    case ARM::MVE_VSTRB32_post:
2633    case ARM::MVE_VSTRH32_post:
2634    case ARM::MVE_VSTRBU8_post:
2635    case ARM::MVE_VSTRHU16_post:
2636    case ARM::MVE_VSTRWU32_post:
2637    case ARM::MVE_VLDRBS16_pre:
2638    case ARM::MVE_VLDRBS32_pre:
2639    case ARM::MVE_VLDRBU16_pre:
2640    case ARM::MVE_VLDRBU32_pre:
2641    case ARM::MVE_VLDRHS32_pre:
2642    case ARM::MVE_VLDRHU32_pre:
2643    case ARM::MVE_VLDRBU8_pre:
2644    case ARM::MVE_VLDRHU16_pre:
2645    case ARM::MVE_VLDRWU32_pre:
2646    case ARM::MVE_VSTRB16_pre:
2647    case ARM::MVE_VSTRB32_pre:
2648    case ARM::MVE_VSTRH32_pre:
2649    case ARM::MVE_VSTRBU8_pre:
2650    case ARM::MVE_VSTRHU16_pre:
2651    case ARM::MVE_VSTRWU32_pre:
2652      return 2;
2653    }
2654    return -1;
2655  }
2656  
2657  static bool isPostIndex(MachineInstr &MI) {
2658    switch (MI.getOpcode()) {
2659    case ARM::MVE_VLDRBS16_post:
2660    case ARM::MVE_VLDRBS32_post:
2661    case ARM::MVE_VLDRBU16_post:
2662    case ARM::MVE_VLDRBU32_post:
2663    case ARM::MVE_VLDRHS32_post:
2664    case ARM::MVE_VLDRHU32_post:
2665    case ARM::MVE_VLDRBU8_post:
2666    case ARM::MVE_VLDRHU16_post:
2667    case ARM::MVE_VLDRWU32_post:
2668    case ARM::MVE_VSTRB16_post:
2669    case ARM::MVE_VSTRB32_post:
2670    case ARM::MVE_VSTRH32_post:
2671    case ARM::MVE_VSTRBU8_post:
2672    case ARM::MVE_VSTRHU16_post:
2673    case ARM::MVE_VSTRWU32_post:
2674      return true;
2675    }
2676    return false;
2677  }
2678  
2679  static bool isPreIndex(MachineInstr &MI) {
2680    switch (MI.getOpcode()) {
2681    case ARM::MVE_VLDRBS16_pre:
2682    case ARM::MVE_VLDRBS32_pre:
2683    case ARM::MVE_VLDRBU16_pre:
2684    case ARM::MVE_VLDRBU32_pre:
2685    case ARM::MVE_VLDRHS32_pre:
2686    case ARM::MVE_VLDRHU32_pre:
2687    case ARM::MVE_VLDRBU8_pre:
2688    case ARM::MVE_VLDRHU16_pre:
2689    case ARM::MVE_VLDRWU32_pre:
2690    case ARM::MVE_VSTRB16_pre:
2691    case ARM::MVE_VSTRB32_pre:
2692    case ARM::MVE_VSTRH32_pre:
2693    case ARM::MVE_VSTRBU8_pre:
2694    case ARM::MVE_VSTRHU16_pre:
2695    case ARM::MVE_VSTRWU32_pre:
2696      return true;
2697    }
2698    return false;
2699  }
2700  
2701  // Given a memory access Opcode, check that the given Imm would be a valid
2702  // Offset for this instruction (same as isLegalAddressImm), or whether the
2703  // instruction could easily be converted to one where it is valid, for example
2704  // converting t2LDRi12 to t2LDRi8 for negative offsets. Works in conjunction
2705  // with AdjustBaseAndOffset below.
2706  static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm,
2707                                             const TargetInstrInfo *TII,
2708                                             int &CodesizeEstimate) {
2709    if (isLegalAddressImm(Opcode, Imm, TII))
2710      return true;
2711  
2712    // We can convert AddrModeT2_i12 to AddrModeT2_i8neg.
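        // e.g. an offset of -4 is not encodable by t2LDRi12 but is by t2LDRi8;
        // AdjustBaseAndOffset performs the actual conversion when required.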
2713    const MCInstrDesc &Desc = TII->get(Opcode);
2714    unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
2715    switch (AddrMode) {
2716    case ARMII::AddrModeT2_i12:
2717      CodesizeEstimate += 1;
2718      return Imm < 0 && -Imm < ((1 << 8) * 1);
2719    }
2720    return false;
2721  }
2722  
2723  // Given an MI, adjust its address BaseReg to use NewBaseReg and adjust the
2724  // address offset by -Offset. This can either happen in-place or be a
2725  // replacement as MI is converted to another instruction type.
2726  static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg,
2727                                  int Offset, const TargetInstrInfo *TII,
2728                                  const TargetRegisterInfo *TRI) {
2729    // Set the Base reg
2730    unsigned BaseOp = getBaseOperandIndex(*MI);
2731    MI->getOperand(BaseOp).setReg(NewBaseReg);
2732    // and constrain the reg class to that required by the instruction.
2733    MachineFunction *MF = MI->getMF();
2734    MachineRegisterInfo &MRI = MF->getRegInfo();
2735    const MCInstrDesc &MCID = TII->get(MI->getOpcode());
2736    const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI, *MF);
2737    MRI.constrainRegClass(NewBaseReg, TRC);
2738  
2739    int OldOffset = MI->getOperand(BaseOp + 1).getImm();
2740    if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII))
2741      MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset);
2742    else {
2743      unsigned ConvOpcode;
2744      switch (MI->getOpcode()) {
2745      case ARM::t2LDRHi12:
2746        ConvOpcode = ARM::t2LDRHi8;
2747        break;
2748      case ARM::t2LDRSHi12:
2749        ConvOpcode = ARM::t2LDRSHi8;
2750        break;
2751      case ARM::t2LDRBi12:
2752        ConvOpcode = ARM::t2LDRBi8;
2753        break;
2754      case ARM::t2LDRSBi12:
2755        ConvOpcode = ARM::t2LDRSBi8;
2756        break;
2757      case ARM::t2STRHi12:
2758        ConvOpcode = ARM::t2STRHi8;
2759        break;
2760      case ARM::t2STRBi12:
2761        ConvOpcode = ARM::t2STRBi8;
2762        break;
2763      default:
2764      llvm_unreachable("Unhandled convertible opcode");
2765      }
2766      assert(isLegalAddressImm(ConvOpcode, OldOffset - Offset, TII) &&
2767             "Illegal Address Immediate after convert!");
2768  
2769      const MCInstrDesc &MCID = TII->get(ConvOpcode);
2770      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
2771          .add(MI->getOperand(0))
2772          .add(MI->getOperand(1))
2773          .addImm(OldOffset - Offset)
2774          .add(MI->getOperand(3))
2775          .add(MI->getOperand(4))
2776          .cloneMemRefs(*MI);
2777      MI->eraseFromParent();
2778    }
2779  }
2780  
2781  static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
2782                                              Register NewReg,
2783                                              const TargetInstrInfo *TII,
2784                                              const TargetRegisterInfo *TRI) {
2785    MachineFunction *MF = MI->getMF();
2786    MachineRegisterInfo &MRI = MF->getRegInfo();
2787  
2788    unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
2789        MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
2790  
2791    const MCInstrDesc &MCID = TII->get(NewOpcode);
2792    // Constrain the def register class
2793    const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
2794    MRI.constrainRegClass(NewReg, TRC);
2795    // And do the same for the base operand
2796    TRC = TII->getRegClass(MCID, 2, TRI, *MF);
2797    MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
2798  
2799    unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask);
2800    switch (AddrMode) {
2801    case ARMII::AddrModeT2_i7:
2802    case ARMII::AddrModeT2_i7s2:
2803    case ARMII::AddrModeT2_i7s4:
2804      // Any MVE load/store
2805      return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
2806          .addReg(NewReg, RegState::Define)
2807          .add(MI->getOperand(0))
2808          .add(MI->getOperand(1))
2809          .addImm(Offset)
2810          .add(MI->getOperand(3))
2811          .add(MI->getOperand(4))
2812          .add(MI->getOperand(5))
2813          .cloneMemRefs(*MI);
2814    case ARMII::AddrModeT2_i8:
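          // t2 post-indexed loads define the loaded value first and the
          // written-back base second; stores define only the written-back
          // base. Hence the two operand orders below.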
2815      if (MI->mayLoad()) {
2816        return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
2817            .add(MI->getOperand(0))
2818            .addReg(NewReg, RegState::Define)
2819            .add(MI->getOperand(1))
2820            .addImm(Offset)
2821            .add(MI->getOperand(3))
2822            .add(MI->getOperand(4))
2823            .cloneMemRefs(*MI);
2824      } else {
2825        return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
2826            .addReg(NewReg, RegState::Define)
2827            .add(MI->getOperand(0))
2828            .add(MI->getOperand(1))
2829            .addImm(Offset)
2830            .add(MI->getOperand(3))
2831            .add(MI->getOperand(4))
2832            .cloneMemRefs(*MI);
2833      }
2834    default:
2835      llvm_unreachable("Unhandled createPostIncLoadStore");
2836    }
2837  }
2838  
2839  // Given a Base Register, optimize the load/store uses to attempt to create
2840  // more post-inc accesses and fewer register moves. We do this by taking
2841  // zero-offset loads/stores with an add, and converting them to a postinc
2842  // load/store of the same type. Any subsequent accesses will be adjusted to
2843  // use and account for the post-inc value.
2844  // For example:
2845  // LDR #0            LDR_POSTINC #16
2846  // LDR #4            LDR #-12
2847  // LDR #8            LDR #-8
2848  // LDR #12           LDR #-4
2849  // ADD #16
2850  //
2851  // At the same time, if we do not find an increment but do find an existing
2852  // pre/post inc instruction, we can still adjust the offsets of subsequent
2853  // instructions to save the register move that would otherwise be needed for
2854  // the in-place increment.
2855  bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
2856    // We are looking for:
2857    // One zero-offset load/store that can become a postinc
2858    MachineInstr *BaseAccess = nullptr;
2859    MachineInstr *PrePostInc = nullptr;
2860    // An increment that can be folded in
2861    MachineInstr *Increment = nullptr;
2862    // Other accesses after BaseAccess that will need to be updated to use the
2863    // postinc value.
2864    SmallPtrSet<MachineInstr *, 8> OtherAccesses;
2865    for (auto &Use : MRI->use_nodbg_instructions(Base)) {
2866      if (!Increment && getAddSubImmediate(Use) != 0) {
2867        Increment = &Use;
2868        continue;
2869      }
2870  
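          // Every other use of Base must be a load/store with Base as its base
          // register and an immediate offset; anything else blocks the
          // transform.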
2871      int BaseOp = getBaseOperandIndex(Use);
2872      if (BaseOp == -1)
2873        return false;
2874  
2875      if (!Use.getOperand(BaseOp).isReg() ||
2876          Use.getOperand(BaseOp).getReg() != Base)
2877        return false;
2878      if (isPreIndex(Use) || isPostIndex(Use))
2879        PrePostInc = &Use;
2880      else if (Use.getOperand(BaseOp + 1).getImm() == 0)
2881        BaseAccess = &Use;
2882      else
2883        OtherAccesses.insert(&Use);
2884    }
2885  
2886    int IncrementOffset;
2887    Register NewBaseReg;
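        // Two cases follow: fold a separate increment into a new postinc
        // access (BaseAccess + Increment), or reuse the writeback of an
        // existing pre/post-indexed access (PrePostInc).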
2888    if (BaseAccess && Increment) {
2889      if (PrePostInc || BaseAccess->getParent() != Increment->getParent())
2890        return false;
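          // Folding is only safe if the add is unpredicated and does not set
          // flags, as the postinc writeback will replace it entirely.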
2891      Register PredReg;
2892      if (Increment->definesRegister(ARM::CPSR) ||
2893          getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
2894        return false;
2895  
2896      LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
2897                        << Base.virtRegIndex() << "\n");
2898  
2899      // Make sure that Increment has no uses before BaseAccess that are not PHI
2900      // uses.
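          // (PHI uses are tolerated even when not dominated: the definition
          // only moves up within the same block, so the value reaching each
          // PHI edge is unchanged.)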
2901      for (MachineInstr &Use :
2902          MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
2903        if (&Use == BaseAccess || (Use.getOpcode() != TargetOpcode::PHI &&
2904                                   !DT->dominates(BaseAccess, &Use))) {
2905          LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
2906          return false;
2907        }
2908      }
2909  
2910      // Make sure that Increment can be folded into Base
2911      IncrementOffset = getAddSubImmediate(*Increment);
2912      unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
2913          BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
2914      if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
2915        LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
2916        return false;
2917      }
2918    } else if (PrePostInc) {
2920      // If we already have a pre/post index load/store then set BaseAccess,
2921      // IncrementOffset and NewBaseReg to the values it already produces,
2922      // allowing us to update any subsequent uses of the BaseOp reg with the
2923      // incremented value.
2924      if (Increment)
2925        return false;
2926  
2927      LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on already "
2928                        << "indexed VirtualReg " << Base.virtRegIndex() << "\n");
2929      int BaseOp = getBaseOperandIndex(*PrePostInc);
2930      IncrementOffset = PrePostInc->getOperand(BaseOp+1).getImm();
2931      BaseAccess = PrePostInc;
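          // For the MVE pre/post-indexed opcodes recognized here, the
          // written-back base is the instruction's first def.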
2932      NewBaseReg = PrePostInc->getOperand(0).getReg();
2933    } else
2935      return false;
2936  
2937    // And make sure that the negated increment can be added to all other
2938    // offsets after the BaseAccess. We rely on either
2939    // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
2940    // to keep things simple.
2941    // This also keeps a simple codesize estimate, to detect when an
2942    // instruction (like t2LDRBi12) that could often shrink to a thumb1 one
2943    // (tLDRBi) no longer can, because it was converted to something else
2944    // (t2LDRBi8). We start the estimate at -1 for removing the increment.
2945    SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
2946    int CodesizeEstimate = -1;
2947    for (auto *Use : OtherAccesses) {
2948      if (DT->dominates(BaseAccess, Use)) {
2949        SuccessorAccesses.insert(Use);
2950        unsigned BaseOp = getBaseOperandIndex(*Use);
2951        if (!isLegalOrConvertableAddressImm(Use->getOpcode(),
2952                                            Use->getOperand(BaseOp + 1).getImm() -
2953                                                IncrementOffset,
2954                                            TII, CodesizeEstimate)) {
2955          LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on use\n");
2956          return false;
2957        }
2958      } else if (!DT->dominates(Use, BaseAccess)) {
2959        LLVM_DEBUG(
2960            dbgs() << "  Unknown dominance relation between Base and Use\n");
2961        return false;
2962      }
2963    }
2964    if (STI->hasMinSize() && CodesizeEstimate > 0) {
2965      LLVM_DEBUG(dbgs() << "  Expected to grow instructions under minsize\n");
2966      return false;
2967    }
2968  
2969    if (!PrePostInc) {
2970      // Replace BaseAccess with a post inc
2971      LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
2972      LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
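          // The postinc access takes over the add's destination register, so
          // existing uses of the incremented value stay valid.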
2973      NewBaseReg = Increment->getOperand(0).getReg();
2974      MachineInstr *BaseAccessPost =
2975          createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
2976      BaseAccess->eraseFromParent();
2977      Increment->eraseFromParent();
2978      (void)BaseAccessPost;
2979      LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
2980    }
2981  
2982    for (auto *Use : SuccessorAccesses) {
2983      LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
2984      AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII, TRI);
2985      LLVM_DEBUG(dbgs() << "  To    : "; Use->dump());
2986    }
2987  
2988    // Remove the kill flag from all uses of NewBaseReg, in case any old uses
2989    // remain.
2990    for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
2991      Op.setIsKill(false);
2992    return true;
2993  }
2994  
2995  bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
2996    bool Changed = false;
2997    SmallSetVector<Register, 4> Visited;
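        // SmallSetVector keeps each base unique while preserving insertion
        // order, so bases are visited deterministically.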
2998    for (auto &MBB : *MF) {
2999      for (auto &MI : MBB) {
3000        int BaseOp = getBaseOperandIndex(MI);
3001        if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
3002          continue;
3003  
3004        Register Base = MI.getOperand(BaseOp).getReg();
3005        if (!Base.isVirtual() || Visited.count(Base))
3006          continue;
3007  
3008        Visited.insert(Base);
3009      }
3010    }
3011  
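        // Distribute after the scan completes: DistributeIncrements(Base) can
        // erase and create instructions, which would invalidate the iteration
        // above.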
3012    for (auto Base : Visited)
3013      Changed |= DistributeIncrements(Base);
3014  
3015    return Changed;
3016  }
3017  
3018  /// Returns an instance of the load / store optimization pass.
3019  FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
3020    if (PreAlloc)
3021      return new ARMPreAllocLoadStoreOpt();
3022    return new ARMLoadStoreOpt();
3023  }
3024