xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
100b57cec5SDimitry Andric /// instructions into machine operations.
110b57cec5SDimitry Andric /// The expectation is that the loop contains three pseudo instructions:
120b57cec5SDimitry Andric /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
130b57cec5SDimitry Andric ///   form should be in the preheader, whereas the while form should be in the
148bcb0991SDimitry Andric ///   preheaders only predecessor.
150b57cec5SDimitry Andric /// - t2LoopDec - placed within in the loop body.
160b57cec5SDimitry Andric /// - t2LoopEnd - the loop latch terminator.
170b57cec5SDimitry Andric ///
18480093f4SDimitry Andric /// In addition to this, we also look for the presence of the VCTP instruction,
19480093f4SDimitry Andric /// which determines whether we can generated the tail-predicated low-overhead
20480093f4SDimitry Andric /// loop form.
21480093f4SDimitry Andric ///
22480093f4SDimitry Andric /// Assumptions and Dependencies:
23480093f4SDimitry Andric /// Low-overhead loops are constructed and executed using a setup instruction:
24480093f4SDimitry Andric /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25480093f4SDimitry Andric /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26480093f4SDimitry Andric /// but fixed polarity: WLS can only branch forwards and LE can only branch
27480093f4SDimitry Andric /// backwards. These restrictions mean that this pass is dependent upon block
28480093f4SDimitry Andric /// layout and block sizes, which is why it's the last pass to run. The same is
29480093f4SDimitry Andric /// true for ConstantIslands, but this pass does not increase the size of the
30480093f4SDimitry Andric /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31480093f4SDimitry Andric /// during the transform and pseudo instructions are replaced by real ones. In
32480093f4SDimitry Andric /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33480093f4SDimitry Andric /// multiple instructions for a single pseudo (see RevertWhile and
34fe6060f1SDimitry Andric /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35480093f4SDimitry Andric /// are defined to be as large as this maximum sequence of replacement
36480093f4SDimitry Andric /// instructions.
37480093f4SDimitry Andric ///
385ffd83dbSDimitry Andric /// A note on VPR.P0 (the lane mask):
395ffd83dbSDimitry Andric /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
405ffd83dbSDimitry Andric /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
415ffd83dbSDimitry Andric /// They will simply "and" the result of their calculation with the current
425ffd83dbSDimitry Andric /// value of VPR.P0. You can think of it like this:
435ffd83dbSDimitry Andric /// \verbatim
445ffd83dbSDimitry Andric /// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
455ffd83dbSDimitry Andric ///   VPR.P0 &= Value
465ffd83dbSDimitry Andric /// else
475ffd83dbSDimitry Andric ///   VPR.P0 = Value
485ffd83dbSDimitry Andric /// \endverbatim
495ffd83dbSDimitry Andric /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
505ffd83dbSDimitry Andric /// fall in the "VPT active" case, so we can consider that all VPR writes by
515ffd83dbSDimitry Andric /// one of those instruction is actually a "and".
520b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
530b57cec5SDimitry Andric 
540b57cec5SDimitry Andric #include "ARM.h"
550b57cec5SDimitry Andric #include "ARMBaseInstrInfo.h"
560b57cec5SDimitry Andric #include "ARMBaseRegisterInfo.h"
570b57cec5SDimitry Andric #include "ARMBasicBlockInfo.h"
580b57cec5SDimitry Andric #include "ARMSubtarget.h"
59e8d8bef9SDimitry Andric #include "MVETailPredUtils.h"
60480093f4SDimitry Andric #include "Thumb2InstrInfo.h"
61480093f4SDimitry Andric #include "llvm/ADT/SetOperations.h"
6281ad6265SDimitry Andric #include "llvm/ADT/SetVector.h"
635ffd83dbSDimitry Andric #include "llvm/CodeGen/LivePhysRegs.h"
6481ad6265SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h"
650b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
660b57cec5SDimitry Andric #include "llvm/CodeGen/MachineLoopInfo.h"
67480093f4SDimitry Andric #include "llvm/CodeGen/MachineLoopUtils.h"
680b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
69480093f4SDimitry Andric #include "llvm/CodeGen/Passes.h"
70480093f4SDimitry Andric #include "llvm/CodeGen/ReachingDefAnalysis.h"
71480093f4SDimitry Andric #include "llvm/MC/MCInstrDesc.h"
720b57cec5SDimitry Andric 
730b57cec5SDimitry Andric using namespace llvm;
740b57cec5SDimitry Andric 
750b57cec5SDimitry Andric #define DEBUG_TYPE "arm-low-overhead-loops"
760b57cec5SDimitry Andric #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
770b57cec5SDimitry Andric 
78e8d8bef9SDimitry Andric static cl::opt<bool>
79e8d8bef9SDimitry Andric DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
80e8d8bef9SDimitry Andric     cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
81e8d8bef9SDimitry Andric     cl::init(false));
82e8d8bef9SDimitry Andric 
83bdd1243dSDimitry Andric static cl::opt<bool>
84bdd1243dSDimitry Andric     DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
85bdd1243dSDimitry Andric                    cl::desc("Disable omitting 'dls lr, lr' instructions"),
86bdd1243dSDimitry Andric                    cl::init(false));
87bdd1243dSDimitry Andric 
isVectorPredicated(MachineInstr * MI)88e8d8bef9SDimitry Andric static bool isVectorPredicated(MachineInstr *MI) {
89e8d8bef9SDimitry Andric   int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
90e8d8bef9SDimitry Andric   return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
91e8d8bef9SDimitry Andric }
92e8d8bef9SDimitry Andric 
isVectorPredicate(MachineInstr * MI)93e8d8bef9SDimitry Andric static bool isVectorPredicate(MachineInstr *MI) {
94*0fca6ea1SDimitry Andric   return MI->findRegisterDefOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
95e8d8bef9SDimitry Andric }
96e8d8bef9SDimitry Andric 
hasVPRUse(MachineInstr & MI)97e8d8bef9SDimitry Andric static bool hasVPRUse(MachineInstr &MI) {
98*0fca6ea1SDimitry Andric   return MI.findRegisterUseOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
99e8d8bef9SDimitry Andric }
100e8d8bef9SDimitry Andric 
isDomainMVE(MachineInstr * MI)101e8d8bef9SDimitry Andric static bool isDomainMVE(MachineInstr *MI) {
102e8d8bef9SDimitry Andric   uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
103e8d8bef9SDimitry Andric   return Domain == ARMII::DomainMVE;
104e8d8bef9SDimitry Andric }
105e8d8bef9SDimitry Andric 
getVecSize(const MachineInstr & MI)106349cc55cSDimitry Andric static int getVecSize(const MachineInstr &MI) {
107349cc55cSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
108349cc55cSDimitry Andric   uint64_t Flags = MCID.TSFlags;
109349cc55cSDimitry Andric   return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
110349cc55cSDimitry Andric }
111349cc55cSDimitry Andric 
shouldInspect(MachineInstr & MI)112e8d8bef9SDimitry Andric static bool shouldInspect(MachineInstr &MI) {
113349cc55cSDimitry Andric   if (MI.isDebugInstr())
114349cc55cSDimitry Andric     return false;
115e8d8bef9SDimitry Andric   return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
116e8d8bef9SDimitry Andric }
117e8d8bef9SDimitry Andric 
isHorizontalReduction(const MachineInstr & MI)118*0fca6ea1SDimitry Andric static bool isHorizontalReduction(const MachineInstr &MI) {
119*0fca6ea1SDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
120*0fca6ea1SDimitry Andric   uint64_t Flags = MCID.TSFlags;
121*0fca6ea1SDimitry Andric   return (Flags & ARMII::HorizontalReduction) != 0;
122*0fca6ea1SDimitry Andric }
123*0fca6ea1SDimitry Andric 
1240b57cec5SDimitry Andric namespace {
1250b57cec5SDimitry Andric 
1265ffd83dbSDimitry Andric   using InstSet = SmallPtrSetImpl<MachineInstr *>;
1275ffd83dbSDimitry Andric 
1285ffd83dbSDimitry Andric   class PostOrderLoopTraversal {
1295ffd83dbSDimitry Andric     MachineLoop &ML;
1305ffd83dbSDimitry Andric     MachineLoopInfo &MLI;
1315ffd83dbSDimitry Andric     SmallPtrSet<MachineBasicBlock*, 4> Visited;
1325ffd83dbSDimitry Andric     SmallVector<MachineBasicBlock*, 4> Order;
1335ffd83dbSDimitry Andric 
1345ffd83dbSDimitry Andric   public:
PostOrderLoopTraversal(MachineLoop & ML,MachineLoopInfo & MLI)1355ffd83dbSDimitry Andric     PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
1365ffd83dbSDimitry Andric       : ML(ML), MLI(MLI) { }
1375ffd83dbSDimitry Andric 
getOrder() const1385ffd83dbSDimitry Andric     const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
1395ffd83dbSDimitry Andric       return Order;
1405ffd83dbSDimitry Andric     }
1415ffd83dbSDimitry Andric 
1425ffd83dbSDimitry Andric     // Visit all the blocks within the loop, as well as exit blocks and any
1435ffd83dbSDimitry Andric     // blocks properly dominating the header.
ProcessLoop()1445ffd83dbSDimitry Andric     void ProcessLoop() {
1455ffd83dbSDimitry Andric       std::function<void(MachineBasicBlock*)> Search = [this, &Search]
1465ffd83dbSDimitry Andric         (MachineBasicBlock *MBB) -> void {
1475ffd83dbSDimitry Andric         if (Visited.count(MBB))
1485ffd83dbSDimitry Andric           return;
1495ffd83dbSDimitry Andric 
1505ffd83dbSDimitry Andric         Visited.insert(MBB);
1515ffd83dbSDimitry Andric         for (auto *Succ : MBB->successors()) {
1525ffd83dbSDimitry Andric           if (!ML.contains(Succ))
1535ffd83dbSDimitry Andric             continue;
1545ffd83dbSDimitry Andric           Search(Succ);
1555ffd83dbSDimitry Andric         }
1565ffd83dbSDimitry Andric         Order.push_back(MBB);
1575ffd83dbSDimitry Andric       };
1585ffd83dbSDimitry Andric 
1595ffd83dbSDimitry Andric       // Insert exit blocks.
1605ffd83dbSDimitry Andric       SmallVector<MachineBasicBlock*, 2> ExitBlocks;
1615ffd83dbSDimitry Andric       ML.getExitBlocks(ExitBlocks);
162e8d8bef9SDimitry Andric       append_range(Order, ExitBlocks);
1635ffd83dbSDimitry Andric 
1645ffd83dbSDimitry Andric       // Then add the loop body.
1655ffd83dbSDimitry Andric       Search(ML.getHeader());
1665ffd83dbSDimitry Andric 
1675ffd83dbSDimitry Andric       // Then try the preheader and its predecessors.
1685ffd83dbSDimitry Andric       std::function<void(MachineBasicBlock*)> GetPredecessor =
1695ffd83dbSDimitry Andric         [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
1705ffd83dbSDimitry Andric         Order.push_back(MBB);
1715ffd83dbSDimitry Andric         if (MBB->pred_size() == 1)
1725ffd83dbSDimitry Andric           GetPredecessor(*MBB->pred_begin());
1735ffd83dbSDimitry Andric       };
1745ffd83dbSDimitry Andric 
1755ffd83dbSDimitry Andric       if (auto *Preheader = ML.getLoopPreheader())
1765ffd83dbSDimitry Andric         GetPredecessor(Preheader);
177fe6060f1SDimitry Andric       else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
1785ffd83dbSDimitry Andric         GetPredecessor(Preheader);
1795ffd83dbSDimitry Andric     }
1805ffd83dbSDimitry Andric   };
1815ffd83dbSDimitry Andric 
182*0fca6ea1SDimitry Andric   class VPTBlock {
183*0fca6ea1SDimitry Andric     SmallVector<MachineInstr *, 4> Insts;
184480093f4SDimitry Andric 
185480093f4SDimitry Andric   public:
VPTBlock(MachineInstr * MI)186*0fca6ea1SDimitry Andric     VPTBlock(MachineInstr *MI) { Insts.push_back(MI); }
187*0fca6ea1SDimitry Andric 
188*0fca6ea1SDimitry Andric     // Have we found an instruction within the block which defines the vpr? If
189*0fca6ea1SDimitry Andric     // so, not all the instructions in the block will have the same predicate.
hasUniformPredicate()190*0fca6ea1SDimitry Andric     bool hasUniformPredicate() { return getDivergent() == nullptr; }
191*0fca6ea1SDimitry Andric 
192*0fca6ea1SDimitry Andric     // If it exists, return the first internal instruction which modifies the
193*0fca6ea1SDimitry Andric     // VPR.
getDivergent()194*0fca6ea1SDimitry Andric     MachineInstr *getDivergent() {
195*0fca6ea1SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = getInsts();
196*0fca6ea1SDimitry Andric       for (unsigned i = 1; i < Insts.size(); ++i) {
197*0fca6ea1SDimitry Andric         MachineInstr *Next = Insts[i];
198*0fca6ea1SDimitry Andric         if (isVectorPredicate(Next))
199*0fca6ea1SDimitry Andric           return Next; // Found an instruction altering the vpr.
200480093f4SDimitry Andric       }
201*0fca6ea1SDimitry Andric       return nullptr;
202*0fca6ea1SDimitry Andric     }
203*0fca6ea1SDimitry Andric 
insert(MachineInstr * MI)204*0fca6ea1SDimitry Andric     void insert(MachineInstr *MI) {
205*0fca6ea1SDimitry Andric       Insts.push_back(MI);
206*0fca6ea1SDimitry Andric       // VPT/VPST + 4 predicated instructions.
207*0fca6ea1SDimitry Andric       assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
208*0fca6ea1SDimitry Andric     }
209*0fca6ea1SDimitry Andric 
containsVCTP() const210*0fca6ea1SDimitry Andric     bool containsVCTP() const { return llvm::any_of(Insts, isVCTP); }
211*0fca6ea1SDimitry Andric 
size() const212*0fca6ea1SDimitry Andric     unsigned size() const { return Insts.size(); }
getInsts()213*0fca6ea1SDimitry Andric     SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
214480093f4SDimitry Andric   };
215480093f4SDimitry Andric 
216e8d8bef9SDimitry Andric   // Represent the current state of the VPR and hold all instances which
217e8d8bef9SDimitry Andric   // represent a VPT block, which is a list of instructions that begins with a
218e8d8bef9SDimitry Andric   // VPT/VPST and has a maximum of four proceeding instructions. All
219e8d8bef9SDimitry Andric   // instructions within the block are predicated upon the vpr and we allow
220e8d8bef9SDimitry Andric   // instructions to define the vpr within in the block too.
221e8d8bef9SDimitry Andric   class VPTState {
222e8d8bef9SDimitry Andric     friend struct LowOverheadLoop;
223e8d8bef9SDimitry Andric 
224*0fca6ea1SDimitry Andric     SmallVector<VPTBlock, 4> Blocks;
225*0fca6ea1SDimitry Andric     SetVector<MachineInstr *> CurrentPredicates;
226*0fca6ea1SDimitry Andric     std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;
227e8d8bef9SDimitry Andric 
CreateVPTBlock(MachineInstr * MI)228*0fca6ea1SDimitry Andric     void CreateVPTBlock(MachineInstr *MI) {
229e8d8bef9SDimitry Andric       assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
230e8d8bef9SDimitry Andric              && "Can't begin VPT without predicate");
231e8d8bef9SDimitry Andric       Blocks.emplace_back(MI);
232e8d8bef9SDimitry Andric       // The execution of MI is predicated upon the current set of instructions
233e8d8bef9SDimitry Andric       // that are AND'ed together to form the VPR predicate value. In the case
234e8d8bef9SDimitry Andric       // that MI is a VPT, CurrentPredicates will also just be MI.
235*0fca6ea1SDimitry Andric       PredicatedInsts[MI] = CurrentPredicates;
236e8d8bef9SDimitry Andric     }
237e8d8bef9SDimitry Andric 
addInst(MachineInstr * MI)238*0fca6ea1SDimitry Andric     void addInst(MachineInstr *MI) {
239e8d8bef9SDimitry Andric       Blocks.back().insert(MI);
240*0fca6ea1SDimitry Andric       PredicatedInsts[MI] = CurrentPredicates;
241e8d8bef9SDimitry Andric     }
242e8d8bef9SDimitry Andric 
addPredicate(MachineInstr * MI)243*0fca6ea1SDimitry Andric     void addPredicate(MachineInstr *MI) {
244e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
245e8d8bef9SDimitry Andric       CurrentPredicates.insert(MI);
246e8d8bef9SDimitry Andric     }
247e8d8bef9SDimitry Andric 
resetPredicate(MachineInstr * MI)248*0fca6ea1SDimitry Andric     void resetPredicate(MachineInstr *MI) {
249e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
250e8d8bef9SDimitry Andric       CurrentPredicates.clear();
251e8d8bef9SDimitry Andric       CurrentPredicates.insert(MI);
252e8d8bef9SDimitry Andric     }
253480093f4SDimitry Andric 
254480093f4SDimitry Andric   public:
255e8d8bef9SDimitry Andric     // Return whether the given instruction is predicated upon a VCTP.
isPredicatedOnVCTP(MachineInstr * MI,bool Exclusive=false)256*0fca6ea1SDimitry Andric     bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
257*0fca6ea1SDimitry Andric       SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
258e8d8bef9SDimitry Andric       if (Exclusive && Predicates.size() != 1)
259e8d8bef9SDimitry Andric         return false;
260*0fca6ea1SDimitry Andric       // We do not know how to convert an else predicate of a VCTP.
261*0fca6ea1SDimitry Andric       if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
262*0fca6ea1SDimitry Andric         return false;
2630eae32dcSDimitry Andric       return llvm::any_of(Predicates, isVCTP);
264480093f4SDimitry Andric     }
265480093f4SDimitry Andric 
266e8d8bef9SDimitry Andric     // Is the VPST, controlling the block entry, predicated upon a VCTP.
isEntryPredicatedOnVCTP(VPTBlock & Block,bool Exclusive=false)267*0fca6ea1SDimitry Andric     bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
268e8d8bef9SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
269e8d8bef9SDimitry Andric       return isPredicatedOnVCTP(Insts.front(), Exclusive);
270e8d8bef9SDimitry Andric     }
271e8d8bef9SDimitry Andric 
272e8d8bef9SDimitry Andric     // If this block begins with a VPT, we can check whether it's using
273e8d8bef9SDimitry Andric     // at least one predicated input(s), as well as possible loop invariant
274e8d8bef9SDimitry Andric     // which would result in it being implicitly predicated.
hasImplicitlyValidVPT(VPTBlock & Block,ReachingDefAnalysis & RDA)275*0fca6ea1SDimitry Andric     bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {
276e8d8bef9SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
277e8d8bef9SDimitry Andric       MachineInstr *VPT = Insts.front();
278e8d8bef9SDimitry Andric       assert(isVPTOpcode(VPT->getOpcode()) &&
279e8d8bef9SDimitry Andric              "Expected VPT block to begin with VPT/VPST");
280e8d8bef9SDimitry Andric 
281e8d8bef9SDimitry Andric       if (VPT->getOpcode() == ARM::MVE_VPST)
282e8d8bef9SDimitry Andric         return false;
283e8d8bef9SDimitry Andric 
284*0fca6ea1SDimitry Andric       // If the VPT block does not define something that is an "output", then
285*0fca6ea1SDimitry Andric       // the tail-predicated version will just perform a subset of the original
286*0fca6ea1SDimitry Andric       // vpt block, where the last lanes should not be used.
287*0fca6ea1SDimitry Andric       if (isVPTOpcode(VPT->getOpcode()) &&
288*0fca6ea1SDimitry Andric           all_of(Block.getInsts(), [](const MachineInstr *MI) {
289*0fca6ea1SDimitry Andric             return !MI->mayStore() && !MI->mayLoad() &&
290*0fca6ea1SDimitry Andric                    !isHorizontalReduction(*MI) && !isVCTP(MI);
291*0fca6ea1SDimitry Andric           }))
292*0fca6ea1SDimitry Andric         return true;
293*0fca6ea1SDimitry Andric 
294e8d8bef9SDimitry Andric       auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
295e8d8bef9SDimitry Andric         MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
296e8d8bef9SDimitry Andric         return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
297e8d8bef9SDimitry Andric       };
298e8d8bef9SDimitry Andric 
299e8d8bef9SDimitry Andric       auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
300e8d8bef9SDimitry Andric         MachineOperand &MO = MI->getOperand(Idx);
301e8d8bef9SDimitry Andric         if (!MO.isReg() || !MO.getReg())
302e8d8bef9SDimitry Andric           return true;
303e8d8bef9SDimitry Andric 
304e8d8bef9SDimitry Andric         SmallPtrSet<MachineInstr *, 2> Defs;
305e8d8bef9SDimitry Andric         RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
306e8d8bef9SDimitry Andric         if (Defs.empty())
307e8d8bef9SDimitry Andric           return true;
308e8d8bef9SDimitry Andric 
309e8d8bef9SDimitry Andric         for (auto *Def : Defs)
310e8d8bef9SDimitry Andric           if (Def->getParent() == VPT->getParent())
311e8d8bef9SDimitry Andric             return false;
312e8d8bef9SDimitry Andric         return true;
313e8d8bef9SDimitry Andric       };
314e8d8bef9SDimitry Andric 
315e8d8bef9SDimitry Andric       // Check that at least one of the operands is directly predicated on a
316e8d8bef9SDimitry Andric       // vctp and allow an invariant value too.
317e8d8bef9SDimitry Andric       return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
318e8d8bef9SDimitry Andric              (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
319e8d8bef9SDimitry Andric              (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
320e8d8bef9SDimitry Andric     }
321e8d8bef9SDimitry Andric 
isValid(ReachingDefAnalysis & RDA)322*0fca6ea1SDimitry Andric     bool isValid(ReachingDefAnalysis &RDA) {
323e8d8bef9SDimitry Andric       // All predication within the loop should be based on vctp. If the block
324e8d8bef9SDimitry Andric       // isn't predicated on entry, check whether the vctp is within the block
325e8d8bef9SDimitry Andric       // and that all other instructions are then predicated on it.
326e8d8bef9SDimitry Andric       for (auto &Block : Blocks) {
327*0fca6ea1SDimitry Andric         if (isEntryPredicatedOnVCTP(Block, false) &&
328*0fca6ea1SDimitry Andric             !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
329*0fca6ea1SDimitry Andric               return getVPTInstrPredicate(*MI) == ARMVCC::Else;
330*0fca6ea1SDimitry Andric             }))
331*0fca6ea1SDimitry Andric           continue;
332*0fca6ea1SDimitry Andric         if (hasImplicitlyValidVPT(Block, RDA))
333e8d8bef9SDimitry Andric           continue;
334e8d8bef9SDimitry Andric 
335e8d8bef9SDimitry Andric         SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
336e8d8bef9SDimitry Andric         // We don't know how to convert a block with just a VPT;VCTP into
337e8d8bef9SDimitry Andric         // anything valid once we remove the VCTP. For now just bail out.
338e8d8bef9SDimitry Andric         assert(isVPTOpcode(Insts.front()->getOpcode()) &&
339e8d8bef9SDimitry Andric                "Expected VPT block to start with a VPST or VPT!");
340e8d8bef9SDimitry Andric         if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
341e8d8bef9SDimitry Andric             isVCTP(Insts.back()))
342e8d8bef9SDimitry Andric           return false;
343e8d8bef9SDimitry Andric 
344e8d8bef9SDimitry Andric         for (auto *MI : Insts) {
345e8d8bef9SDimitry Andric           // Check that any internal VCTPs are 'Then' predicated.
346e8d8bef9SDimitry Andric           if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
347e8d8bef9SDimitry Andric             return false;
348e8d8bef9SDimitry Andric           // Skip other instructions that build up the predicate.
349e8d8bef9SDimitry Andric           if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
350e8d8bef9SDimitry Andric             continue;
351e8d8bef9SDimitry Andric           // Check that any other instructions are predicated upon a vctp.
352e8d8bef9SDimitry Andric           // TODO: We could infer when VPTs are implicitly predicated on the
353e8d8bef9SDimitry Andric           // vctp (when the operands are predicated).
354e8d8bef9SDimitry Andric           if (!isPredicatedOnVCTP(MI)) {
355e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
356e8d8bef9SDimitry Andric             return false;
357e8d8bef9SDimitry Andric           }
358e8d8bef9SDimitry Andric         }
359e8d8bef9SDimitry Andric       }
360e8d8bef9SDimitry Andric       return true;
361e8d8bef9SDimitry Andric     }
3625ffd83dbSDimitry Andric   };
3635ffd83dbSDimitry Andric 
364480093f4SDimitry Andric   struct LowOverheadLoop {
365480093f4SDimitry Andric 
3665ffd83dbSDimitry Andric     MachineLoop &ML;
3675ffd83dbSDimitry Andric     MachineBasicBlock *Preheader = nullptr;
3685ffd83dbSDimitry Andric     MachineLoopInfo &MLI;
3695ffd83dbSDimitry Andric     ReachingDefAnalysis &RDA;
3705ffd83dbSDimitry Andric     const TargetRegisterInfo &TRI;
3715ffd83dbSDimitry Andric     const ARMBaseInstrInfo &TII;
372480093f4SDimitry Andric     MachineFunction *MF = nullptr;
373e8d8bef9SDimitry Andric     MachineBasicBlock::iterator StartInsertPt;
374e8d8bef9SDimitry Andric     MachineBasicBlock *StartInsertBB = nullptr;
375480093f4SDimitry Andric     MachineInstr *Start = nullptr;
376480093f4SDimitry Andric     MachineInstr *Dec = nullptr;
377480093f4SDimitry Andric     MachineInstr *End = nullptr;
378e8d8bef9SDimitry Andric     MachineOperand TPNumElements;
379e8d8bef9SDimitry Andric     SmallVector<MachineInstr *, 4> VCTPs;
3805ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 4> ToRemove;
3815ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
382349cc55cSDimitry Andric     SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
383349cc55cSDimitry Andric     SmallPtrSet<MachineInstr *, 4> VMOVCopies;
384480093f4SDimitry Andric     bool Revert = false;
385480093f4SDimitry Andric     bool CannotTailPredicate = false;
386*0fca6ea1SDimitry Andric     VPTState VPTstate;
387480093f4SDimitry Andric 
LowOverheadLoop__anona50655620111::LowOverheadLoop3885ffd83dbSDimitry Andric     LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
3895ffd83dbSDimitry Andric                     ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
3905ffd83dbSDimitry Andric                     const ARMBaseInstrInfo &TII)
391e8d8bef9SDimitry Andric         : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
392e8d8bef9SDimitry Andric           TPNumElements(MachineOperand::CreateImm(0)) {
3935ffd83dbSDimitry Andric       MF = ML.getHeader()->getParent();
3945ffd83dbSDimitry Andric       if (auto *MBB = ML.getLoopPreheader())
3955ffd83dbSDimitry Andric         Preheader = MBB;
396fe6060f1SDimitry Andric       else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
3975ffd83dbSDimitry Andric         Preheader = MBB;
398480093f4SDimitry Andric     }
399480093f4SDimitry Andric 
400480093f4SDimitry Andric     // If this is an MVE instruction, check that we know how to use tail
401480093f4SDimitry Andric     // predication with it. Record VPT blocks and return whether the
402480093f4SDimitry Andric     // instruction is valid for tail predication.
403480093f4SDimitry Andric     bool ValidateMVEInst(MachineInstr *MI);
404480093f4SDimitry Andric 
AnalyseMVEInst__anona50655620111::LowOverheadLoop405480093f4SDimitry Andric     void AnalyseMVEInst(MachineInstr *MI) {
406480093f4SDimitry Andric       CannotTailPredicate = !ValidateMVEInst(MI);
407480093f4SDimitry Andric     }
408480093f4SDimitry Andric 
IsTailPredicationLegal__anona50655620111::LowOverheadLoop409480093f4SDimitry Andric     bool IsTailPredicationLegal() const {
410480093f4SDimitry Andric       // For now, let's keep things really simple and only support a single
411480093f4SDimitry Andric       // block for tail predication.
412e8d8bef9SDimitry Andric       return !Revert && FoundAllComponents() && !VCTPs.empty() &&
4135ffd83dbSDimitry Andric              !CannotTailPredicate && ML.getNumBlocks() == 1;
414480093f4SDimitry Andric     }
415480093f4SDimitry Andric 
416e8d8bef9SDimitry Andric     // Given that MI is a VCTP, check that is equivalent to any other VCTPs
417e8d8bef9SDimitry Andric     // found.
418e8d8bef9SDimitry Andric     bool AddVCTP(MachineInstr *MI);
419e8d8bef9SDimitry Andric 
4205ffd83dbSDimitry Andric     // Check that the predication in the loop will be equivalent once we
4215ffd83dbSDimitry Andric     // perform the conversion. Also ensure that we can provide the number
4225ffd83dbSDimitry Andric     // of elements to the loop start instruction.
423e8d8bef9SDimitry Andric     bool ValidateTailPredicate();
4245ffd83dbSDimitry Andric 
4255ffd83dbSDimitry Andric     // Check that any values available outside of the loop will be the same
4265ffd83dbSDimitry Andric     // after tail predication conversion.
4275ffd83dbSDimitry Andric     bool ValidateLiveOuts();
428480093f4SDimitry Andric 
429480093f4SDimitry Andric     // Check the branch targets are within range and we satisfy our
430480093f4SDimitry Andric     // restrictions.
431e8d8bef9SDimitry Andric     void Validate(ARMBasicBlockUtils *BBUtils);
432480093f4SDimitry Andric 
FoundAllComponents__anona50655620111::LowOverheadLoop433480093f4SDimitry Andric     bool FoundAllComponents() const {
434480093f4SDimitry Andric       return Start && Dec && End;
435480093f4SDimitry Andric     }
436480093f4SDimitry Andric 
getVPTBlocks__anona50655620111::LowOverheadLoop437*0fca6ea1SDimitry Andric     SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; }
438480093f4SDimitry Andric 
439e8d8bef9SDimitry Andric     // Return the operand for the loop start instruction. This will be the loop
440e8d8bef9SDimitry Andric     // iteration count, or the number of elements if we're tail predicating.
getLoopStartOperand__anona50655620111::LowOverheadLoop441e8d8bef9SDimitry Andric     MachineOperand &getLoopStartOperand() {
442e8d8bef9SDimitry Andric       if (IsTailPredicationLegal())
443e8d8bef9SDimitry Andric         return TPNumElements;
444fe6060f1SDimitry Andric       return Start->getOperand(1);
445480093f4SDimitry Andric     }
446480093f4SDimitry Andric 
getStartOpcode__anona50655620111::LowOverheadLoop447480093f4SDimitry Andric     unsigned getStartOpcode() const {
448fe6060f1SDimitry Andric       bool IsDo = isDoLoopStart(*Start);
449480093f4SDimitry Andric       if (!IsTailPredicationLegal())
450480093f4SDimitry Andric         return IsDo ? ARM::t2DLS : ARM::t2WLS;
451480093f4SDimitry Andric 
452e8d8bef9SDimitry Andric       return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
453480093f4SDimitry Andric     }
454480093f4SDimitry Andric 
dump__anona50655620111::LowOverheadLoop455480093f4SDimitry Andric     void dump() const {
456480093f4SDimitry Andric       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
457480093f4SDimitry Andric       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
458480093f4SDimitry Andric       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
459e8d8bef9SDimitry Andric       if (!VCTPs.empty()) {
460e8d8bef9SDimitry Andric         dbgs() << "ARM Loops: Found VCTP(s):\n";
461e8d8bef9SDimitry Andric         for (auto *MI : VCTPs)
462e8d8bef9SDimitry Andric           dbgs() << " - " << *MI;
463e8d8bef9SDimitry Andric       }
464480093f4SDimitry Andric       if (!FoundAllComponents())
465480093f4SDimitry Andric         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
466480093f4SDimitry Andric       else if (!(Start && Dec && End))
467480093f4SDimitry Andric         dbgs() << "ARM Loops: Failed to find all loop components.\n";
468480093f4SDimitry Andric     }
469480093f4SDimitry Andric   };
470480093f4SDimitry Andric 
4710b57cec5SDimitry Andric   class ARMLowOverheadLoops : public MachineFunctionPass {
4728bcb0991SDimitry Andric     MachineFunction           *MF = nullptr;
473480093f4SDimitry Andric     MachineLoopInfo           *MLI = nullptr;
474480093f4SDimitry Andric     ReachingDefAnalysis       *RDA = nullptr;
4750b57cec5SDimitry Andric     const ARMBaseInstrInfo    *TII = nullptr;
4760b57cec5SDimitry Andric     MachineRegisterInfo       *MRI = nullptr;
477480093f4SDimitry Andric     const TargetRegisterInfo  *TRI = nullptr;
4780b57cec5SDimitry Andric     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
4790b57cec5SDimitry Andric 
4800b57cec5SDimitry Andric   public:
4810b57cec5SDimitry Andric     static char ID;
4820b57cec5SDimitry Andric 
ARMLowOverheadLoops()4830b57cec5SDimitry Andric     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
4840b57cec5SDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const4850b57cec5SDimitry Andric     void getAnalysisUsage(AnalysisUsage &AU) const override {
4860b57cec5SDimitry Andric       AU.setPreservesCFG();
487*0fca6ea1SDimitry Andric       AU.addRequired<MachineLoopInfoWrapperPass>();
488480093f4SDimitry Andric       AU.addRequired<ReachingDefAnalysis>();
4890b57cec5SDimitry Andric       MachineFunctionPass::getAnalysisUsage(AU);
4900b57cec5SDimitry Andric     }
4910b57cec5SDimitry Andric 
4920b57cec5SDimitry Andric     bool runOnMachineFunction(MachineFunction &MF) override;
4930b57cec5SDimitry Andric 
getRequiredProperties() const4940b57cec5SDimitry Andric     MachineFunctionProperties getRequiredProperties() const override {
4950b57cec5SDimitry Andric       return MachineFunctionProperties().set(
496480093f4SDimitry Andric           MachineFunctionProperties::Property::NoVRegs).set(
497480093f4SDimitry Andric           MachineFunctionProperties::Property::TracksLiveness);
4980b57cec5SDimitry Andric     }
4990b57cec5SDimitry Andric 
getPassName() const5000b57cec5SDimitry Andric     StringRef getPassName() const override {
5010b57cec5SDimitry Andric       return ARM_LOW_OVERHEAD_LOOPS_NAME;
5020b57cec5SDimitry Andric     }
5038bcb0991SDimitry Andric 
5048bcb0991SDimitry Andric   private:
5058bcb0991SDimitry Andric     bool ProcessLoop(MachineLoop *ML);
5068bcb0991SDimitry Andric 
5078bcb0991SDimitry Andric     bool RevertNonLoops();
5088bcb0991SDimitry Andric 
5098bcb0991SDimitry Andric     void RevertWhile(MachineInstr *MI) const;
510e8d8bef9SDimitry Andric     void RevertDo(MachineInstr *MI) const;
5118bcb0991SDimitry Andric 
5125ffd83dbSDimitry Andric     bool RevertLoopDec(MachineInstr *MI) const;
5138bcb0991SDimitry Andric 
5148bcb0991SDimitry Andric     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
5158bcb0991SDimitry Andric 
516e8d8bef9SDimitry Andric     void RevertLoopEndDec(MachineInstr *MI) const;
517480093f4SDimitry Andric 
518e8d8bef9SDimitry Andric     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
5195ffd83dbSDimitry Andric 
520480093f4SDimitry Andric     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
521480093f4SDimitry Andric 
522480093f4SDimitry Andric     void Expand(LowOverheadLoop &LoLoop);
5238bcb0991SDimitry Andric 
5245ffd83dbSDimitry Andric     void IterationCountDCE(LowOverheadLoop &LoLoop);
5250b57cec5SDimitry Andric   };
5260b57cec5SDimitry Andric }
5270b57cec5SDimitry Andric 
5280b57cec5SDimitry Andric char ARMLowOverheadLoops::ID = 0;
5290b57cec5SDimitry Andric 
INITIALIZE_PASS(ARMLowOverheadLoops,DEBUG_TYPE,ARM_LOW_OVERHEAD_LOOPS_NAME,false,false)5300b57cec5SDimitry Andric INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
5310b57cec5SDimitry Andric                 false, false)
5320b57cec5SDimitry Andric 
533e8d8bef9SDimitry Andric static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
534e8d8bef9SDimitry Andric                       InstSet &ToRemove, InstSet &Ignore) {
535480093f4SDimitry Andric 
536e8d8bef9SDimitry Andric   // Check that we can remove all of Killed without having to modify any IT
537e8d8bef9SDimitry Andric   // blocks.
538e8d8bef9SDimitry Andric   auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
539e8d8bef9SDimitry Andric     // Collect the dead code and the MBBs in which they reside.
540e8d8bef9SDimitry Andric     SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
541e8d8bef9SDimitry Andric     for (auto *Dead : Killed)
542e8d8bef9SDimitry Andric       BasicBlocks.insert(Dead->getParent());
543e8d8bef9SDimitry Andric 
544e8d8bef9SDimitry Andric     // Collect IT blocks in all affected basic blocks.
545e8d8bef9SDimitry Andric     std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
546e8d8bef9SDimitry Andric     for (auto *MBB : BasicBlocks) {
547e8d8bef9SDimitry Andric       for (auto &IT : *MBB) {
548e8d8bef9SDimitry Andric         if (IT.getOpcode() != ARM::t2IT)
549e8d8bef9SDimitry Andric           continue;
550e8d8bef9SDimitry Andric         RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
551e8d8bef9SDimitry Andric                                  ITBlocks[&IT]);
552e8d8bef9SDimitry Andric       }
553e8d8bef9SDimitry Andric     }
554e8d8bef9SDimitry Andric 
555e8d8bef9SDimitry Andric     // If we're removing all of the instructions within an IT block, then
556e8d8bef9SDimitry Andric     // also remove the IT instruction.
557e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> ModifiedITs;
558e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> RemoveITs;
559e8d8bef9SDimitry Andric     for (auto *Dead : Killed) {
560*0fca6ea1SDimitry Andric       if (MachineOperand *MO =
561*0fca6ea1SDimitry Andric               Dead->findRegisterUseOperand(ARM::ITSTATE, /*TRI=*/nullptr)) {
562e8d8bef9SDimitry Andric         MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
563e8d8bef9SDimitry Andric         RemoveITs.insert(IT);
564e8d8bef9SDimitry Andric         auto &CurrentBlock = ITBlocks[IT];
565e8d8bef9SDimitry Andric         CurrentBlock.erase(Dead);
566e8d8bef9SDimitry Andric         if (CurrentBlock.empty())
567e8d8bef9SDimitry Andric           ModifiedITs.erase(IT);
568e8d8bef9SDimitry Andric         else
569e8d8bef9SDimitry Andric           ModifiedITs.insert(IT);
570e8d8bef9SDimitry Andric       }
571e8d8bef9SDimitry Andric     }
572e8d8bef9SDimitry Andric     if (!ModifiedITs.empty())
573e8d8bef9SDimitry Andric       return false;
574e8d8bef9SDimitry Andric     Killed.insert(RemoveITs.begin(), RemoveITs.end());
575e8d8bef9SDimitry Andric     return true;
576480093f4SDimitry Andric   };
577480093f4SDimitry Andric 
578e8d8bef9SDimitry Andric   SmallPtrSet<MachineInstr *, 2> Uses;
579e8d8bef9SDimitry Andric   if (!RDA.isSafeToRemove(MI, Uses, Ignore))
580e8d8bef9SDimitry Andric     return false;
581480093f4SDimitry Andric 
582e8d8bef9SDimitry Andric   if (WontCorruptITs(Uses, RDA)) {
583e8d8bef9SDimitry Andric     ToRemove.insert(Uses.begin(), Uses.end());
584e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
585e8d8bef9SDimitry Andric                << " - can also remove:\n";
586e8d8bef9SDimitry Andric                for (auto *Use : Uses)
587e8d8bef9SDimitry Andric                  dbgs() << "   - " << *Use);
588480093f4SDimitry Andric 
589e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr*, 4> Killed;
590e8d8bef9SDimitry Andric     RDA.collectKilledOperands(MI, Killed);
591e8d8bef9SDimitry Andric     if (WontCorruptITs(Killed, RDA)) {
592e8d8bef9SDimitry Andric       ToRemove.insert(Killed.begin(), Killed.end());
593e8d8bef9SDimitry Andric       LLVM_DEBUG(for (auto *Dead : Killed)
594e8d8bef9SDimitry Andric                    dbgs() << "   - " << *Dead);
595480093f4SDimitry Andric     }
596e8d8bef9SDimitry Andric     return true;
597e8d8bef9SDimitry Andric   }
598480093f4SDimitry Andric   return false;
599480093f4SDimitry Andric }
600e8d8bef9SDimitry Andric 
ValidateTailPredicate()601e8d8bef9SDimitry Andric bool LowOverheadLoop::ValidateTailPredicate() {
602e8d8bef9SDimitry Andric   if (!IsTailPredicationLegal()) {
603e8d8bef9SDimitry Andric     LLVM_DEBUG(if (VCTPs.empty())
604e8d8bef9SDimitry Andric                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
605e8d8bef9SDimitry Andric                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
606e8d8bef9SDimitry Andric     return false;
607e8d8bef9SDimitry Andric   }
608e8d8bef9SDimitry Andric 
609e8d8bef9SDimitry Andric   assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
610e8d8bef9SDimitry Andric   assert(ML.getBlocks().size() == 1 &&
611e8d8bef9SDimitry Andric          "Shouldn't be processing a loop with more than one block");
612e8d8bef9SDimitry Andric 
613e8d8bef9SDimitry Andric   if (DisableTailPredication) {
614e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
615e8d8bef9SDimitry Andric     return false;
616e8d8bef9SDimitry Andric   }
617e8d8bef9SDimitry Andric 
618*0fca6ea1SDimitry Andric   if (!VPTstate.isValid(RDA)) {
619e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
620e8d8bef9SDimitry Andric     return false;
621e8d8bef9SDimitry Andric   }
622e8d8bef9SDimitry Andric 
623e8d8bef9SDimitry Andric   if (!ValidateLiveOuts()) {
624e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
625e8d8bef9SDimitry Andric     return false;
626e8d8bef9SDimitry Andric   }
627e8d8bef9SDimitry Andric 
628480093f4SDimitry Andric   // For tail predication, we need to provide the number of elements, instead
629480093f4SDimitry Andric   // of the iteration count, to the loop start instruction. The number of
630480093f4SDimitry Andric   // elements is provided to the vctp instruction, so we need to check that
631480093f4SDimitry Andric   // we can use this register at InsertPt.
632e8d8bef9SDimitry Andric   MachineInstr *VCTP = VCTPs.back();
633fe6060f1SDimitry Andric   if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
634fe6060f1SDimitry Andric       Start->getOpcode() == ARM::t2WhileLoopStartTP) {
635e8d8bef9SDimitry Andric     TPNumElements = Start->getOperand(2);
636e8d8bef9SDimitry Andric     StartInsertPt = Start;
637e8d8bef9SDimitry Andric     StartInsertBB = Start->getParent();
638e8d8bef9SDimitry Andric   } else {
639e8d8bef9SDimitry Andric     TPNumElements = VCTP->getOperand(1);
640e8d8bef9SDimitry Andric     MCRegister NumElements = TPNumElements.getReg().asMCReg();
641480093f4SDimitry Andric 
642480093f4SDimitry Andric     // If the register is defined within loop, then we can't perform TP.
643480093f4SDimitry Andric     // TODO: Check whether this is just a mov of a register that would be
644480093f4SDimitry Andric     // available.
6455ffd83dbSDimitry Andric     if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
646480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
647480093f4SDimitry Andric       return false;
648480093f4SDimitry Andric     }
649480093f4SDimitry Andric 
650480093f4SDimitry Andric     // The element count register maybe defined after InsertPt, in which case we
651480093f4SDimitry Andric     // need to try to move either InsertPt or the def so that the [w|d]lstp can
652480093f4SDimitry Andric     // use the value.
653e8d8bef9SDimitry Andric 
654e8d8bef9SDimitry Andric     if (StartInsertPt != StartInsertBB->end() &&
655e8d8bef9SDimitry Andric         !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
656e8d8bef9SDimitry Andric       if (auto *ElemDef =
657e8d8bef9SDimitry Andric               RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
658e8d8bef9SDimitry Andric         if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
659480093f4SDimitry Andric           ElemDef->removeFromParent();
660e8d8bef9SDimitry Andric           StartInsertBB->insert(StartInsertPt, ElemDef);
661e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
662e8d8bef9SDimitry Andric                      << "ARM Loops: Moved element count def: " << *ElemDef);
663e8d8bef9SDimitry Andric         } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
6645ffd83dbSDimitry Andric           StartInsertPt->removeFromParent();
665e8d8bef9SDimitry Andric           StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
666e8d8bef9SDimitry Andric                                      &*StartInsertPt);
667480093f4SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
668480093f4SDimitry Andric         } else {
669e8d8bef9SDimitry Andric           // If we fail to move an instruction and the element count is provided
670e8d8bef9SDimitry Andric           // by a mov, use the mov operand if it will have the same value at the
671e8d8bef9SDimitry Andric           // insertion point
672e8d8bef9SDimitry Andric           MachineOperand Operand = ElemDef->getOperand(1);
673e8d8bef9SDimitry Andric           if (isMovRegOpcode(ElemDef->getOpcode()) &&
674e8d8bef9SDimitry Andric               RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
675e8d8bef9SDimitry Andric                   RDA.getUniqueReachingMIDef(&*StartInsertPt,
676e8d8bef9SDimitry Andric                                              Operand.getReg().asMCReg())) {
677e8d8bef9SDimitry Andric             TPNumElements = Operand;
678e8d8bef9SDimitry Andric             NumElements = TPNumElements.getReg();
679e8d8bef9SDimitry Andric           } else {
680e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs()
681e8d8bef9SDimitry Andric                        << "ARM Loops: Unable to move element count to loop "
682480093f4SDimitry Andric                        << "start instruction.\n");
683480093f4SDimitry Andric             return false;
684480093f4SDimitry Andric           }
685480093f4SDimitry Andric         }
686480093f4SDimitry Andric       }
687e8d8bef9SDimitry Andric     }
688480093f4SDimitry Andric 
689480093f4SDimitry Andric     // Especially in the case of while loops, InsertBB may not be the
690480093f4SDimitry Andric     // preheader, so we need to check that the register isn't redefined
691480093f4SDimitry Andric     // before entering the loop.
6925ffd83dbSDimitry Andric     auto CannotProvideElements = [this](MachineBasicBlock *MBB,
693e8d8bef9SDimitry Andric                                         MCRegister NumElements) {
694e8d8bef9SDimitry Andric       if (MBB->empty())
695e8d8bef9SDimitry Andric         return false;
696480093f4SDimitry Andric       // NumElements is redefined in this block.
6975ffd83dbSDimitry Andric       if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
698480093f4SDimitry Andric         return true;
699480093f4SDimitry Andric 
700480093f4SDimitry Andric       // Don't continue searching up through multiple predecessors.
701480093f4SDimitry Andric       if (MBB->pred_size() > 1)
702480093f4SDimitry Andric         return true;
703480093f4SDimitry Andric 
704480093f4SDimitry Andric       return false;
705480093f4SDimitry Andric     };
706480093f4SDimitry Andric 
707e8d8bef9SDimitry Andric     // Search backwards for a def, until we get to InsertBB.
7085ffd83dbSDimitry Andric     MachineBasicBlock *MBB = Preheader;
709e8d8bef9SDimitry Andric     while (MBB && MBB != StartInsertBB) {
710480093f4SDimitry Andric       if (CannotProvideElements(MBB, NumElements)) {
711480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
712480093f4SDimitry Andric         return false;
713480093f4SDimitry Andric       }
714480093f4SDimitry Andric       MBB = *MBB->pred_begin();
715480093f4SDimitry Andric     }
716e8d8bef9SDimitry Andric   }
717e8d8bef9SDimitry Andric 
718e8d8bef9SDimitry Andric   // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
719e8d8bef9SDimitry Andric   // world the [w|d]lstp instruction would be last instruction in the preheader
720e8d8bef9SDimitry Andric   // and so it would only affect instructions within the loop body. But due to
721e8d8bef9SDimitry Andric   // scheduling, and/or the logic in this pass (above), the insertion point can
722e8d8bef9SDimitry Andric   // be moved earlier. So if the Loop Start isn't the last instruction in the
723e8d8bef9SDimitry Andric   // preheader, and if the initial element count is smaller than the vector
724e8d8bef9SDimitry Andric   // width, the Loop Start instruction will immediately generate one or more
725e8d8bef9SDimitry Andric   // false lane mask which can, incorrectly, affect the proceeding MVE
726e8d8bef9SDimitry Andric   // instructions in the preheader.
727e8d8bef9SDimitry Andric   if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
728e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
729e8d8bef9SDimitry Andric     return false;
730e8d8bef9SDimitry Andric   }
731480093f4SDimitry Andric 
732349cc55cSDimitry Andric   // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
733349cc55cSDimitry Andric   // need to compute an output size that is smaller than the VCTP mask operates
734349cc55cSDimitry Andric   // on. The VecSize of the DoubleWidthResult is the larger vector size - the
735349cc55cSDimitry Andric   // size it extends into, so any VCTP VecSize <= is valid.
736349cc55cSDimitry Andric   unsigned VCTPVecSize = getVecSize(*VCTP);
737349cc55cSDimitry Andric   for (MachineInstr *MI : DoubleWidthResultInstrs) {
738349cc55cSDimitry Andric     unsigned InstrVecSize = getVecSize(*MI);
739349cc55cSDimitry Andric     if (InstrVecSize > VCTPVecSize) {
740349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
741349cc55cSDimitry Andric                         << "VecSize:\n" << *MI);
742349cc55cSDimitry Andric       return false;
743349cc55cSDimitry Andric     }
744349cc55cSDimitry Andric   }
745349cc55cSDimitry Andric 
7465ffd83dbSDimitry Andric   // Check that the value change of the element count is what we expect and
7475ffd83dbSDimitry Andric   // that the predication will be equivalent. For this we need:
7485ffd83dbSDimitry Andric   // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
7495ffd83dbSDimitry Andric   // and we can also allow register copies within the chain too.
7505ffd83dbSDimitry Andric   auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
7515ffd83dbSDimitry Andric     return -getAddSubImmediate(*MI) == ExpectedVecWidth;
7525ffd83dbSDimitry Andric   };
7535ffd83dbSDimitry Andric 
754e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = VCTP->getParent();
755e8d8bef9SDimitry Andric   // Remove modifications to the element count since they have no purpose in a
756e8d8bef9SDimitry Andric   // tail predicated loop. Explicitly refer to the vctp operand no matter which
757e8d8bef9SDimitry Andric   // register NumElements has been assigned to, since that is what the
758e8d8bef9SDimitry Andric   // modifications will be using
759e8d8bef9SDimitry Andric   if (auto *Def = RDA.getUniqueReachingMIDef(
760e8d8bef9SDimitry Andric           &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
7615ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr*, 2> ElementChain;
762e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr*, 2> Ignore;
7635ffd83dbSDimitry Andric     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
7645ffd83dbSDimitry Andric 
765e8d8bef9SDimitry Andric     Ignore.insert(VCTPs.begin(), VCTPs.end());
7665ffd83dbSDimitry Andric 
767e8d8bef9SDimitry Andric     if (TryRemove(Def, RDA, ElementChain, Ignore)) {
7685ffd83dbSDimitry Andric       bool FoundSub = false;
7695ffd83dbSDimitry Andric 
7705ffd83dbSDimitry Andric       for (auto *MI : ElementChain) {
7715ffd83dbSDimitry Andric         if (isMovRegOpcode(MI->getOpcode()))
7725ffd83dbSDimitry Andric           continue;
7735ffd83dbSDimitry Andric 
7745ffd83dbSDimitry Andric         if (isSubImmOpcode(MI->getOpcode())) {
775e8d8bef9SDimitry Andric           if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
776e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
777e8d8bef9SDimitry Andric                        " count: " << *MI);
7785ffd83dbSDimitry Andric             return false;
7795ffd83dbSDimitry Andric           }
780e8d8bef9SDimitry Andric           FoundSub = true;
781e8d8bef9SDimitry Andric         } else {
782e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
783e8d8bef9SDimitry Andric                      " count: " << *MI);
784e8d8bef9SDimitry Andric           return false;
785e8d8bef9SDimitry Andric         }
786e8d8bef9SDimitry Andric       }
7875ffd83dbSDimitry Andric       ToRemove.insert(ElementChain.begin(), ElementChain.end());
7885ffd83dbSDimitry Andric     }
7895ffd83dbSDimitry Andric   }
790fe6060f1SDimitry Andric 
791fe6060f1SDimitry Andric   // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
792fe6060f1SDimitry Andric   // can also remove any extra instructions in the preheader, which often
793fe6060f1SDimitry Andric   // includes a now unused MOV.
794fe6060f1SDimitry Andric   if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
795fe6060f1SDimitry Andric        Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
796fe6060f1SDimitry Andric       Preheader && !Preheader->empty() &&
797fe6060f1SDimitry Andric       !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
798fe6060f1SDimitry Andric     if (auto *Def = RDA.getUniqueReachingMIDef(
799fe6060f1SDimitry Andric             &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
800fe6060f1SDimitry Andric       SmallPtrSet<MachineInstr*, 2> Ignore;
801fe6060f1SDimitry Andric       Ignore.insert(VCTPs.begin(), VCTPs.end());
802fe6060f1SDimitry Andric       TryRemove(Def, RDA, ToRemove, Ignore);
803fe6060f1SDimitry Andric     }
804fe6060f1SDimitry Andric   }
805fe6060f1SDimitry Andric 
806480093f4SDimitry Andric   return true;
807480093f4SDimitry Andric }
808480093f4SDimitry Andric 
isRegInClass(const MachineOperand & MO,const TargetRegisterClass * Class)8095ffd83dbSDimitry Andric static bool isRegInClass(const MachineOperand &MO,
8105ffd83dbSDimitry Andric                          const TargetRegisterClass *Class) {
8115ffd83dbSDimitry Andric   return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
8125ffd83dbSDimitry Andric }
8135ffd83dbSDimitry Andric 
8145ffd83dbSDimitry Andric // MVE 'narrowing' operate on half a lane, reading from half and writing
8155ffd83dbSDimitry Andric // to half, which are referred to has the top and bottom half. The other
8165ffd83dbSDimitry Andric // half retains its previous value.
retainsPreviousHalfElement(const MachineInstr & MI)8175ffd83dbSDimitry Andric static bool retainsPreviousHalfElement(const MachineInstr &MI) {
8185ffd83dbSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
8195ffd83dbSDimitry Andric   uint64_t Flags = MCID.TSFlags;
8205ffd83dbSDimitry Andric   return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
8215ffd83dbSDimitry Andric }
8225ffd83dbSDimitry Andric 
8235ffd83dbSDimitry Andric // Some MVE instructions read from the top/bottom halves of their operand(s)
8245ffd83dbSDimitry Andric // and generate a vector result with result elements that are double the
8255ffd83dbSDimitry Andric // width of the input.
producesDoubleWidthResult(const MachineInstr & MI)8265ffd83dbSDimitry Andric static bool producesDoubleWidthResult(const MachineInstr &MI) {
8275ffd83dbSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
8285ffd83dbSDimitry Andric   uint64_t Flags = MCID.TSFlags;
8295ffd83dbSDimitry Andric   return (Flags & ARMII::DoubleWidthResult) != 0;
8305ffd83dbSDimitry Andric }
8315ffd83dbSDimitry Andric 
8325ffd83dbSDimitry Andric // Can this instruction generate a non-zero result when given only zeroed
8335ffd83dbSDimitry Andric // operands? This allows us to know that, given operands with false bytes
8345ffd83dbSDimitry Andric // zeroed by masked loads, that the result will also contain zeros in those
8355ffd83dbSDimitry Andric // bytes.
canGenerateNonZeros(const MachineInstr & MI)8365ffd83dbSDimitry Andric static bool canGenerateNonZeros(const MachineInstr &MI) {
8375ffd83dbSDimitry Andric 
8385ffd83dbSDimitry Andric   // Check for instructions which can write into a larger element size,
8395ffd83dbSDimitry Andric   // possibly writing into a previous zero'd lane.
8405ffd83dbSDimitry Andric   if (producesDoubleWidthResult(MI))
8415ffd83dbSDimitry Andric     return true;
8425ffd83dbSDimitry Andric 
8435ffd83dbSDimitry Andric   switch (MI.getOpcode()) {
8445ffd83dbSDimitry Andric   default:
8455ffd83dbSDimitry Andric     break;
8465ffd83dbSDimitry Andric   // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
8475ffd83dbSDimitry Andric   // fp16 -> fp32 vector conversions.
8485ffd83dbSDimitry Andric   // Instructions that perform a NOT will generate 1s from 0s.
8495ffd83dbSDimitry Andric   case ARM::MVE_VMVN:
8505ffd83dbSDimitry Andric   case ARM::MVE_VORN:
8515ffd83dbSDimitry Andric   // Count leading zeros will do just that!
8525ffd83dbSDimitry Andric   case ARM::MVE_VCLZs8:
8535ffd83dbSDimitry Andric   case ARM::MVE_VCLZs16:
8545ffd83dbSDimitry Andric   case ARM::MVE_VCLZs32:
8555ffd83dbSDimitry Andric     return true;
8565ffd83dbSDimitry Andric   }
8575ffd83dbSDimitry Andric   return false;
8585ffd83dbSDimitry Andric }
8595ffd83dbSDimitry Andric 
8605ffd83dbSDimitry Andric // Look at its register uses to see if it only can only receive zeros
8615ffd83dbSDimitry Andric // into its false lanes which would then produce zeros. Also check that
8625ffd83dbSDimitry Andric // the output register is also defined by an FalseLanesZero instruction
8635ffd83dbSDimitry Andric // so that if tail-predication happens, the lanes that aren't updated will
8645ffd83dbSDimitry Andric // still be zeros.
producesFalseLanesZero(MachineInstr & MI,const TargetRegisterClass * QPRs,const ReachingDefAnalysis & RDA,InstSet & FalseLanesZero)8655ffd83dbSDimitry Andric static bool producesFalseLanesZero(MachineInstr &MI,
8665ffd83dbSDimitry Andric                                    const TargetRegisterClass *QPRs,
8675ffd83dbSDimitry Andric                                    const ReachingDefAnalysis &RDA,
8685ffd83dbSDimitry Andric                                    InstSet &FalseLanesZero) {
8695ffd83dbSDimitry Andric   if (canGenerateNonZeros(MI))
8705ffd83dbSDimitry Andric     return false;
8715ffd83dbSDimitry Andric 
872e8d8bef9SDimitry Andric   bool isPredicated = isVectorPredicated(&MI);
873e8d8bef9SDimitry Andric   // Predicated loads will write zeros to the falsely predicated bytes of the
874e8d8bef9SDimitry Andric   // destination register.
875e8d8bef9SDimitry Andric   if (MI.mayLoad())
876e8d8bef9SDimitry Andric     return isPredicated;
877e8d8bef9SDimitry Andric 
878e8d8bef9SDimitry Andric   auto IsZeroInit = [](MachineInstr *Def) {
879e8d8bef9SDimitry Andric     return !isVectorPredicated(Def) &&
880e8d8bef9SDimitry Andric            Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
881e8d8bef9SDimitry Andric            Def->getOperand(1).getImm() == 0;
882e8d8bef9SDimitry Andric   };
883e8d8bef9SDimitry Andric 
8845ffd83dbSDimitry Andric   bool AllowScalars = isHorizontalReduction(MI);
8855ffd83dbSDimitry Andric   for (auto &MO : MI.operands()) {
8865ffd83dbSDimitry Andric     if (!MO.isReg() || !MO.getReg())
8875ffd83dbSDimitry Andric       continue;
8885ffd83dbSDimitry Andric     if (!isRegInClass(MO, QPRs) && AllowScalars)
8895ffd83dbSDimitry Andric       continue;
890349cc55cSDimitry Andric     // Skip the lr predicate reg
891349cc55cSDimitry Andric     int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
89206c3fb27SDimitry Andric     if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
893349cc55cSDimitry Andric       continue;
894e8d8bef9SDimitry Andric 
895e8d8bef9SDimitry Andric     // Check that this instruction will produce zeros in its false lanes:
896e8d8bef9SDimitry Andric     // - If it only consumes false lanes zero or constant 0 (vmov #0)
897e8d8bef9SDimitry Andric     // - If it's predicated, it only matters that it's def register already has
898e8d8bef9SDimitry Andric     //   false lane zeros, so we can ignore the uses.
899e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> Defs;
900e8d8bef9SDimitry Andric     RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
901bdd1243dSDimitry Andric     if (Defs.empty())
902bdd1243dSDimitry Andric       return false;
903e8d8bef9SDimitry Andric     for (auto *Def : Defs) {
904e8d8bef9SDimitry Andric       if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
905e8d8bef9SDimitry Andric         continue;
906e8d8bef9SDimitry Andric       if (MO.isUse() && isPredicated)
9075ffd83dbSDimitry Andric         continue;
9085ffd83dbSDimitry Andric       return false;
9095ffd83dbSDimitry Andric     }
910e8d8bef9SDimitry Andric   }
9115ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
9125ffd83dbSDimitry Andric   return true;
9135ffd83dbSDimitry Andric }
9145ffd83dbSDimitry Andric 
ValidateLiveOuts()9155ffd83dbSDimitry Andric bool LowOverheadLoop::ValidateLiveOuts() {
9165ffd83dbSDimitry Andric   // We want to find out if the tail-predicated version of this loop will
9175ffd83dbSDimitry Andric   // produce the same values as the loop in its original form. For this to
9185ffd83dbSDimitry Andric   // be true, the newly inserted implicit predication must not change the
9195ffd83dbSDimitry Andric   // the (observable) results.
9205ffd83dbSDimitry Andric   // We're doing this because many instructions in the loop will not be
9215ffd83dbSDimitry Andric   // predicated and so the conversion from VPT predication to tail-predication
9225ffd83dbSDimitry Andric   // can result in different values being produced; due to the tail-predication
9235ffd83dbSDimitry Andric   // preventing many instructions from updating their falsely predicated
9245ffd83dbSDimitry Andric   // lanes. This analysis assumes that all the instructions perform lane-wise
9255ffd83dbSDimitry Andric   // operations and don't perform any exchanges.
9265ffd83dbSDimitry Andric   // A masked load, whether through VPT or tail predication, will write zeros
9275ffd83dbSDimitry Andric   // to any of the falsely predicated bytes. So, from the loads, we know that
9285ffd83dbSDimitry Andric   // the false lanes are zeroed and here we're trying to track that those false
9295ffd83dbSDimitry Andric   // lanes remain zero, or where they change, the differences are masked away
9305ffd83dbSDimitry Andric   // by their user(s).
931e8d8bef9SDimitry Andric   // All MVE stores have to be predicated, so we know that any predicate load
9325ffd83dbSDimitry Andric   // operands, or stored results are equivalent already. Other explicitly
9335ffd83dbSDimitry Andric   // predicated instructions will perform the same operation in the original
9345ffd83dbSDimitry Andric   // loop and the tail-predicated form too. Because of this, we can insert
9355ffd83dbSDimitry Andric   // loads, stores and other predicated instructions into our Predicated
9365ffd83dbSDimitry Andric   // set and build from there.
9375ffd83dbSDimitry Andric   const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
9385ffd83dbSDimitry Andric   SetVector<MachineInstr *> FalseLanesUnknown;
9395ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
9405ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> Predicated;
9415ffd83dbSDimitry Andric   MachineBasicBlock *Header = ML.getHeader();
9425ffd83dbSDimitry Andric 
943349cc55cSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
944349cc55cSDimitry Andric 
9455ffd83dbSDimitry Andric   for (auto &MI : *Header) {
946e8d8bef9SDimitry Andric     if (!shouldInspect(MI))
9475ffd83dbSDimitry Andric       continue;
9485ffd83dbSDimitry Andric 
9495ffd83dbSDimitry Andric     if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
9505ffd83dbSDimitry Andric       continue;
9515ffd83dbSDimitry Andric 
952e8d8bef9SDimitry Andric     bool isPredicated = isVectorPredicated(&MI);
953e8d8bef9SDimitry Andric     bool retainsOrReduces =
954e8d8bef9SDimitry Andric       retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
955e8d8bef9SDimitry Andric 
956e8d8bef9SDimitry Andric     if (isPredicated)
9575ffd83dbSDimitry Andric       Predicated.insert(&MI);
958e8d8bef9SDimitry Andric     if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
9595ffd83dbSDimitry Andric       FalseLanesZero.insert(&MI);
960e8d8bef9SDimitry Andric     else if (MI.getNumDefs() == 0)
961e8d8bef9SDimitry Andric       continue;
962349cc55cSDimitry Andric     else if (!isPredicated && retainsOrReduces) {
963349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "  Unpredicated instruction that retainsOrReduces: " << MI);
964e8d8bef9SDimitry Andric       return false;
965349cc55cSDimitry Andric     } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
966e8d8bef9SDimitry Andric       FalseLanesUnknown.insert(&MI);
9675ffd83dbSDimitry Andric   }
9685ffd83dbSDimitry Andric 
969349cc55cSDimitry Andric   LLVM_DEBUG({
970349cc55cSDimitry Andric     dbgs() << "  Predicated:\n";
971349cc55cSDimitry Andric     for (auto *I : Predicated)
972349cc55cSDimitry Andric       dbgs() << "  " << *I;
973349cc55cSDimitry Andric     dbgs() << "  FalseLanesZero:\n";
974349cc55cSDimitry Andric     for (auto *I : FalseLanesZero)
975349cc55cSDimitry Andric       dbgs() << "  " << *I;
976349cc55cSDimitry Andric     dbgs() << "  FalseLanesUnknown:\n";
977349cc55cSDimitry Andric     for (auto *I : FalseLanesUnknown)
978349cc55cSDimitry Andric       dbgs() << "  " << *I;
979349cc55cSDimitry Andric   });
980349cc55cSDimitry Andric 
9815ffd83dbSDimitry Andric   auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
9825ffd83dbSDimitry Andric                               SmallPtrSetImpl<MachineInstr *> &Predicated) {
9835ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 2> Uses;
984e8d8bef9SDimitry Andric     RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
9855ffd83dbSDimitry Andric     for (auto *Use : Uses) {
9865ffd83dbSDimitry Andric       if (Use != MI && !Predicated.count(Use))
9875ffd83dbSDimitry Andric         return false;
9885ffd83dbSDimitry Andric     }
9895ffd83dbSDimitry Andric     return true;
9905ffd83dbSDimitry Andric   };
9915ffd83dbSDimitry Andric 
9925ffd83dbSDimitry Andric   // Visit the unknowns in reverse so that we can start at the values being
9935ffd83dbSDimitry Andric   // stored and then we can work towards the leaves, hopefully adding more
9945ffd83dbSDimitry Andric   // instructions to Predicated. Successfully terminating the loop means that
9955ffd83dbSDimitry Andric   // all the unknown values have to found to be masked by predicated user(s).
9965ffd83dbSDimitry Andric   // For any unpredicated values, we store them in NonPredicated so that we
9975ffd83dbSDimitry Andric   // can later check whether these form a reduction.
9985ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 2> NonPredicated;
9995ffd83dbSDimitry Andric   for (auto *MI : reverse(FalseLanesUnknown)) {
10005ffd83dbSDimitry Andric     for (auto &MO : MI->operands()) {
10015ffd83dbSDimitry Andric       if (!isRegInClass(MO, QPRs) || !MO.isDef())
10025ffd83dbSDimitry Andric         continue;
10035ffd83dbSDimitry Andric       if (!HasPredicatedUsers(MI, MO, Predicated)) {
1004349cc55cSDimitry Andric         LLVM_DEBUG(dbgs() << "  Found an unknown def of : "
10055ffd83dbSDimitry Andric                           << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
10065ffd83dbSDimitry Andric         NonPredicated.insert(MI);
1007e8d8bef9SDimitry Andric         break;
10085ffd83dbSDimitry Andric       }
10095ffd83dbSDimitry Andric     }
10105ffd83dbSDimitry Andric     // Any unknown false lanes have been masked away by the user(s).
1011e8d8bef9SDimitry Andric     if (!NonPredicated.contains(MI))
10125ffd83dbSDimitry Andric       Predicated.insert(MI);
10135ffd83dbSDimitry Andric   }
10145ffd83dbSDimitry Andric 
10155ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
10165ffd83dbSDimitry Andric   SmallVector<MachineBasicBlock *, 2> ExitBlocks;
10175ffd83dbSDimitry Andric   ML.getExitBlocks(ExitBlocks);
10185ffd83dbSDimitry Andric   assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
10195ffd83dbSDimitry Andric   assert(ExitBlocks.size() == 1 && "Expected a single exit block");
10205ffd83dbSDimitry Andric   MachineBasicBlock *ExitBB = ExitBlocks.front();
10215ffd83dbSDimitry Andric   for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1022e8d8bef9SDimitry Andric     // TODO: Instead of blocking predication, we could move the vctp to the exit
1023e8d8bef9SDimitry Andric     // block and calculate it's operand there in or the preheader.
1024349cc55cSDimitry Andric     if (RegMask.PhysReg == ARM::VPR) {
1025349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "  VPR is live in to the exit block.");
1026e8d8bef9SDimitry Andric       return false;
1027349cc55cSDimitry Andric     }
10285ffd83dbSDimitry Andric     // Check Q-regs that are live in the exit blocks. We don't collect scalars
10295ffd83dbSDimitry Andric     // because they won't be affected by lane predication.
1030e8d8bef9SDimitry Andric     if (QPRs->contains(RegMask.PhysReg))
10315ffd83dbSDimitry Andric       if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
10325ffd83dbSDimitry Andric         LiveOutMIs.insert(MI);
10335ffd83dbSDimitry Andric   }
10345ffd83dbSDimitry Andric 
10355ffd83dbSDimitry Andric   // We've already validated that any VPT predication within the loop will be
10365ffd83dbSDimitry Andric   // equivalent when we perform the predication transformation; so we know that
10375ffd83dbSDimitry Andric   // any VPT predicated instruction is predicated upon VCTP. Any live-out
10385ffd83dbSDimitry Andric   // instruction needs to be predicated, so check this here. The instructions
10395ffd83dbSDimitry Andric   // in NonPredicated have been found to be a reduction that we can ensure its
1040349cc55cSDimitry Andric   // legality. Any MQPRCopy found will need to validate its input as if it was
1041349cc55cSDimitry Andric   // live out.
1042349cc55cSDimitry Andric   SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1043349cc55cSDimitry Andric   while (!Worklist.empty()) {
1044349cc55cSDimitry Andric     MachineInstr *MI = Worklist.pop_back_val();
1045349cc55cSDimitry Andric     if (MI->getOpcode() == ARM::MQPRCopy) {
1046349cc55cSDimitry Andric       VMOVCopies.insert(MI);
1047349cc55cSDimitry Andric       MachineInstr *CopySrc =
1048349cc55cSDimitry Andric           RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
1049349cc55cSDimitry Andric       if (CopySrc)
1050349cc55cSDimitry Andric         Worklist.push_back(CopySrc);
1051349cc55cSDimitry Andric     } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
1052349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1053349cc55cSDimitry Andric       VMOVCopies.clear();
10545ffd83dbSDimitry Andric       return false;
1055e8d8bef9SDimitry Andric     }
1056e8d8bef9SDimitry Andric   }
10575ffd83dbSDimitry Andric 
10585ffd83dbSDimitry Andric   return true;
10595ffd83dbSDimitry Andric }
10605ffd83dbSDimitry Andric 
Validate(ARMBasicBlockUtils * BBUtils)1061e8d8bef9SDimitry Andric void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1062480093f4SDimitry Andric   if (Revert)
1063480093f4SDimitry Andric     return;
1064480093f4SDimitry Andric 
1065e8d8bef9SDimitry Andric   // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1066e8d8bef9SDimitry Andric   // can only jump back.
1067e8d8bef9SDimitry Andric   auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
1068e8d8bef9SDimitry Andric                            ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1069e8d8bef9SDimitry Andric     MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1070e8d8bef9SDimitry Andric                                    ? End->getOperand(1).getMBB()
1071e8d8bef9SDimitry Andric                                    : End->getOperand(2).getMBB();
1072480093f4SDimitry Andric     // TODO Maybe there's cases where the target doesn't have to be the header,
1073480093f4SDimitry Andric     // but for now be safe and revert.
1074e8d8bef9SDimitry Andric     if (TgtBB != ML.getHeader()) {
1075e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1076e8d8bef9SDimitry Andric       return false;
1077480093f4SDimitry Andric     }
1078480093f4SDimitry Andric 
1079480093f4SDimitry Andric     // The WLS and LE instructions have 12-bits for the label offset. WLS
1080480093f4SDimitry Andric     // requires a positive offset, while LE uses negative.
10815ffd83dbSDimitry Andric     if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
10825ffd83dbSDimitry Andric         !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
1083480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1084e8d8bef9SDimitry Andric       return false;
1085480093f4SDimitry Andric     }
1086480093f4SDimitry Andric 
1087fe6060f1SDimitry Andric     if (isWhileLoopStart(*Start)) {
1088fe6060f1SDimitry Andric       MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1089fe6060f1SDimitry Andric       if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1090fe6060f1SDimitry Andric           !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
1091480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1092e8d8bef9SDimitry Andric         return false;
1093e8d8bef9SDimitry Andric       }
1094fe6060f1SDimitry Andric     }
1095e8d8bef9SDimitry Andric     return true;
1096e8d8bef9SDimitry Andric   };
1097e8d8bef9SDimitry Andric 
1098fe6060f1SDimitry Andric   StartInsertPt = MachineBasicBlock::iterator(Start);
1099fe6060f1SDimitry Andric   StartInsertBB = Start->getParent();
1100fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1101fe6060f1SDimitry Andric                     << *StartInsertPt);
1102480093f4SDimitry Andric 
1103e8d8bef9SDimitry Andric   Revert = !ValidateRanges(Start, End, BBUtils, ML);
1104e8d8bef9SDimitry Andric   CannotTailPredicate = !ValidateTailPredicate();
1105480093f4SDimitry Andric }
1106480093f4SDimitry Andric 
AddVCTP(MachineInstr * MI)1107e8d8bef9SDimitry Andric bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1108e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1109e8d8bef9SDimitry Andric   if (VCTPs.empty()) {
1110e8d8bef9SDimitry Andric     VCTPs.push_back(MI);
1111e8d8bef9SDimitry Andric     return true;
1112e8d8bef9SDimitry Andric   }
1113e8d8bef9SDimitry Andric 
1114e8d8bef9SDimitry Andric   // If we find another VCTP, check whether it uses the same value as the main VCTP.
1115e8d8bef9SDimitry Andric   // If it does, store it in the VCTPs set, else refuse it.
1116e8d8bef9SDimitry Andric   MachineInstr *Prev = VCTPs.back();
1117e8d8bef9SDimitry Andric   if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1118e8d8bef9SDimitry Andric       !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1119e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1120e8d8bef9SDimitry Andric                          "definition from the main VCTP");
1121e8d8bef9SDimitry Andric     return false;
1122e8d8bef9SDimitry Andric   }
1123e8d8bef9SDimitry Andric   VCTPs.push_back(MI);
1124e8d8bef9SDimitry Andric   return true;
1125480093f4SDimitry Andric }
1126480093f4SDimitry Andric 
ValidateMVEStore(MachineInstr * MI,MachineLoop * ML)1127fe6060f1SDimitry Andric static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
1128fe6060f1SDimitry Andric 
1129fe6060f1SDimitry Andric   auto GetFrameIndex = [](MachineMemOperand *Operand) {
1130fe6060f1SDimitry Andric     const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1131fe6060f1SDimitry Andric     if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1132fe6060f1SDimitry Andric       if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1133fe6060f1SDimitry Andric         return FS->getFrameIndex();
1134fe6060f1SDimitry Andric       }
1135fe6060f1SDimitry Andric     }
1136fe6060f1SDimitry Andric     return -1;
1137fe6060f1SDimitry Andric   };
1138fe6060f1SDimitry Andric 
1139fe6060f1SDimitry Andric   auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1140fe6060f1SDimitry Andric     switch (I->getOpcode()) {
1141fe6060f1SDimitry Andric     case ARM::MVE_VSTRWU32:
1142fe6060f1SDimitry Andric     case ARM::MVE_VLDRWU32: {
1143fe6060f1SDimitry Andric       return I->getOperand(1).getReg() == ARM::SP &&
1144fe6060f1SDimitry Andric              I->memoperands().size() == 1 &&
1145fe6060f1SDimitry Andric              GetFrameIndex(I->memoperands().front()) >= 0;
1146fe6060f1SDimitry Andric     }
1147fe6060f1SDimitry Andric     default:
1148fe6060f1SDimitry Andric       return false;
1149fe6060f1SDimitry Andric     }
1150fe6060f1SDimitry Andric   };
1151fe6060f1SDimitry Andric 
1152fe6060f1SDimitry Andric   // An unpredicated vector register spill is allowed if all of the uses of the
1153fe6060f1SDimitry Andric   // stack slot are within the loop
1154fe6060f1SDimitry Andric   if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1155fe6060f1SDimitry Andric     return false;
1156fe6060f1SDimitry Andric 
1157fe6060f1SDimitry Andric   // Search all blocks after the loop for accesses to the same stack slot.
1158fe6060f1SDimitry Andric   // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1159fe6060f1SDimitry Andric   // live-out (which sp never is) to know what blocks to look in
1160fe6060f1SDimitry Andric   if (MI->memoperands().size() == 0)
1161fe6060f1SDimitry Andric     return false;
1162fe6060f1SDimitry Andric   int FI = GetFrameIndex(MI->memoperands().front());
1163fe6060f1SDimitry Andric 
1164349cc55cSDimitry Andric   auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1165fe6060f1SDimitry Andric   if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1166fe6060f1SDimitry Andric     return false;
1167fe6060f1SDimitry Andric 
1168fe6060f1SDimitry Andric   SmallVector<MachineBasicBlock *> Frontier;
1169fe6060f1SDimitry Andric   ML->getExitBlocks(Frontier);
1170fe6060f1SDimitry Andric   SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1171fe6060f1SDimitry Andric   unsigned Idx = 0;
1172fe6060f1SDimitry Andric   while (Idx < Frontier.size()) {
1173fe6060f1SDimitry Andric     MachineBasicBlock *BB = Frontier[Idx];
1174fe6060f1SDimitry Andric     bool LookAtSuccessors = true;
1175fe6060f1SDimitry Andric     for (auto &I : *BB) {
1176fe6060f1SDimitry Andric       if (!IsStackOp(&I) || I.memoperands().size() == 0)
1177fe6060f1SDimitry Andric         continue;
1178fe6060f1SDimitry Andric       if (GetFrameIndex(I.memoperands().front()) != FI)
1179fe6060f1SDimitry Andric         continue;
1180fe6060f1SDimitry Andric       // If this block has a store to the stack slot before any loads then we
1181fe6060f1SDimitry Andric       // can ignore the block
1182fe6060f1SDimitry Andric       if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1183fe6060f1SDimitry Andric         LookAtSuccessors = false;
1184fe6060f1SDimitry Andric         break;
1185fe6060f1SDimitry Andric       }
1186fe6060f1SDimitry Andric       // If the store and the load are using the same stack slot then the
1187fe6060f1SDimitry Andric       // store isn't valid for tail predication
1188fe6060f1SDimitry Andric       if (I.getOpcode() == ARM::MVE_VLDRWU32)
1189fe6060f1SDimitry Andric         return false;
1190fe6060f1SDimitry Andric     }
1191fe6060f1SDimitry Andric 
1192fe6060f1SDimitry Andric     if (LookAtSuccessors) {
1193bdd1243dSDimitry Andric       for (auto *Succ : BB->successors()) {
1194fe6060f1SDimitry Andric         if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1195fe6060f1SDimitry Andric           Frontier.push_back(Succ);
1196fe6060f1SDimitry Andric       }
1197fe6060f1SDimitry Andric     }
1198fe6060f1SDimitry Andric     Visited.insert(BB);
1199fe6060f1SDimitry Andric     Idx++;
1200fe6060f1SDimitry Andric   }
1201fe6060f1SDimitry Andric 
1202fe6060f1SDimitry Andric   return true;
1203fe6060f1SDimitry Andric }
1204fe6060f1SDimitry Andric 
ValidateMVEInst(MachineInstr * MI)1205480093f4SDimitry Andric bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1206480093f4SDimitry Andric   if (CannotTailPredicate)
1207480093f4SDimitry Andric     return false;
1208480093f4SDimitry Andric 
1209e8d8bef9SDimitry Andric   if (!shouldInspect(*MI))
1210480093f4SDimitry Andric     return true;
1211e8d8bef9SDimitry Andric 
1212e8d8bef9SDimitry Andric   if (MI->getOpcode() == ARM::MVE_VPSEL ||
12135ffd83dbSDimitry Andric       MI->getOpcode() == ARM::MVE_VPNOT) {
1214480093f4SDimitry Andric     // TODO: Allow VPSEL and VPNOT, we currently cannot because:
1215480093f4SDimitry Andric     // 1) It will use the VPR as a predicate operand, but doesn't have to be
1216480093f4SDimitry Andric     //    instead a VPT block, which means we can assert while building up
12175ffd83dbSDimitry Andric     //    the VPT block because we don't find another VPT or VPST to being a new
1218480093f4SDimitry Andric     //    one.
1219480093f4SDimitry Andric     // 2) VPSEL still requires a VPR operand even after tail predicating,
1220480093f4SDimitry Andric     //    which means we can't remove it unless there is another
1221480093f4SDimitry Andric     //    instruction, such as vcmp, that can provide the VPR def.
12225ffd83dbSDimitry Andric     return false;
12235ffd83dbSDimitry Andric   }
1224480093f4SDimitry Andric 
1225e8d8bef9SDimitry Andric   // Record all VCTPs and check that they're equivalent to one another.
1226e8d8bef9SDimitry Andric   if (isVCTP(MI) && !AddVCTP(MI))
1227e8d8bef9SDimitry Andric     return false;
1228e8d8bef9SDimitry Andric 
1229e8d8bef9SDimitry Andric   // Inspect uses first so that any instructions that alter the VPR don't
1230e8d8bef9SDimitry Andric   // alter the predicate upon themselves.
1231480093f4SDimitry Andric   const MCInstrDesc &MCID = MI->getDesc();
1232e8d8bef9SDimitry Andric   bool IsUse = false;
1233e8d8bef9SDimitry Andric   unsigned LastOpIdx = MI->getNumOperands() - 1;
123406c3fb27SDimitry Andric   for (const auto &Op : enumerate(reverse(MCID.operands()))) {
1235e8d8bef9SDimitry Andric     const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1236e8d8bef9SDimitry Andric     if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
1237480093f4SDimitry Andric       continue;
1238480093f4SDimitry Andric 
1239e8d8bef9SDimitry Andric     if (ARM::isVpred(Op.value().OperandType)) {
1240*0fca6ea1SDimitry Andric       VPTstate.addInst(MI);
1241480093f4SDimitry Andric       IsUse = true;
1242e8d8bef9SDimitry Andric     } else if (MI->getOpcode() != ARM::MVE_VPST) {
1243480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1244480093f4SDimitry Andric       return false;
1245480093f4SDimitry Andric     }
1246480093f4SDimitry Andric   }
1247480093f4SDimitry Andric 
1248480093f4SDimitry Andric   // If we find an instruction that has been marked as not valid for tail
1249480093f4SDimitry Andric   // predication, only allow the instruction if it's contained within a valid
1250480093f4SDimitry Andric   // VPT block.
1251e8d8bef9SDimitry Andric   bool RequiresExplicitPredication =
1252e8d8bef9SDimitry Andric     (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
1253e8d8bef9SDimitry Andric   if (isDomainMVE(MI) && RequiresExplicitPredication) {
1254349cc55cSDimitry Andric     if (MI->getOpcode() == ARM::MQPRCopy)
1255349cc55cSDimitry Andric       return true;
1256349cc55cSDimitry Andric     if (!IsUse && producesDoubleWidthResult(*MI)) {
1257349cc55cSDimitry Andric       DoubleWidthResultInstrs.insert(MI);
1258349cc55cSDimitry Andric       return true;
1259349cc55cSDimitry Andric     }
1260349cc55cSDimitry Andric 
1261349cc55cSDimitry Andric     LLVM_DEBUG(if (!IsUse) dbgs()
1262349cc55cSDimitry Andric                << "ARM Loops: Can't tail predicate: " << *MI);
1263e8d8bef9SDimitry Andric     return IsUse;
1264480093f4SDimitry Andric   }
1265480093f4SDimitry Andric 
12665ffd83dbSDimitry Andric   // If the instruction is already explicitly predicated, then the conversion
1267e8d8bef9SDimitry Andric   // will be fine, but ensure that all store operations are predicated.
1268fe6060f1SDimitry Andric   if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1269e8d8bef9SDimitry Andric     return IsUse;
1270e8d8bef9SDimitry Andric 
1271e8d8bef9SDimitry Andric   // If this instruction defines the VPR, update the predicate for the
1272e8d8bef9SDimitry Andric   // proceeding instructions.
1273e8d8bef9SDimitry Andric   if (isVectorPredicate(MI)) {
1274e8d8bef9SDimitry Andric     // Clear the existing predicate when we're not in VPT Active state,
1275e8d8bef9SDimitry Andric     // otherwise we add to it.
1276e8d8bef9SDimitry Andric     if (!isVectorPredicated(MI))
1277*0fca6ea1SDimitry Andric       VPTstate.resetPredicate(MI);
1278e8d8bef9SDimitry Andric     else
1279*0fca6ea1SDimitry Andric       VPTstate.addPredicate(MI);
1280e8d8bef9SDimitry Andric   }
1281e8d8bef9SDimitry Andric 
1282e8d8bef9SDimitry Andric   // Finally once the predicate has been modified, we can start a new VPT
1283e8d8bef9SDimitry Andric   // block if necessary.
1284e8d8bef9SDimitry Andric   if (isVPTOpcode(MI->getOpcode()))
1285*0fca6ea1SDimitry Andric     VPTstate.CreateVPTBlock(MI);
1286e8d8bef9SDimitry Andric 
1287e8d8bef9SDimitry Andric   return true;
1288480093f4SDimitry Andric }
1289480093f4SDimitry Andric 
runOnMachineFunction(MachineFunction & mf)12908bcb0991SDimitry Andric bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
129181ad6265SDimitry Andric   const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
12928bcb0991SDimitry Andric   if (!ST.hasLOB())
12930b57cec5SDimitry Andric     return false;
12940b57cec5SDimitry Andric 
12958bcb0991SDimitry Andric   MF = &mf;
12968bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
12970b57cec5SDimitry Andric 
1298*0fca6ea1SDimitry Andric   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
1299480093f4SDimitry Andric   RDA = &getAnalysis<ReachingDefAnalysis>();
13008bcb0991SDimitry Andric   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
13018bcb0991SDimitry Andric   MRI = &MF->getRegInfo();
13028bcb0991SDimitry Andric   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1303480093f4SDimitry Andric   TRI = ST.getRegisterInfo();
1304*0fca6ea1SDimitry Andric   BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);
13050b57cec5SDimitry Andric   BBUtils->computeAllBlockSizes();
13068bcb0991SDimitry Andric   BBUtils->adjustBBOffsetsAfter(&MF->front());
13070b57cec5SDimitry Andric 
13080b57cec5SDimitry Andric   bool Changed = false;
1309bdd1243dSDimitry Andric   for (auto *ML : *MLI) {
1310e8d8bef9SDimitry Andric     if (ML->isOutermost())
13110b57cec5SDimitry Andric       Changed |= ProcessLoop(ML);
13120b57cec5SDimitry Andric   }
13138bcb0991SDimitry Andric   Changed |= RevertNonLoops();
13140b57cec5SDimitry Andric   return Changed;
13150b57cec5SDimitry Andric }
13160b57cec5SDimitry Andric 
ProcessLoop(MachineLoop * ML)13170b57cec5SDimitry Andric bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
13180b57cec5SDimitry Andric   bool Changed = false;
13190b57cec5SDimitry Andric 
13200b57cec5SDimitry Andric   // Process inner loops first.
13210eae32dcSDimitry Andric   for (MachineLoop *L : *ML)
13220eae32dcSDimitry Andric     Changed |= ProcessLoop(L);
13230b57cec5SDimitry Andric 
1324fe6060f1SDimitry Andric   LLVM_DEBUG({
1325fe6060f1SDimitry Andric     dbgs() << "ARM Loops: Processing loop containing:\n";
1326480093f4SDimitry Andric     if (auto *Preheader = ML->getLoopPreheader())
1327fe6060f1SDimitry Andric       dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1328fe6060f1SDimitry Andric     else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1329fe6060f1SDimitry Andric       dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1330480093f4SDimitry Andric     for (auto *MBB : ML->getBlocks())
1331fe6060f1SDimitry Andric       dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1332fe6060f1SDimitry Andric   });
13330b57cec5SDimitry Andric 
13340b57cec5SDimitry Andric   // Search the given block for a loop start instruction. If one isn't found,
13350b57cec5SDimitry Andric   // and there's only one predecessor block, search that one too.
13360b57cec5SDimitry Andric   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
13378bcb0991SDimitry Andric     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
13380b57cec5SDimitry Andric     for (auto &MI : *MBB) {
1339480093f4SDimitry Andric       if (isLoopStart(MI))
13400b57cec5SDimitry Andric         return &MI;
13410b57cec5SDimitry Andric     }
13420b57cec5SDimitry Andric     if (MBB->pred_size() == 1)
13430b57cec5SDimitry Andric       return SearchForStart(*MBB->pred_begin());
13440b57cec5SDimitry Andric     return nullptr;
13450b57cec5SDimitry Andric   };
13460b57cec5SDimitry Andric 
13475ffd83dbSDimitry Andric   LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
1348480093f4SDimitry Andric   // Search the preheader for the start intrinsic.
13490b57cec5SDimitry Andric   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
13500b57cec5SDimitry Andric   // with potentially multiple set.loop.iterations, so we need to enable this.
13515ffd83dbSDimitry Andric   if (LoLoop.Preheader)
13525ffd83dbSDimitry Andric     LoLoop.Start = SearchForStart(LoLoop.Preheader);
1353480093f4SDimitry Andric   else
1354fe6060f1SDimitry Andric     return Changed;
13550b57cec5SDimitry Andric 
13560b57cec5SDimitry Andric   // Find the low-overhead loop components and decide whether or not to fall
1357480093f4SDimitry Andric   // back to a normal loop. Also look for a vctp instructions and decide
1358480093f4SDimitry Andric   // whether we can convert that predicate using tail predication.
13590b57cec5SDimitry Andric   for (auto *MBB : reverse(ML->getBlocks())) {
13600b57cec5SDimitry Andric     for (auto &MI : *MBB) {
13615ffd83dbSDimitry Andric       if (MI.isDebugValue())
13625ffd83dbSDimitry Andric         continue;
13635ffd83dbSDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopDec)
1364480093f4SDimitry Andric         LoLoop.Dec = &MI;
13650b57cec5SDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopEnd)
1366480093f4SDimitry Andric         LoLoop.End = &MI;
1367e8d8bef9SDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopEndDec)
1368e8d8bef9SDimitry Andric         LoLoop.End = LoLoop.Dec = &MI;
1369480093f4SDimitry Andric       else if (isLoopStart(MI))
1370480093f4SDimitry Andric         LoLoop.Start = &MI;
13718bcb0991SDimitry Andric       else if (MI.getDesc().isCall()) {
13720b57cec5SDimitry Andric         // TODO: Though the call will require LE to execute again, does this
13730b57cec5SDimitry Andric         // mean we should revert? Always executing LE hopefully should be
13740b57cec5SDimitry Andric         // faster than performing a sub,cmp,br or even subs,br.
1375480093f4SDimitry Andric         LoLoop.Revert = true;
13768bcb0991SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1377480093f4SDimitry Andric       } else {
1378480093f4SDimitry Andric         // Record VPR defs and build up their corresponding vpt blocks.
1379480093f4SDimitry Andric         // Check we know how to tail predicate any mve instructions.
1380480093f4SDimitry Andric         LoLoop.AnalyseMVEInst(&MI);
13818bcb0991SDimitry Andric       }
13820b57cec5SDimitry Andric     }
13830b57cec5SDimitry Andric   }
13840b57cec5SDimitry Andric 
1385480093f4SDimitry Andric   LLVM_DEBUG(LoLoop.dump());
1386480093f4SDimitry Andric   if (!LoLoop.FoundAllComponents()) {
1387480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1388fe6060f1SDimitry Andric     return Changed;
13890b57cec5SDimitry Andric   }
13900b57cec5SDimitry Andric 
1391fe6060f1SDimitry Andric   assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1392fe6060f1SDimitry Andric          "Expected t2WhileLoopStart to be removed before regalloc!");
1393fe6060f1SDimitry Andric 
1394e8d8bef9SDimitry Andric   // Check that the only instruction using LoopDec is LoopEnd. This can only
1395e8d8bef9SDimitry Andric   // happen when the Dec and End are separate, not a single t2LoopEndDec.
13965ffd83dbSDimitry Andric   // TODO: Check for copy chains that really have no effect.
1397e8d8bef9SDimitry Andric   if (LoLoop.Dec != LoLoop.End) {
13985ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 2> Uses;
1399e8d8bef9SDimitry Andric     RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
14005ffd83dbSDimitry Andric     if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
14015ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
14025ffd83dbSDimitry Andric       LoLoop.Revert = true;
14035ffd83dbSDimitry Andric     }
1404e8d8bef9SDimitry Andric   }
1405e8d8bef9SDimitry Andric   LoLoop.Validate(BBUtils.get());
1406480093f4SDimitry Andric   Expand(LoLoop);
14070b57cec5SDimitry Andric   return true;
14080b57cec5SDimitry Andric }
14090b57cec5SDimitry Andric 
14100b57cec5SDimitry Andric // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
14110b57cec5SDimitry Andric // beq that branches to the exit branch.
14128bcb0991SDimitry Andric // TODO: We could also try to generate a cbz if the value in LR is also in
14130b57cec5SDimitry Andric // another low register.
RevertWhile(MachineInstr * MI) const14140b57cec5SDimitry Andric void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
14150b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1416fe6060f1SDimitry Andric   MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
14178bcb0991SDimitry Andric   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
14188bcb0991SDimitry Andric     ARM::tBcc : ARM::t2Bcc;
14198bcb0991SDimitry Andric 
1420fe6060f1SDimitry Andric   RevertWhileLoopStartLR(MI, TII, BrOpc);
1421e8d8bef9SDimitry Andric }
1422e8d8bef9SDimitry Andric 
RevertDo(MachineInstr * MI) const1423e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1424e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1425e8d8bef9SDimitry Andric   RevertDoLoopStart(MI, TII);
14260b57cec5SDimitry Andric }
14270b57cec5SDimitry Andric 
RevertLoopDec(MachineInstr * MI) const14285ffd83dbSDimitry Andric bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
14290b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
14300b57cec5SDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
14315ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 1> Ignore;
14325ffd83dbSDimitry Andric   for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
14335ffd83dbSDimitry Andric     if (I->getOpcode() == ARM::t2LoopEnd) {
14345ffd83dbSDimitry Andric       Ignore.insert(&*I);
14355ffd83dbSDimitry Andric       break;
14365ffd83dbSDimitry Andric     }
14375ffd83dbSDimitry Andric   }
14388bcb0991SDimitry Andric 
1439480093f4SDimitry Andric   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1440e8d8bef9SDimitry Andric   bool SetFlags =
1441e8d8bef9SDimitry Andric       RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
14428bcb0991SDimitry Andric 
1443e8d8bef9SDimitry Andric   llvm::RevertLoopDec(MI, TII, SetFlags);
14448bcb0991SDimitry Andric   return SetFlags;
14450b57cec5SDimitry Andric }
14460b57cec5SDimitry Andric 
14470b57cec5SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEnd(MachineInstr * MI,bool SkipCmp) const14488bcb0991SDimitry Andric void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
14490b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
14500b57cec5SDimitry Andric 
14518bcb0991SDimitry Andric   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
14528bcb0991SDimitry Andric   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
14538bcb0991SDimitry Andric     ARM::tBcc : ARM::t2Bcc;
14548bcb0991SDimitry Andric 
1455e8d8bef9SDimitry Andric   llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1456e8d8bef9SDimitry Andric }
1457e8d8bef9SDimitry Andric 
1458e8d8bef9SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEndDec(MachineInstr * MI) const1459e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
1460e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1461e8d8bef9SDimitry Andric   assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1462e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
1463e8d8bef9SDimitry Andric 
14648bcb0991SDimitry Andric   MachineInstrBuilder MIB =
1465e8d8bef9SDimitry Andric       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1466e8d8bef9SDimitry Andric   MIB.addDef(ARM::LR);
1467e8d8bef9SDimitry Andric   MIB.add(MI->getOperand(1));
1468e8d8bef9SDimitry Andric   MIB.addImm(1);
1469e8d8bef9SDimitry Andric   MIB.addImm(ARMCC::AL);
1470e8d8bef9SDimitry Andric   MIB.addReg(ARM::NoRegister);
1471e8d8bef9SDimitry Andric   MIB.addReg(ARM::CPSR);
1472e8d8bef9SDimitry Andric   MIB->getOperand(5).setIsDef(true);
1473e8d8bef9SDimitry Andric 
1474e8d8bef9SDimitry Andric   MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
1475e8d8bef9SDimitry Andric   unsigned BrOpc =
1476e8d8bef9SDimitry Andric       BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
1477e8d8bef9SDimitry Andric 
1478e8d8bef9SDimitry Andric   // Create bne
1479e8d8bef9SDimitry Andric   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1480e8d8bef9SDimitry Andric   MIB.add(MI->getOperand(2)); // branch target
14810b57cec5SDimitry Andric   MIB.addImm(ARMCC::NE);      // condition code
14820b57cec5SDimitry Andric   MIB.addReg(ARM::CPSR);
1483e8d8bef9SDimitry Andric 
14840b57cec5SDimitry Andric   MI->eraseFromParent();
14850b57cec5SDimitry Andric }
14860b57cec5SDimitry Andric 
14875ffd83dbSDimitry Andric // Perform dead code elimation on the loop iteration count setup expression.
14885ffd83dbSDimitry Andric // If we are tail-predicating, the number of elements to be processed is the
14895ffd83dbSDimitry Andric // operand of the VCTP instruction in the vector body, see getCount(), which is
14905ffd83dbSDimitry Andric // register $r3 in this example:
14915ffd83dbSDimitry Andric //
14925ffd83dbSDimitry Andric //   $lr = big-itercount-expression
14935ffd83dbSDimitry Andric //   ..
1494e8d8bef9SDimitry Andric //   $lr = t2DoLoopStart renamable $lr
14955ffd83dbSDimitry Andric //   vector.body:
14965ffd83dbSDimitry Andric //     ..
14975ffd83dbSDimitry Andric //     $vpr = MVE_VCTP32 renamable $r3
14985ffd83dbSDimitry Andric //     renamable $lr = t2LoopDec killed renamable $lr, 1
14995ffd83dbSDimitry Andric //     t2LoopEnd renamable $lr, %vector.body
15005ffd83dbSDimitry Andric //     tB %end
15015ffd83dbSDimitry Andric //
15025ffd83dbSDimitry Andric // What we would like achieve here is to replace the do-loop start pseudo
15035ffd83dbSDimitry Andric // instruction t2DoLoopStart with:
15045ffd83dbSDimitry Andric //
15055ffd83dbSDimitry Andric //    $lr = MVE_DLSTP_32 killed renamable $r3
15065ffd83dbSDimitry Andric //
15075ffd83dbSDimitry Andric // Thus, $r3 which defines the number of elements, is written to $lr,
15085ffd83dbSDimitry Andric // and then we want to delete the whole chain that used to define $lr,
15095ffd83dbSDimitry Andric // see the comment below how this chain could look like.
15105ffd83dbSDimitry Andric //
IterationCountDCE(LowOverheadLoop & LoLoop)15115ffd83dbSDimitry Andric void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
15125ffd83dbSDimitry Andric   if (!LoLoop.IsTailPredicationLegal())
15135ffd83dbSDimitry Andric     return;
15145ffd83dbSDimitry Andric 
15155ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
15165ffd83dbSDimitry Andric 
1517fe6060f1SDimitry Andric   MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
15185ffd83dbSDimitry Andric   if (!Def) {
15195ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
15205ffd83dbSDimitry Andric     return;
15215ffd83dbSDimitry Andric   }
15225ffd83dbSDimitry Andric 
15235ffd83dbSDimitry Andric   // Collect and remove the users of iteration count.
15245ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 4> Killed  = { LoLoop.Start, LoLoop.Dec,
1525e8d8bef9SDimitry Andric                                             LoLoop.End };
1526e8d8bef9SDimitry Andric   if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
15275ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
15285ffd83dbSDimitry Andric }
15295ffd83dbSDimitry Andric 
ExpandLoopStart(LowOverheadLoop & LoLoop)1530480093f4SDimitry Andric MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
15315ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
15325ffd83dbSDimitry Andric   // When using tail-predication, try to delete the dead code that was used to
15335ffd83dbSDimitry Andric   // calculate the number of loop iterations.
15345ffd83dbSDimitry Andric   IterationCountDCE(LoLoop);
15355ffd83dbSDimitry Andric 
1536e8d8bef9SDimitry Andric   MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1537480093f4SDimitry Andric   MachineInstr *Start = LoLoop.Start;
1538e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1539480093f4SDimitry Andric   unsigned Opc = LoLoop.getStartOpcode();
1540e8d8bef9SDimitry Andric   MachineOperand &Count = LoLoop.getLoopStartOperand();
1541480093f4SDimitry Andric 
1542fe6060f1SDimitry Andric   // A DLS lr, lr we needn't emit
1543fe6060f1SDimitry Andric   MachineInstr* NewStart;
1544bdd1243dSDimitry Andric   if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
1545bdd1243dSDimitry Andric       Count.getReg() == ARM::LR) {
1546fe6060f1SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1547fe6060f1SDimitry Andric     NewStart = nullptr;
1548fe6060f1SDimitry Andric   } else {
15490b57cec5SDimitry Andric     MachineInstrBuilder MIB =
1550e8d8bef9SDimitry Andric       BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
15510b57cec5SDimitry Andric 
15520b57cec5SDimitry Andric     MIB.addDef(ARM::LR);
1553480093f4SDimitry Andric     MIB.add(Count);
1554fe6060f1SDimitry Andric     if (isWhileLoopStart(*Start))
1555fe6060f1SDimitry Andric       MIB.addMBB(getWhileLoopStartTargetBB(*Start));
1556fe6060f1SDimitry Andric 
1557fe6060f1SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1558fe6060f1SDimitry Andric     NewStart = &*MIB;
1559fe6060f1SDimitry Andric   }
15600b57cec5SDimitry Andric 
15615ffd83dbSDimitry Andric   LoLoop.ToRemove.insert(Start);
1562fe6060f1SDimitry Andric   return NewStart;
1563480093f4SDimitry Andric }
1564480093f4SDimitry Andric 
ConvertVPTBlocks(LowOverheadLoop & LoLoop)1565480093f4SDimitry Andric void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1566480093f4SDimitry Andric   auto RemovePredicate = [](MachineInstr *MI) {
15674652422eSDimitry Andric     if (MI->isDebugInstr())
15684652422eSDimitry Andric       return;
1569480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
15704652422eSDimitry Andric     int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
15714652422eSDimitry Andric     assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1572480093f4SDimitry Andric     assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1573480093f4SDimitry Andric            "Expected Then predicate!");
1574480093f4SDimitry Andric     MI->getOperand(PIdx).setImm(ARMVCC::None);
1575480093f4SDimitry Andric     MI->getOperand(PIdx + 1).setReg(0);
15760b57cec5SDimitry Andric   };
15770b57cec5SDimitry Andric 
1578480093f4SDimitry Andric   for (auto &Block : LoLoop.getVPTBlocks()) {
1579e8d8bef9SDimitry Andric     SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1580e8d8bef9SDimitry Andric 
1581e8d8bef9SDimitry Andric     auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1582e8d8bef9SDimitry Andric       assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1583e8d8bef9SDimitry Andric       // Replace the VCMP with a VPT
1584e8d8bef9SDimitry Andric       MachineInstrBuilder MIB =
1585e8d8bef9SDimitry Andric           BuildMI(*At->getParent(), At, At->getDebugLoc(),
1586e8d8bef9SDimitry Andric                   TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1587e8d8bef9SDimitry Andric       MIB.addImm(ARMVCC::Then);
1588e8d8bef9SDimitry Andric       // Register one
1589e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(1));
1590e8d8bef9SDimitry Andric       // Register two
1591e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(2));
1592e8d8bef9SDimitry Andric       // The comparison code, e.g. ge, eq, lt
1593e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(3));
1594e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1595e8d8bef9SDimitry Andric       LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1596e8d8bef9SDimitry Andric       LoLoop.ToRemove.insert(TheVCMP);
1597e8d8bef9SDimitry Andric       TheVCMP = nullptr;
1598e8d8bef9SDimitry Andric     };
1599e8d8bef9SDimitry Andric 
1600*0fca6ea1SDimitry Andric     if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1601e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1602*0fca6ea1SDimitry Andric       if (Block.hasUniformPredicate()) {
1603e8d8bef9SDimitry Andric         // A vpt block starting with VPST, is only predicated upon vctp and has no
1604e8d8bef9SDimitry Andric         // internal vpr defs:
1605e8d8bef9SDimitry Andric         // - Remove vpst.
1606e8d8bef9SDimitry Andric         // - Unpredicate the remaining instructions.
1607e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1608e8d8bef9SDimitry Andric         for (unsigned i = 1; i < Insts.size(); ++i)
1609e8d8bef9SDimitry Andric           RemovePredicate(Insts[i]);
1610e8d8bef9SDimitry Andric       } else {
16115ffd83dbSDimitry Andric         // The VPT block has a non-uniform predicate but it uses a vpst and its
16125ffd83dbSDimitry Andric         // entry is guarded only by a vctp, which means we:
1613480093f4SDimitry Andric         // - Need to remove the original vpst.
1614480093f4SDimitry Andric         // - Then need to unpredicate any following instructions, until
1615480093f4SDimitry Andric         //   we come across the divergent vpr def.
1616480093f4SDimitry Andric         // - Insert a new vpst to predicate the instruction(s) that following
1617480093f4SDimitry Andric         //   the divergent vpr def.
1618*0fca6ea1SDimitry Andric         MachineInstr *Divergent = Block.getDivergent();
16194652422eSDimitry Andric         MachineBasicBlock *MBB = Divergent->getParent();
1620e8d8bef9SDimitry Andric         auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
16214652422eSDimitry Andric         while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
16224652422eSDimitry Andric           ++DivergentNext;
16234652422eSDimitry Andric 
1624e8d8bef9SDimitry Andric         bool DivergentNextIsPredicated =
16254652422eSDimitry Andric             DivergentNext != MBB->end() &&
1626e8d8bef9SDimitry Andric             getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
1627e8d8bef9SDimitry Andric 
1628e8d8bef9SDimitry Andric         for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
1629e8d8bef9SDimitry Andric              I != E; ++I)
1630480093f4SDimitry Andric           RemovePredicate(&*I);
1631480093f4SDimitry Andric 
1632e8d8bef9SDimitry Andric         // Check if the instruction defining vpr is a vcmp so it can be combined
1633e8d8bef9SDimitry Andric         // with the VPST This should be the divergent instruction
1634e8d8bef9SDimitry Andric         MachineInstr *VCMP =
1635e8d8bef9SDimitry Andric             VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1636e8d8bef9SDimitry Andric 
1637e8d8bef9SDimitry Andric         if (DivergentNextIsPredicated) {
1638e8d8bef9SDimitry Andric           // Insert a VPST at the divergent only if the next instruction
1639e8d8bef9SDimitry Andric           // would actually use it. A VCMP following a VPST can be
1640e8d8bef9SDimitry Andric           // merged into a VPT so do that instead if the VCMP exists.
1641e8d8bef9SDimitry Andric           if (!VCMP) {
1642e8d8bef9SDimitry Andric             // Create a VPST (with a null mask for now, we'll recompute it
1643e8d8bef9SDimitry Andric             // later)
1644e8d8bef9SDimitry Andric             MachineInstrBuilder MIB =
1645e8d8bef9SDimitry Andric                 BuildMI(*Divergent->getParent(), Divergent,
1646e8d8bef9SDimitry Andric                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
16475ffd83dbSDimitry Andric             MIB.addImm(0);
1648480093f4SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
16495ffd83dbSDimitry Andric             LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1650e8d8bef9SDimitry Andric           } else {
1651e8d8bef9SDimitry Andric             // No RDA checks are necessary here since the VPST would have been
1652e8d8bef9SDimitry Andric             // directly after the VCMP
1653e8d8bef9SDimitry Andric             ReplaceVCMPWithVPT(VCMP, VCMP);
16545ffd83dbSDimitry Andric           }
16555ffd83dbSDimitry Andric         }
16565ffd83dbSDimitry Andric       }
1657e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1658e8d8bef9SDimitry Andric       LoLoop.ToRemove.insert(VPST);
1659e8d8bef9SDimitry Andric     } else if (Block.containsVCTP()) {
1660e8d8bef9SDimitry Andric       // The vctp will be removed, so either the entire block will be dead or
1661e8d8bef9SDimitry Andric       // the block mask of the vp(s)t will need to be recomputed.
1662e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1663e8d8bef9SDimitry Andric       if (Block.size() == 2) {
1664e8d8bef9SDimitry Andric         assert(VPST->getOpcode() == ARM::MVE_VPST &&
1665e8d8bef9SDimitry Andric                "Found a VPST in an otherwise empty vpt block");
1666e8d8bef9SDimitry Andric         LoLoop.ToRemove.insert(VPST);
1667e8d8bef9SDimitry Andric       } else
1668e8d8bef9SDimitry Andric         LoLoop.BlockMasksToRecompute.insert(VPST);
1669e8d8bef9SDimitry Andric     } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1670e8d8bef9SDimitry Andric       // If this block starts with a VPST then attempt to merge it with the
1671e8d8bef9SDimitry Andric       // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1672e8d8bef9SDimitry Andric       // block that no longer exists
1673e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1674e8d8bef9SDimitry Andric       auto Next = ++MachineBasicBlock::iterator(VPST);
1675e8d8bef9SDimitry Andric       assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1676e8d8bef9SDimitry Andric              "The instruction after a VPST must be predicated");
1677e8d8bef9SDimitry Andric       (void)Next;
1678e8d8bef9SDimitry Andric       MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1679e8d8bef9SDimitry Andric       if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1680e8d8bef9SDimitry Andric           !LoLoop.ToRemove.contains(VprDef)) {
1681e8d8bef9SDimitry Andric         MachineInstr *VCMP = VprDef;
1682e8d8bef9SDimitry Andric         // The VCMP and VPST can only be merged if the VCMP's operands will have
1683e8d8bef9SDimitry Andric         // the same values at the VPST.
1684e8d8bef9SDimitry Andric         // If any of the instructions between the VCMP and VPST are predicated
1685e8d8bef9SDimitry Andric         // then a different code path is expected to have merged the VCMP and
1686e8d8bef9SDimitry Andric         // VPST already.
16870eae32dcSDimitry Andric         if (std::none_of(++MachineBasicBlock::iterator(VCMP),
1688e8d8bef9SDimitry Andric                          MachineBasicBlock::iterator(VPST), hasVPRUse) &&
1689e8d8bef9SDimitry Andric             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
1690e8d8bef9SDimitry Andric             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1691e8d8bef9SDimitry Andric           ReplaceVCMPWithVPT(VCMP, VPST);
1692e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1693e8d8bef9SDimitry Andric           LoLoop.ToRemove.insert(VPST);
1694480093f4SDimitry Andric         }
1695480093f4SDimitry Andric       }
16965ffd83dbSDimitry Andric     }
16975ffd83dbSDimitry Andric   }
1698e8d8bef9SDimitry Andric 
1699e8d8bef9SDimitry Andric   LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
1700480093f4SDimitry Andric }
1701480093f4SDimitry Andric 
Expand(LowOverheadLoop & LoLoop)1702480093f4SDimitry Andric void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1703480093f4SDimitry Andric 
17040b57cec5SDimitry Andric   // Combine the LoopDec and LoopEnd instructions into LE(TP).
1705480093f4SDimitry Andric   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1706480093f4SDimitry Andric     MachineInstr *End = LoLoop.End;
17070b57cec5SDimitry Andric     MachineBasicBlock *MBB = End->getParent();
1708480093f4SDimitry Andric     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1709480093f4SDimitry Andric       ARM::MVE_LETP : ARM::t2LEUpdate;
17100b57cec5SDimitry Andric     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1711480093f4SDimitry Andric                                       TII->get(Opc));
17120b57cec5SDimitry Andric     MIB.addDef(ARM::LR);
1713e8d8bef9SDimitry Andric     unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
1714e8d8bef9SDimitry Andric     MIB.add(End->getOperand(Off + 0));
1715e8d8bef9SDimitry Andric     MIB.add(End->getOperand(Off + 1));
17160b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
17175ffd83dbSDimitry Andric     LoLoop.ToRemove.insert(LoLoop.Dec);
17185ffd83dbSDimitry Andric     LoLoop.ToRemove.insert(End);
17190b57cec5SDimitry Andric     return &*MIB;
17200b57cec5SDimitry Andric   };
17210b57cec5SDimitry Andric 
17220b57cec5SDimitry Andric   // TODO: We should be able to automatically remove these branches before we
17230b57cec5SDimitry Andric   // get here - probably by teaching analyzeBranch about the pseudo
17240b57cec5SDimitry Andric   // instructions.
17250b57cec5SDimitry Andric   // If there is an unconditional branch, after I, that just branches to the
17260b57cec5SDimitry Andric   // next block, remove it.
17270b57cec5SDimitry Andric   auto RemoveDeadBranch = [](MachineInstr *I) {
17280b57cec5SDimitry Andric     MachineBasicBlock *BB = I->getParent();
17290b57cec5SDimitry Andric     MachineInstr *Terminator = &BB->instr_back();
17300b57cec5SDimitry Andric     if (Terminator->isUnconditionalBranch() && I != Terminator) {
17310b57cec5SDimitry Andric       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
17320b57cec5SDimitry Andric       if (BB->isLayoutSuccessor(Succ)) {
17330b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
17340b57cec5SDimitry Andric         Terminator->eraseFromParent();
17350b57cec5SDimitry Andric       }
17360b57cec5SDimitry Andric     }
17370b57cec5SDimitry Andric   };
17380b57cec5SDimitry Andric 
1739349cc55cSDimitry Andric   // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1740349cc55cSDimitry Andric   // Anything other MQPRCopy can be converted to MVE_VORR later on.
1741349cc55cSDimitry Andric   auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
1742349cc55cSDimitry Andric     for (auto *MI : VMOVCopies) {
1743349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1744349cc55cSDimitry Andric       assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1745349cc55cSDimitry Andric       MachineBasicBlock *MBB = MI->getParent();
1746349cc55cSDimitry Andric       Register Dst = MI->getOperand(0).getReg();
1747349cc55cSDimitry Andric       Register Src = MI->getOperand(1).getReg();
1748349cc55cSDimitry Andric       auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1749349cc55cSDimitry Andric                           ARM::D0 + (Dst - ARM::Q0) * 2)
1750349cc55cSDimitry Andric                       .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
1751349cc55cSDimitry Andric                       .add(predOps(ARMCC::AL));
1752349cc55cSDimitry Andric       (void)MIB1;
1753349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << " into " << *MIB1);
1754349cc55cSDimitry Andric       auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1755349cc55cSDimitry Andric                           ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
1756349cc55cSDimitry Andric                       .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
1757349cc55cSDimitry Andric                       .add(predOps(ARMCC::AL));
1758349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << " and  " << *MIB2);
1759349cc55cSDimitry Andric       (void)MIB2;
1760349cc55cSDimitry Andric       MI->eraseFromParent();
1761349cc55cSDimitry Andric     }
1762349cc55cSDimitry Andric   };
1763349cc55cSDimitry Andric 
1764480093f4SDimitry Andric   if (LoLoop.Revert) {
1765fe6060f1SDimitry Andric     if (isWhileLoopStart(*LoLoop.Start))
1766480093f4SDimitry Andric       RevertWhile(LoLoop.Start);
17670b57cec5SDimitry Andric     else
1768e8d8bef9SDimitry Andric       RevertDo(LoLoop.Start);
1769e8d8bef9SDimitry Andric     if (LoLoop.Dec == LoLoop.End)
1770e8d8bef9SDimitry Andric       RevertLoopEndDec(LoLoop.End);
1771e8d8bef9SDimitry Andric     else
1772e8d8bef9SDimitry Andric       RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
17730b57cec5SDimitry Andric   } else {
1774349cc55cSDimitry Andric     ExpandVMOVCopies(LoLoop.VMOVCopies);
1775480093f4SDimitry Andric     LoLoop.Start = ExpandLoopStart(LoLoop);
1776fe6060f1SDimitry Andric     if (LoLoop.Start)
1777480093f4SDimitry Andric       RemoveDeadBranch(LoLoop.Start);
1778480093f4SDimitry Andric     LoLoop.End = ExpandLoopEnd(LoLoop);
1779480093f4SDimitry Andric     RemoveDeadBranch(LoLoop.End);
1780e8d8bef9SDimitry Andric     if (LoLoop.IsTailPredicationLegal())
1781480093f4SDimitry Andric       ConvertVPTBlocks(LoLoop);
17825ffd83dbSDimitry Andric     for (auto *I : LoLoop.ToRemove) {
17835ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
17845ffd83dbSDimitry Andric       I->eraseFromParent();
17855ffd83dbSDimitry Andric     }
17865ffd83dbSDimitry Andric     for (auto *I : LoLoop.BlockMasksToRecompute) {
17875ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
17885ffd83dbSDimitry Andric       recomputeVPTBlockMask(*I);
17895ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "           ... done: " << *I);
1790480093f4SDimitry Andric     }
17910b57cec5SDimitry Andric   }
17925ffd83dbSDimitry Andric 
17935ffd83dbSDimitry Andric   PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
17945ffd83dbSDimitry Andric   DFS.ProcessLoop();
17955ffd83dbSDimitry Andric   const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1796*0fca6ea1SDimitry Andric   fullyRecomputeLiveIns(PostOrder);
17975ffd83dbSDimitry Andric 
17985ffd83dbSDimitry Andric   for (auto *MBB : reverse(PostOrder))
17995ffd83dbSDimitry Andric     recomputeLivenessFlags(*MBB);
18005ffd83dbSDimitry Andric 
18015ffd83dbSDimitry Andric   // We've moved, removed and inserted new instructions, so update RDA.
18025ffd83dbSDimitry Andric   RDA->reset();
18030b57cec5SDimitry Andric }
18040b57cec5SDimitry Andric 
RevertNonLoops()18058bcb0991SDimitry Andric bool ARMLowOverheadLoops::RevertNonLoops() {
18068bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
18078bcb0991SDimitry Andric   bool Changed = false;
18088bcb0991SDimitry Andric 
18098bcb0991SDimitry Andric   for (auto &MBB : *MF) {
18108bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Starts;
18118bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Decs;
18128bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Ends;
1813e8d8bef9SDimitry Andric     SmallVector<MachineInstr *, 4> EndDecs;
1814349cc55cSDimitry Andric     SmallVector<MachineInstr *, 4> MQPRCopies;
18158bcb0991SDimitry Andric 
18168bcb0991SDimitry Andric     for (auto &I : MBB) {
1817480093f4SDimitry Andric       if (isLoopStart(I))
18188bcb0991SDimitry Andric         Starts.push_back(&I);
18198bcb0991SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopDec)
18208bcb0991SDimitry Andric         Decs.push_back(&I);
18218bcb0991SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopEnd)
18228bcb0991SDimitry Andric         Ends.push_back(&I);
1823e8d8bef9SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopEndDec)
1824e8d8bef9SDimitry Andric         EndDecs.push_back(&I);
1825349cc55cSDimitry Andric       else if (I.getOpcode() == ARM::MQPRCopy)
1826349cc55cSDimitry Andric         MQPRCopies.push_back(&I);
18278bcb0991SDimitry Andric     }
18288bcb0991SDimitry Andric 
1829349cc55cSDimitry Andric     if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1830349cc55cSDimitry Andric         MQPRCopies.empty())
18318bcb0991SDimitry Andric       continue;
18328bcb0991SDimitry Andric 
18338bcb0991SDimitry Andric     Changed = true;
18348bcb0991SDimitry Andric 
18358bcb0991SDimitry Andric     for (auto *Start : Starts) {
1836fe6060f1SDimitry Andric       if (isWhileLoopStart(*Start))
18378bcb0991SDimitry Andric         RevertWhile(Start);
18388bcb0991SDimitry Andric       else
1839e8d8bef9SDimitry Andric         RevertDo(Start);
18408bcb0991SDimitry Andric     }
18418bcb0991SDimitry Andric     for (auto *Dec : Decs)
18428bcb0991SDimitry Andric       RevertLoopDec(Dec);
18438bcb0991SDimitry Andric 
18448bcb0991SDimitry Andric     for (auto *End : Ends)
18458bcb0991SDimitry Andric       RevertLoopEnd(End);
1846e8d8bef9SDimitry Andric     for (auto *End : EndDecs)
1847e8d8bef9SDimitry Andric       RevertLoopEndDec(End);
1848349cc55cSDimitry Andric     for (auto *MI : MQPRCopies) {
1849349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1850349cc55cSDimitry Andric       assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1851349cc55cSDimitry Andric       MachineBasicBlock *MBB = MI->getParent();
1852349cc55cSDimitry Andric       auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
1853349cc55cSDimitry Andric                          MI->getOperand(0).getReg())
1854349cc55cSDimitry Andric                      .add(MI->getOperand(1))
1855349cc55cSDimitry Andric                      .add(MI->getOperand(1));
1856349cc55cSDimitry Andric       addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
1857349cc55cSDimitry Andric       MI->eraseFromParent();
1858349cc55cSDimitry Andric     }
18598bcb0991SDimitry Andric   }
18608bcb0991SDimitry Andric   return Changed;
18618bcb0991SDimitry Andric }
18628bcb0991SDimitry Andric 
createARMLowOverheadLoopsPass()18630b57cec5SDimitry Andric FunctionPass *llvm::createARMLowOverheadLoopsPass() {
18640b57cec5SDimitry Andric   return new ARMLowOverheadLoops();
18650b57cec5SDimitry Andric }
1866