xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (revision e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)
10b57cec5SDimitry Andric //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
100b57cec5SDimitry Andric /// instructions into machine operations.
110b57cec5SDimitry Andric /// The expectation is that the loop contains three pseudo instructions:
120b57cec5SDimitry Andric /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
130b57cec5SDimitry Andric ///   form should be in the preheader, whereas the while form should be in the
148bcb0991SDimitry Andric ///   preheaders only predecessor.
150b57cec5SDimitry Andric /// - t2LoopDec - placed within in the loop body.
160b57cec5SDimitry Andric /// - t2LoopEnd - the loop latch terminator.
170b57cec5SDimitry Andric ///
18480093f4SDimitry Andric /// In addition to this, we also look for the presence of the VCTP instruction,
19480093f4SDimitry Andric /// which determines whether we can generated the tail-predicated low-overhead
20480093f4SDimitry Andric /// loop form.
21480093f4SDimitry Andric ///
22480093f4SDimitry Andric /// Assumptions and Dependencies:
23480093f4SDimitry Andric /// Low-overhead loops are constructed and executed using a setup instruction:
24480093f4SDimitry Andric /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25480093f4SDimitry Andric /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26480093f4SDimitry Andric /// but fixed polarity: WLS can only branch forwards and LE can only branch
27480093f4SDimitry Andric /// backwards. These restrictions mean that this pass is dependent upon block
28480093f4SDimitry Andric /// layout and block sizes, which is why it's the last pass to run. The same is
29480093f4SDimitry Andric /// true for ConstantIslands, but this pass does not increase the size of the
30480093f4SDimitry Andric /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31480093f4SDimitry Andric /// during the transform and pseudo instructions are replaced by real ones. In
32480093f4SDimitry Andric /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33480093f4SDimitry Andric /// multiple instructions for a single pseudo (see RevertWhile and
34480093f4SDimitry Andric /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
35480093f4SDimitry Andric /// are defined to be as large as this maximum sequence of replacement
36480093f4SDimitry Andric /// instructions.
37480093f4SDimitry Andric ///
385ffd83dbSDimitry Andric /// A note on VPR.P0 (the lane mask):
395ffd83dbSDimitry Andric /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
405ffd83dbSDimitry Andric /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
415ffd83dbSDimitry Andric /// They will simply "and" the result of their calculation with the current
425ffd83dbSDimitry Andric /// value of VPR.P0. You can think of it like this:
435ffd83dbSDimitry Andric /// \verbatim
445ffd83dbSDimitry Andric /// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
455ffd83dbSDimitry Andric ///   VPR.P0 &= Value
465ffd83dbSDimitry Andric /// else
475ffd83dbSDimitry Andric ///   VPR.P0 = Value
485ffd83dbSDimitry Andric /// \endverbatim
495ffd83dbSDimitry Andric /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
505ffd83dbSDimitry Andric /// fall in the "VPT active" case, so we can consider that all VPR writes by
515ffd83dbSDimitry Andric /// one of those instruction is actually a "and".
520b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
530b57cec5SDimitry Andric 
540b57cec5SDimitry Andric #include "ARM.h"
550b57cec5SDimitry Andric #include "ARMBaseInstrInfo.h"
560b57cec5SDimitry Andric #include "ARMBaseRegisterInfo.h"
570b57cec5SDimitry Andric #include "ARMBasicBlockInfo.h"
580b57cec5SDimitry Andric #include "ARMSubtarget.h"
59*e8d8bef9SDimitry Andric #include "MVETailPredUtils.h"
60480093f4SDimitry Andric #include "Thumb2InstrInfo.h"
61480093f4SDimitry Andric #include "llvm/ADT/SetOperations.h"
62480093f4SDimitry Andric #include "llvm/ADT/SmallSet.h"
635ffd83dbSDimitry Andric #include "llvm/CodeGen/LivePhysRegs.h"
640b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
650b57cec5SDimitry Andric #include "llvm/CodeGen/MachineLoopInfo.h"
66480093f4SDimitry Andric #include "llvm/CodeGen/MachineLoopUtils.h"
670b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
68480093f4SDimitry Andric #include "llvm/CodeGen/Passes.h"
69480093f4SDimitry Andric #include "llvm/CodeGen/ReachingDefAnalysis.h"
70480093f4SDimitry Andric #include "llvm/MC/MCInstrDesc.h"
710b57cec5SDimitry Andric 
720b57cec5SDimitry Andric using namespace llvm;
730b57cec5SDimitry Andric 
740b57cec5SDimitry Andric #define DEBUG_TYPE "arm-low-overhead-loops"
750b57cec5SDimitry Andric #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
760b57cec5SDimitry Andric 
77*e8d8bef9SDimitry Andric static cl::opt<bool>
78*e8d8bef9SDimitry Andric DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
79*e8d8bef9SDimitry Andric     cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
80*e8d8bef9SDimitry Andric     cl::init(false));
81*e8d8bef9SDimitry Andric 
82*e8d8bef9SDimitry Andric static bool isVectorPredicated(MachineInstr *MI) {
83*e8d8bef9SDimitry Andric   int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
84*e8d8bef9SDimitry Andric   return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
85*e8d8bef9SDimitry Andric }
86*e8d8bef9SDimitry Andric 
87*e8d8bef9SDimitry Andric static bool isVectorPredicate(MachineInstr *MI) {
88*e8d8bef9SDimitry Andric   return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
89*e8d8bef9SDimitry Andric }
90*e8d8bef9SDimitry Andric 
91*e8d8bef9SDimitry Andric static bool hasVPRUse(MachineInstr &MI) {
92*e8d8bef9SDimitry Andric   return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
93*e8d8bef9SDimitry Andric }
94*e8d8bef9SDimitry Andric 
95*e8d8bef9SDimitry Andric static bool isDomainMVE(MachineInstr *MI) {
96*e8d8bef9SDimitry Andric   uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
97*e8d8bef9SDimitry Andric   return Domain == ARMII::DomainMVE;
98*e8d8bef9SDimitry Andric }
99*e8d8bef9SDimitry Andric 
100*e8d8bef9SDimitry Andric static bool shouldInspect(MachineInstr &MI) {
101*e8d8bef9SDimitry Andric   return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
102*e8d8bef9SDimitry Andric }
103*e8d8bef9SDimitry Andric 
104*e8d8bef9SDimitry Andric static bool isDo(MachineInstr *MI) {
105*e8d8bef9SDimitry Andric   return MI->getOpcode() != ARM::t2WhileLoopStart;
106*e8d8bef9SDimitry Andric }
107*e8d8bef9SDimitry Andric 
1080b57cec5SDimitry Andric namespace {
1090b57cec5SDimitry Andric 
1105ffd83dbSDimitry Andric   using InstSet = SmallPtrSetImpl<MachineInstr *>;
1115ffd83dbSDimitry Andric 
1125ffd83dbSDimitry Andric   class PostOrderLoopTraversal {
1135ffd83dbSDimitry Andric     MachineLoop &ML;
1145ffd83dbSDimitry Andric     MachineLoopInfo &MLI;
1155ffd83dbSDimitry Andric     SmallPtrSet<MachineBasicBlock*, 4> Visited;
1165ffd83dbSDimitry Andric     SmallVector<MachineBasicBlock*, 4> Order;
1175ffd83dbSDimitry Andric 
1185ffd83dbSDimitry Andric   public:
1195ffd83dbSDimitry Andric     PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
1205ffd83dbSDimitry Andric       : ML(ML), MLI(MLI) { }
1215ffd83dbSDimitry Andric 
1225ffd83dbSDimitry Andric     const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
1235ffd83dbSDimitry Andric       return Order;
1245ffd83dbSDimitry Andric     }
1255ffd83dbSDimitry Andric 
1265ffd83dbSDimitry Andric     // Visit all the blocks within the loop, as well as exit blocks and any
1275ffd83dbSDimitry Andric     // blocks properly dominating the header.
1285ffd83dbSDimitry Andric     void ProcessLoop() {
1295ffd83dbSDimitry Andric       std::function<void(MachineBasicBlock*)> Search = [this, &Search]
1305ffd83dbSDimitry Andric         (MachineBasicBlock *MBB) -> void {
1315ffd83dbSDimitry Andric         if (Visited.count(MBB))
1325ffd83dbSDimitry Andric           return;
1335ffd83dbSDimitry Andric 
1345ffd83dbSDimitry Andric         Visited.insert(MBB);
1355ffd83dbSDimitry Andric         for (auto *Succ : MBB->successors()) {
1365ffd83dbSDimitry Andric           if (!ML.contains(Succ))
1375ffd83dbSDimitry Andric             continue;
1385ffd83dbSDimitry Andric           Search(Succ);
1395ffd83dbSDimitry Andric         }
1405ffd83dbSDimitry Andric         Order.push_back(MBB);
1415ffd83dbSDimitry Andric       };
1425ffd83dbSDimitry Andric 
1435ffd83dbSDimitry Andric       // Insert exit blocks.
1445ffd83dbSDimitry Andric       SmallVector<MachineBasicBlock*, 2> ExitBlocks;
1455ffd83dbSDimitry Andric       ML.getExitBlocks(ExitBlocks);
146*e8d8bef9SDimitry Andric       append_range(Order, ExitBlocks);
1475ffd83dbSDimitry Andric 
1485ffd83dbSDimitry Andric       // Then add the loop body.
1495ffd83dbSDimitry Andric       Search(ML.getHeader());
1505ffd83dbSDimitry Andric 
1515ffd83dbSDimitry Andric       // Then try the preheader and its predecessors.
1525ffd83dbSDimitry Andric       std::function<void(MachineBasicBlock*)> GetPredecessor =
1535ffd83dbSDimitry Andric         [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
1545ffd83dbSDimitry Andric         Order.push_back(MBB);
1555ffd83dbSDimitry Andric         if (MBB->pred_size() == 1)
1565ffd83dbSDimitry Andric           GetPredecessor(*MBB->pred_begin());
1575ffd83dbSDimitry Andric       };
1585ffd83dbSDimitry Andric 
1595ffd83dbSDimitry Andric       if (auto *Preheader = ML.getLoopPreheader())
1605ffd83dbSDimitry Andric         GetPredecessor(Preheader);
1615ffd83dbSDimitry Andric       else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
1625ffd83dbSDimitry Andric         GetPredecessor(Preheader);
1635ffd83dbSDimitry Andric     }
1645ffd83dbSDimitry Andric   };
1655ffd83dbSDimitry Andric 
166480093f4SDimitry Andric   struct PredicatedMI {
167480093f4SDimitry Andric     MachineInstr *MI = nullptr;
168480093f4SDimitry Andric     SetVector<MachineInstr*> Predicates;
169480093f4SDimitry Andric 
170480093f4SDimitry Andric   public:
1715ffd83dbSDimitry Andric     PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
1725ffd83dbSDimitry Andric       assert(I && "Instruction must not be null!");
173480093f4SDimitry Andric       Predicates.insert(Preds.begin(), Preds.end());
174480093f4SDimitry Andric     }
175480093f4SDimitry Andric   };
176480093f4SDimitry Andric 
177*e8d8bef9SDimitry Andric   // Represent the current state of the VPR and hold all instances which
178*e8d8bef9SDimitry Andric   // represent a VPT block, which is a list of instructions that begins with a
179*e8d8bef9SDimitry Andric   // VPT/VPST and has a maximum of four proceeding instructions. All
180*e8d8bef9SDimitry Andric   // instructions within the block are predicated upon the vpr and we allow
181*e8d8bef9SDimitry Andric   // instructions to define the vpr within in the block too.
182*e8d8bef9SDimitry Andric   class VPTState {
183*e8d8bef9SDimitry Andric     friend struct LowOverheadLoop;
184*e8d8bef9SDimitry Andric 
185*e8d8bef9SDimitry Andric     SmallVector<MachineInstr *, 4> Insts;
186*e8d8bef9SDimitry Andric 
187*e8d8bef9SDimitry Andric     static SmallVector<VPTState, 4> Blocks;
188*e8d8bef9SDimitry Andric     static SetVector<MachineInstr *> CurrentPredicates;
189*e8d8bef9SDimitry Andric     static std::map<MachineInstr *,
190*e8d8bef9SDimitry Andric       std::unique_ptr<PredicatedMI>> PredicatedInsts;
191*e8d8bef9SDimitry Andric 
192*e8d8bef9SDimitry Andric     static void CreateVPTBlock(MachineInstr *MI) {
193*e8d8bef9SDimitry Andric       assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
194*e8d8bef9SDimitry Andric              && "Can't begin VPT without predicate");
195*e8d8bef9SDimitry Andric       Blocks.emplace_back(MI);
196*e8d8bef9SDimitry Andric       // The execution of MI is predicated upon the current set of instructions
197*e8d8bef9SDimitry Andric       // that are AND'ed together to form the VPR predicate value. In the case
198*e8d8bef9SDimitry Andric       // that MI is a VPT, CurrentPredicates will also just be MI.
199*e8d8bef9SDimitry Andric       PredicatedInsts.emplace(
200*e8d8bef9SDimitry Andric         MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
201*e8d8bef9SDimitry Andric     }
202*e8d8bef9SDimitry Andric 
203*e8d8bef9SDimitry Andric     static void reset() {
204*e8d8bef9SDimitry Andric       Blocks.clear();
205*e8d8bef9SDimitry Andric       PredicatedInsts.clear();
206*e8d8bef9SDimitry Andric       CurrentPredicates.clear();
207*e8d8bef9SDimitry Andric     }
208*e8d8bef9SDimitry Andric 
209*e8d8bef9SDimitry Andric     static void addInst(MachineInstr *MI) {
210*e8d8bef9SDimitry Andric       Blocks.back().insert(MI);
211*e8d8bef9SDimitry Andric       PredicatedInsts.emplace(
212*e8d8bef9SDimitry Andric         MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
213*e8d8bef9SDimitry Andric     }
214*e8d8bef9SDimitry Andric 
215*e8d8bef9SDimitry Andric     static void addPredicate(MachineInstr *MI) {
216*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
217*e8d8bef9SDimitry Andric       CurrentPredicates.insert(MI);
218*e8d8bef9SDimitry Andric     }
219*e8d8bef9SDimitry Andric 
220*e8d8bef9SDimitry Andric     static void resetPredicate(MachineInstr *MI) {
221*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
222*e8d8bef9SDimitry Andric       CurrentPredicates.clear();
223*e8d8bef9SDimitry Andric       CurrentPredicates.insert(MI);
224*e8d8bef9SDimitry Andric     }
225480093f4SDimitry Andric 
226480093f4SDimitry Andric   public:
227480093f4SDimitry Andric     // Have we found an instruction within the block which defines the vpr? If
228480093f4SDimitry Andric     // so, not all the instructions in the block will have the same predicate.
229*e8d8bef9SDimitry Andric     static bool hasUniformPredicate(VPTState &Block) {
230*e8d8bef9SDimitry Andric       return getDivergent(Block) == nullptr;
231480093f4SDimitry Andric     }
232480093f4SDimitry Andric 
233*e8d8bef9SDimitry Andric     // If it exists, return the first internal instruction which modifies the
234*e8d8bef9SDimitry Andric     // VPR.
235*e8d8bef9SDimitry Andric     static MachineInstr *getDivergent(VPTState &Block) {
236*e8d8bef9SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
237*e8d8bef9SDimitry Andric       for (unsigned i = 1; i < Insts.size(); ++i) {
238*e8d8bef9SDimitry Andric         MachineInstr *Next = Insts[i];
239*e8d8bef9SDimitry Andric         if (isVectorPredicate(Next))
240*e8d8bef9SDimitry Andric           return Next; // Found an instruction altering the vpr.
241*e8d8bef9SDimitry Andric       }
242*e8d8bef9SDimitry Andric       return nullptr;
2435ffd83dbSDimitry Andric     }
2445ffd83dbSDimitry Andric 
245*e8d8bef9SDimitry Andric     // Return whether the given instruction is predicated upon a VCTP.
246*e8d8bef9SDimitry Andric     static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
247*e8d8bef9SDimitry Andric       SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
248*e8d8bef9SDimitry Andric       if (Exclusive && Predicates.size() != 1)
249*e8d8bef9SDimitry Andric         return false;
250*e8d8bef9SDimitry Andric       for (auto *PredMI : Predicates)
251*e8d8bef9SDimitry Andric         if (isVCTP(PredMI))
252*e8d8bef9SDimitry Andric           return true;
253*e8d8bef9SDimitry Andric       return false;
254480093f4SDimitry Andric     }
255480093f4SDimitry Andric 
256*e8d8bef9SDimitry Andric     // Is the VPST, controlling the block entry, predicated upon a VCTP.
257*e8d8bef9SDimitry Andric     static bool isEntryPredicatedOnVCTP(VPTState &Block,
258*e8d8bef9SDimitry Andric                                         bool Exclusive = false) {
259*e8d8bef9SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
260*e8d8bef9SDimitry Andric       return isPredicatedOnVCTP(Insts.front(), Exclusive);
261*e8d8bef9SDimitry Andric     }
262*e8d8bef9SDimitry Andric 
263*e8d8bef9SDimitry Andric     // If this block begins with a VPT, we can check whether it's using
264*e8d8bef9SDimitry Andric     // at least one predicated input(s), as well as possible loop invariant
265*e8d8bef9SDimitry Andric     // which would result in it being implicitly predicated.
266*e8d8bef9SDimitry Andric     static bool hasImplicitlyValidVPT(VPTState &Block,
267*e8d8bef9SDimitry Andric                                       ReachingDefAnalysis &RDA) {
268*e8d8bef9SDimitry Andric       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
269*e8d8bef9SDimitry Andric       MachineInstr *VPT = Insts.front();
270*e8d8bef9SDimitry Andric       assert(isVPTOpcode(VPT->getOpcode()) &&
271*e8d8bef9SDimitry Andric              "Expected VPT block to begin with VPT/VPST");
272*e8d8bef9SDimitry Andric 
273*e8d8bef9SDimitry Andric       if (VPT->getOpcode() == ARM::MVE_VPST)
274*e8d8bef9SDimitry Andric         return false;
275*e8d8bef9SDimitry Andric 
276*e8d8bef9SDimitry Andric       auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
277*e8d8bef9SDimitry Andric         MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
278*e8d8bef9SDimitry Andric         return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
279*e8d8bef9SDimitry Andric       };
280*e8d8bef9SDimitry Andric 
281*e8d8bef9SDimitry Andric       auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
282*e8d8bef9SDimitry Andric         MachineOperand &MO = MI->getOperand(Idx);
283*e8d8bef9SDimitry Andric         if (!MO.isReg() || !MO.getReg())
284*e8d8bef9SDimitry Andric           return true;
285*e8d8bef9SDimitry Andric 
286*e8d8bef9SDimitry Andric         SmallPtrSet<MachineInstr *, 2> Defs;
287*e8d8bef9SDimitry Andric         RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
288*e8d8bef9SDimitry Andric         if (Defs.empty())
289*e8d8bef9SDimitry Andric           return true;
290*e8d8bef9SDimitry Andric 
291*e8d8bef9SDimitry Andric         for (auto *Def : Defs)
292*e8d8bef9SDimitry Andric           if (Def->getParent() == VPT->getParent())
293*e8d8bef9SDimitry Andric             return false;
294*e8d8bef9SDimitry Andric         return true;
295*e8d8bef9SDimitry Andric       };
296*e8d8bef9SDimitry Andric 
297*e8d8bef9SDimitry Andric       // Check that at least one of the operands is directly predicated on a
298*e8d8bef9SDimitry Andric       // vctp and allow an invariant value too.
299*e8d8bef9SDimitry Andric       return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
300*e8d8bef9SDimitry Andric              (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
301*e8d8bef9SDimitry Andric              (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
302*e8d8bef9SDimitry Andric     }
303*e8d8bef9SDimitry Andric 
304*e8d8bef9SDimitry Andric     static bool isValid(ReachingDefAnalysis &RDA) {
305*e8d8bef9SDimitry Andric       // All predication within the loop should be based on vctp. If the block
306*e8d8bef9SDimitry Andric       // isn't predicated on entry, check whether the vctp is within the block
307*e8d8bef9SDimitry Andric       // and that all other instructions are then predicated on it.
308*e8d8bef9SDimitry Andric       for (auto &Block : Blocks) {
309*e8d8bef9SDimitry Andric         if (isEntryPredicatedOnVCTP(Block, false) ||
310*e8d8bef9SDimitry Andric             hasImplicitlyValidVPT(Block, RDA))
311*e8d8bef9SDimitry Andric           continue;
312*e8d8bef9SDimitry Andric 
313*e8d8bef9SDimitry Andric         SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
314*e8d8bef9SDimitry Andric         // We don't know how to convert a block with just a VPT;VCTP into
315*e8d8bef9SDimitry Andric         // anything valid once we remove the VCTP. For now just bail out.
316*e8d8bef9SDimitry Andric         assert(isVPTOpcode(Insts.front()->getOpcode()) &&
317*e8d8bef9SDimitry Andric                "Expected VPT block to start with a VPST or VPT!");
318*e8d8bef9SDimitry Andric         if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
319*e8d8bef9SDimitry Andric             isVCTP(Insts.back()))
320*e8d8bef9SDimitry Andric           return false;
321*e8d8bef9SDimitry Andric 
322*e8d8bef9SDimitry Andric         for (auto *MI : Insts) {
323*e8d8bef9SDimitry Andric           // Check that any internal VCTPs are 'Then' predicated.
324*e8d8bef9SDimitry Andric           if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
325*e8d8bef9SDimitry Andric             return false;
326*e8d8bef9SDimitry Andric           // Skip other instructions that build up the predicate.
327*e8d8bef9SDimitry Andric           if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
328*e8d8bef9SDimitry Andric             continue;
329*e8d8bef9SDimitry Andric           // Check that any other instructions are predicated upon a vctp.
330*e8d8bef9SDimitry Andric           // TODO: We could infer when VPTs are implicitly predicated on the
331*e8d8bef9SDimitry Andric           // vctp (when the operands are predicated).
332*e8d8bef9SDimitry Andric           if (!isPredicatedOnVCTP(MI)) {
333*e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
334*e8d8bef9SDimitry Andric             return false;
335*e8d8bef9SDimitry Andric           }
336*e8d8bef9SDimitry Andric         }
337*e8d8bef9SDimitry Andric       }
338*e8d8bef9SDimitry Andric       return true;
339*e8d8bef9SDimitry Andric     }
340*e8d8bef9SDimitry Andric 
341*e8d8bef9SDimitry Andric     VPTState(MachineInstr *MI) { Insts.push_back(MI); }
342*e8d8bef9SDimitry Andric 
343*e8d8bef9SDimitry Andric     void insert(MachineInstr *MI) {
344*e8d8bef9SDimitry Andric       Insts.push_back(MI);
345*e8d8bef9SDimitry Andric       // VPT/VPST + 4 predicated instructions.
346*e8d8bef9SDimitry Andric       assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
347*e8d8bef9SDimitry Andric     }
348*e8d8bef9SDimitry Andric 
349*e8d8bef9SDimitry Andric     bool containsVCTP() const {
350*e8d8bef9SDimitry Andric       for (auto *MI : Insts)
351*e8d8bef9SDimitry Andric         if (isVCTP(MI))
352*e8d8bef9SDimitry Andric           return true;
353*e8d8bef9SDimitry Andric       return false;
354480093f4SDimitry Andric     }
355480093f4SDimitry Andric 
356480093f4SDimitry Andric     unsigned size() const { return Insts.size(); }
357*e8d8bef9SDimitry Andric     SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
3585ffd83dbSDimitry Andric   };
3595ffd83dbSDimitry Andric 
360480093f4SDimitry Andric   struct LowOverheadLoop {
361480093f4SDimitry Andric 
3625ffd83dbSDimitry Andric     MachineLoop &ML;
3635ffd83dbSDimitry Andric     MachineBasicBlock *Preheader = nullptr;
3645ffd83dbSDimitry Andric     MachineLoopInfo &MLI;
3655ffd83dbSDimitry Andric     ReachingDefAnalysis &RDA;
3665ffd83dbSDimitry Andric     const TargetRegisterInfo &TRI;
3675ffd83dbSDimitry Andric     const ARMBaseInstrInfo &TII;
368480093f4SDimitry Andric     MachineFunction *MF = nullptr;
369*e8d8bef9SDimitry Andric     MachineBasicBlock::iterator StartInsertPt;
370*e8d8bef9SDimitry Andric     MachineBasicBlock *StartInsertBB = nullptr;
371480093f4SDimitry Andric     MachineInstr *Start = nullptr;
372480093f4SDimitry Andric     MachineInstr *Dec = nullptr;
373480093f4SDimitry Andric     MachineInstr *End = nullptr;
374*e8d8bef9SDimitry Andric     MachineOperand TPNumElements;
375*e8d8bef9SDimitry Andric     SmallVector<MachineInstr*, 4> VCTPs;
3765ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr*, 4> ToRemove;
3775ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
378480093f4SDimitry Andric     bool Revert = false;
379480093f4SDimitry Andric     bool CannotTailPredicate = false;
380480093f4SDimitry Andric 
3815ffd83dbSDimitry Andric     LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
3825ffd83dbSDimitry Andric                     ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
3835ffd83dbSDimitry Andric                     const ARMBaseInstrInfo &TII)
384*e8d8bef9SDimitry Andric         : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
385*e8d8bef9SDimitry Andric           TPNumElements(MachineOperand::CreateImm(0)) {
3865ffd83dbSDimitry Andric       MF = ML.getHeader()->getParent();
3875ffd83dbSDimitry Andric       if (auto *MBB = ML.getLoopPreheader())
3885ffd83dbSDimitry Andric         Preheader = MBB;
3895ffd83dbSDimitry Andric       else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
3905ffd83dbSDimitry Andric         Preheader = MBB;
391*e8d8bef9SDimitry Andric       VPTState::reset();
392480093f4SDimitry Andric     }
393480093f4SDimitry Andric 
394480093f4SDimitry Andric     // If this is an MVE instruction, check that we know how to use tail
395480093f4SDimitry Andric     // predication with it. Record VPT blocks and return whether the
396480093f4SDimitry Andric     // instruction is valid for tail predication.
397480093f4SDimitry Andric     bool ValidateMVEInst(MachineInstr *MI);
398480093f4SDimitry Andric 
399480093f4SDimitry Andric     void AnalyseMVEInst(MachineInstr *MI) {
400480093f4SDimitry Andric       CannotTailPredicate = !ValidateMVEInst(MI);
401480093f4SDimitry Andric     }
402480093f4SDimitry Andric 
403480093f4SDimitry Andric     bool IsTailPredicationLegal() const {
404480093f4SDimitry Andric       // For now, let's keep things really simple and only support a single
405480093f4SDimitry Andric       // block for tail predication.
406*e8d8bef9SDimitry Andric       return !Revert && FoundAllComponents() && !VCTPs.empty() &&
4075ffd83dbSDimitry Andric              !CannotTailPredicate && ML.getNumBlocks() == 1;
408480093f4SDimitry Andric     }
409480093f4SDimitry Andric 
410*e8d8bef9SDimitry Andric     // Given that MI is a VCTP, check that is equivalent to any other VCTPs
411*e8d8bef9SDimitry Andric     // found.
412*e8d8bef9SDimitry Andric     bool AddVCTP(MachineInstr *MI);
413*e8d8bef9SDimitry Andric 
4145ffd83dbSDimitry Andric     // Check that the predication in the loop will be equivalent once we
4155ffd83dbSDimitry Andric     // perform the conversion. Also ensure that we can provide the number
4165ffd83dbSDimitry Andric     // of elements to the loop start instruction.
417*e8d8bef9SDimitry Andric     bool ValidateTailPredicate();
4185ffd83dbSDimitry Andric 
4195ffd83dbSDimitry Andric     // Check that any values available outside of the loop will be the same
4205ffd83dbSDimitry Andric     // after tail predication conversion.
4215ffd83dbSDimitry Andric     bool ValidateLiveOuts();
422480093f4SDimitry Andric 
423480093f4SDimitry Andric     // Is it safe to define LR with DLS/WLS?
424480093f4SDimitry Andric     // LR can be defined if it is the operand to start, because it's the same
425480093f4SDimitry Andric     // value, or if it's going to be equivalent to the operand to Start.
4265ffd83dbSDimitry Andric     MachineInstr *isSafeToDefineLR();
427480093f4SDimitry Andric 
428480093f4SDimitry Andric     // Check the branch targets are within range and we satisfy our
429480093f4SDimitry Andric     // restrictions.
430*e8d8bef9SDimitry Andric     void Validate(ARMBasicBlockUtils *BBUtils);
431480093f4SDimitry Andric 
432480093f4SDimitry Andric     bool FoundAllComponents() const {
433480093f4SDimitry Andric       return Start && Dec && End;
434480093f4SDimitry Andric     }
435480093f4SDimitry Andric 
436*e8d8bef9SDimitry Andric     SmallVectorImpl<VPTState> &getVPTBlocks() {
437*e8d8bef9SDimitry Andric       return VPTState::Blocks;
438*e8d8bef9SDimitry Andric     }
439480093f4SDimitry Andric 
440*e8d8bef9SDimitry Andric     // Return the operand for the loop start instruction. This will be the loop
441*e8d8bef9SDimitry Andric     // iteration count, or the number of elements if we're tail predicating.
442*e8d8bef9SDimitry Andric     MachineOperand &getLoopStartOperand() {
443*e8d8bef9SDimitry Andric       if (IsTailPredicationLegal())
444*e8d8bef9SDimitry Andric         return TPNumElements;
445*e8d8bef9SDimitry Andric       return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0);
446480093f4SDimitry Andric     }
447480093f4SDimitry Andric 
448480093f4SDimitry Andric     unsigned getStartOpcode() const {
449*e8d8bef9SDimitry Andric       bool IsDo = isDo(Start);
450480093f4SDimitry Andric       if (!IsTailPredicationLegal())
451480093f4SDimitry Andric         return IsDo ? ARM::t2DLS : ARM::t2WLS;
452480093f4SDimitry Andric 
453*e8d8bef9SDimitry Andric       return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
454480093f4SDimitry Andric     }
455480093f4SDimitry Andric 
456480093f4SDimitry Andric     void dump() const {
457480093f4SDimitry Andric       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
458480093f4SDimitry Andric       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
459480093f4SDimitry Andric       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
460*e8d8bef9SDimitry Andric       if (!VCTPs.empty()) {
461*e8d8bef9SDimitry Andric         dbgs() << "ARM Loops: Found VCTP(s):\n";
462*e8d8bef9SDimitry Andric         for (auto *MI : VCTPs)
463*e8d8bef9SDimitry Andric           dbgs() << " - " << *MI;
464*e8d8bef9SDimitry Andric       }
465480093f4SDimitry Andric       if (!FoundAllComponents())
466480093f4SDimitry Andric         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
467480093f4SDimitry Andric       else if (!(Start && Dec && End))
468480093f4SDimitry Andric         dbgs() << "ARM Loops: Failed to find all loop components.\n";
469480093f4SDimitry Andric     }
470480093f4SDimitry Andric   };
471480093f4SDimitry Andric 
4720b57cec5SDimitry Andric   class ARMLowOverheadLoops : public MachineFunctionPass {
4738bcb0991SDimitry Andric     MachineFunction           *MF = nullptr;
474480093f4SDimitry Andric     MachineLoopInfo           *MLI = nullptr;
475480093f4SDimitry Andric     ReachingDefAnalysis       *RDA = nullptr;
4760b57cec5SDimitry Andric     const ARMBaseInstrInfo    *TII = nullptr;
4770b57cec5SDimitry Andric     MachineRegisterInfo       *MRI = nullptr;
478480093f4SDimitry Andric     const TargetRegisterInfo  *TRI = nullptr;
4790b57cec5SDimitry Andric     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   public:
4820b57cec5SDimitry Andric     static char ID;
4830b57cec5SDimitry Andric 
4840b57cec5SDimitry Andric     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
4850b57cec5SDimitry Andric 
4860b57cec5SDimitry Andric     void getAnalysisUsage(AnalysisUsage &AU) const override {
4870b57cec5SDimitry Andric       AU.setPreservesCFG();
4880b57cec5SDimitry Andric       AU.addRequired<MachineLoopInfo>();
489480093f4SDimitry Andric       AU.addRequired<ReachingDefAnalysis>();
4900b57cec5SDimitry Andric       MachineFunctionPass::getAnalysisUsage(AU);
4910b57cec5SDimitry Andric     }
4920b57cec5SDimitry Andric 
4930b57cec5SDimitry Andric     bool runOnMachineFunction(MachineFunction &MF) override;
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric     MachineFunctionProperties getRequiredProperties() const override {
4960b57cec5SDimitry Andric       return MachineFunctionProperties().set(
497480093f4SDimitry Andric           MachineFunctionProperties::Property::NoVRegs).set(
498480093f4SDimitry Andric           MachineFunctionProperties::Property::TracksLiveness);
4990b57cec5SDimitry Andric     }
5000b57cec5SDimitry Andric 
5010b57cec5SDimitry Andric     StringRef getPassName() const override {
5020b57cec5SDimitry Andric       return ARM_LOW_OVERHEAD_LOOPS_NAME;
5030b57cec5SDimitry Andric     }
5048bcb0991SDimitry Andric 
5058bcb0991SDimitry Andric   private:
5068bcb0991SDimitry Andric     bool ProcessLoop(MachineLoop *ML);
5078bcb0991SDimitry Andric 
5088bcb0991SDimitry Andric     bool RevertNonLoops();
5098bcb0991SDimitry Andric 
5108bcb0991SDimitry Andric     void RevertWhile(MachineInstr *MI) const;
511*e8d8bef9SDimitry Andric     void RevertDo(MachineInstr *MI) const;
5128bcb0991SDimitry Andric 
5135ffd83dbSDimitry Andric     bool RevertLoopDec(MachineInstr *MI) const;
5148bcb0991SDimitry Andric 
5158bcb0991SDimitry Andric     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
5168bcb0991SDimitry Andric 
517*e8d8bef9SDimitry Andric     void RevertLoopEndDec(MachineInstr *MI) const;
518480093f4SDimitry Andric 
519*e8d8bef9SDimitry Andric     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
5205ffd83dbSDimitry Andric 
521480093f4SDimitry Andric     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
522480093f4SDimitry Andric 
523480093f4SDimitry Andric     void Expand(LowOverheadLoop &LoLoop);
5248bcb0991SDimitry Andric 
5255ffd83dbSDimitry Andric     void IterationCountDCE(LowOverheadLoop &LoLoop);
5260b57cec5SDimitry Andric   };
5270b57cec5SDimitry Andric }
5280b57cec5SDimitry Andric 
5290b57cec5SDimitry Andric char ARMLowOverheadLoops::ID = 0;
5300b57cec5SDimitry Andric 
531*e8d8bef9SDimitry Andric SmallVector<VPTState, 4> VPTState::Blocks;
532*e8d8bef9SDimitry Andric SetVector<MachineInstr *> VPTState::CurrentPredicates;
533*e8d8bef9SDimitry Andric std::map<MachineInstr *,
534*e8d8bef9SDimitry Andric          std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
535*e8d8bef9SDimitry Andric 
5360b57cec5SDimitry Andric INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
5370b57cec5SDimitry Andric                 false, false)
5380b57cec5SDimitry Andric 
539*e8d8bef9SDimitry Andric static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
540*e8d8bef9SDimitry Andric                       InstSet &ToRemove, InstSet &Ignore) {
541480093f4SDimitry Andric 
542*e8d8bef9SDimitry Andric   // Check that we can remove all of Killed without having to modify any IT
543*e8d8bef9SDimitry Andric   // blocks.
544*e8d8bef9SDimitry Andric   auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
545*e8d8bef9SDimitry Andric     // Collect the dead code and the MBBs in which they reside.
546*e8d8bef9SDimitry Andric     SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
547*e8d8bef9SDimitry Andric     for (auto *Dead : Killed)
548*e8d8bef9SDimitry Andric       BasicBlocks.insert(Dead->getParent());
549*e8d8bef9SDimitry Andric 
550*e8d8bef9SDimitry Andric     // Collect IT blocks in all affected basic blocks.
551*e8d8bef9SDimitry Andric     std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
552*e8d8bef9SDimitry Andric     for (auto *MBB : BasicBlocks) {
553*e8d8bef9SDimitry Andric       for (auto &IT : *MBB) {
554*e8d8bef9SDimitry Andric         if (IT.getOpcode() != ARM::t2IT)
555*e8d8bef9SDimitry Andric           continue;
556*e8d8bef9SDimitry Andric         RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
557*e8d8bef9SDimitry Andric                                  ITBlocks[&IT]);
558*e8d8bef9SDimitry Andric       }
559*e8d8bef9SDimitry Andric     }
560*e8d8bef9SDimitry Andric 
561*e8d8bef9SDimitry Andric     // If we're removing all of the instructions within an IT block, then
562*e8d8bef9SDimitry Andric     // also remove the IT instruction.
563*e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> ModifiedITs;
564*e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> RemoveITs;
565*e8d8bef9SDimitry Andric     for (auto *Dead : Killed) {
566*e8d8bef9SDimitry Andric       if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
567*e8d8bef9SDimitry Andric         MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
568*e8d8bef9SDimitry Andric         RemoveITs.insert(IT);
569*e8d8bef9SDimitry Andric         auto &CurrentBlock = ITBlocks[IT];
570*e8d8bef9SDimitry Andric         CurrentBlock.erase(Dead);
571*e8d8bef9SDimitry Andric         if (CurrentBlock.empty())
572*e8d8bef9SDimitry Andric           ModifiedITs.erase(IT);
573*e8d8bef9SDimitry Andric         else
574*e8d8bef9SDimitry Andric           ModifiedITs.insert(IT);
575*e8d8bef9SDimitry Andric       }
576*e8d8bef9SDimitry Andric     }
577*e8d8bef9SDimitry Andric     if (!ModifiedITs.empty())
578*e8d8bef9SDimitry Andric       return false;
579*e8d8bef9SDimitry Andric     Killed.insert(RemoveITs.begin(), RemoveITs.end());
580*e8d8bef9SDimitry Andric     return true;
581480093f4SDimitry Andric   };
582480093f4SDimitry Andric 
583*e8d8bef9SDimitry Andric   SmallPtrSet<MachineInstr *, 2> Uses;
584*e8d8bef9SDimitry Andric   if (!RDA.isSafeToRemove(MI, Uses, Ignore))
585*e8d8bef9SDimitry Andric     return false;
586480093f4SDimitry Andric 
587*e8d8bef9SDimitry Andric   if (WontCorruptITs(Uses, RDA)) {
588*e8d8bef9SDimitry Andric     ToRemove.insert(Uses.begin(), Uses.end());
589*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
590*e8d8bef9SDimitry Andric                << " - can also remove:\n";
591*e8d8bef9SDimitry Andric                for (auto *Use : Uses)
592*e8d8bef9SDimitry Andric                  dbgs() << "   - " << *Use);
593480093f4SDimitry Andric 
594*e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr*, 4> Killed;
595*e8d8bef9SDimitry Andric     RDA.collectKilledOperands(MI, Killed);
596*e8d8bef9SDimitry Andric     if (WontCorruptITs(Killed, RDA)) {
597*e8d8bef9SDimitry Andric       ToRemove.insert(Killed.begin(), Killed.end());
598*e8d8bef9SDimitry Andric       LLVM_DEBUG(for (auto *Dead : Killed)
599*e8d8bef9SDimitry Andric                    dbgs() << "   - " << *Dead);
600480093f4SDimitry Andric     }
601*e8d8bef9SDimitry Andric     return true;
602*e8d8bef9SDimitry Andric   }
603480093f4SDimitry Andric   return false;
604480093f4SDimitry Andric }
605*e8d8bef9SDimitry Andric 
606*e8d8bef9SDimitry Andric bool LowOverheadLoop::ValidateTailPredicate() {
607*e8d8bef9SDimitry Andric   if (!IsTailPredicationLegal()) {
608*e8d8bef9SDimitry Andric     LLVM_DEBUG(if (VCTPs.empty())
609*e8d8bef9SDimitry Andric                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
610*e8d8bef9SDimitry Andric                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
611*e8d8bef9SDimitry Andric     return false;
612*e8d8bef9SDimitry Andric   }
613*e8d8bef9SDimitry Andric 
614*e8d8bef9SDimitry Andric   assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
615*e8d8bef9SDimitry Andric   assert(ML.getBlocks().size() == 1 &&
616*e8d8bef9SDimitry Andric          "Shouldn't be processing a loop with more than one block");
617*e8d8bef9SDimitry Andric 
618*e8d8bef9SDimitry Andric   if (DisableTailPredication) {
619*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
620*e8d8bef9SDimitry Andric     return false;
621*e8d8bef9SDimitry Andric   }
622*e8d8bef9SDimitry Andric 
623*e8d8bef9SDimitry Andric   if (!VPTState::isValid(RDA)) {
624*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
625*e8d8bef9SDimitry Andric     return false;
626*e8d8bef9SDimitry Andric   }
627*e8d8bef9SDimitry Andric 
628*e8d8bef9SDimitry Andric   if (!ValidateLiveOuts()) {
629*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
630*e8d8bef9SDimitry Andric     return false;
631*e8d8bef9SDimitry Andric   }
632*e8d8bef9SDimitry Andric 
633*e8d8bef9SDimitry Andric   // Check that creating a [W|D]LSTP, which will define LR with an element
634*e8d8bef9SDimitry Andric   // count instead of iteration count, won't affect any other instructions
635*e8d8bef9SDimitry Andric   // than the LoopStart and LoopDec.
636*e8d8bef9SDimitry Andric   // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
637*e8d8bef9SDimitry Andric   Register StartReg = isDo(Start) ? Start->getOperand(1).getReg()
638*e8d8bef9SDimitry Andric                                   : Start->getOperand(0).getReg();
639*e8d8bef9SDimitry Andric   if (StartInsertPt == Start && StartReg == ARM::LR) {
640*e8d8bef9SDimitry Andric     if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) {
641*e8d8bef9SDimitry Andric       SmallPtrSet<MachineInstr *, 2> Uses;
642*e8d8bef9SDimitry Andric       RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
643*e8d8bef9SDimitry Andric       for (auto *Use : Uses) {
644*e8d8bef9SDimitry Andric         if (Use != Start && Use != Dec) {
645*e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use);
646480093f4SDimitry Andric           return false;
647480093f4SDimitry Andric         }
648480093f4SDimitry Andric       }
649*e8d8bef9SDimitry Andric     }
650*e8d8bef9SDimitry Andric   }
6515ffd83dbSDimitry Andric 
652480093f4SDimitry Andric   // For tail predication, we need to provide the number of elements, instead
653480093f4SDimitry Andric   // of the iteration count, to the loop start instruction. The number of
654480093f4SDimitry Andric   // elements is provided to the vctp instruction, so we need to check that
655480093f4SDimitry Andric   // we can use this register at InsertPt.
656*e8d8bef9SDimitry Andric   MachineInstr *VCTP = VCTPs.back();
657*e8d8bef9SDimitry Andric   if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
658*e8d8bef9SDimitry Andric     TPNumElements = Start->getOperand(2);
659*e8d8bef9SDimitry Andric     StartInsertPt = Start;
660*e8d8bef9SDimitry Andric     StartInsertBB = Start->getParent();
661*e8d8bef9SDimitry Andric   } else {
662*e8d8bef9SDimitry Andric     TPNumElements = VCTP->getOperand(1);
663*e8d8bef9SDimitry Andric     MCRegister NumElements = TPNumElements.getReg().asMCReg();
664480093f4SDimitry Andric 
665480093f4SDimitry Andric     // If the register is defined within loop, then we can't perform TP.
666480093f4SDimitry Andric     // TODO: Check whether this is just a mov of a register that would be
667480093f4SDimitry Andric     // available.
6685ffd83dbSDimitry Andric     if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
669480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
670480093f4SDimitry Andric       return false;
671480093f4SDimitry Andric     }
672480093f4SDimitry Andric 
673480093f4SDimitry Andric     // The element count register maybe defined after InsertPt, in which case we
674480093f4SDimitry Andric     // need to try to move either InsertPt or the def so that the [w|d]lstp can
675480093f4SDimitry Andric     // use the value.
676*e8d8bef9SDimitry Andric 
677*e8d8bef9SDimitry Andric     if (StartInsertPt != StartInsertBB->end() &&
678*e8d8bef9SDimitry Andric         !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
679*e8d8bef9SDimitry Andric       if (auto *ElemDef =
680*e8d8bef9SDimitry Andric               RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
681*e8d8bef9SDimitry Andric         if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
682480093f4SDimitry Andric           ElemDef->removeFromParent();
683*e8d8bef9SDimitry Andric           StartInsertBB->insert(StartInsertPt, ElemDef);
684*e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
685*e8d8bef9SDimitry Andric                      << "ARM Loops: Moved element count def: " << *ElemDef);
686*e8d8bef9SDimitry Andric         } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
6875ffd83dbSDimitry Andric           StartInsertPt->removeFromParent();
688*e8d8bef9SDimitry Andric           StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
689*e8d8bef9SDimitry Andric                                      &*StartInsertPt);
690480093f4SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
691480093f4SDimitry Andric         } else {
692*e8d8bef9SDimitry Andric           // If we fail to move an instruction and the element count is provided
693*e8d8bef9SDimitry Andric           // by a mov, use the mov operand if it will have the same value at the
694*e8d8bef9SDimitry Andric           // insertion point
695*e8d8bef9SDimitry Andric           MachineOperand Operand = ElemDef->getOperand(1);
696*e8d8bef9SDimitry Andric           if (isMovRegOpcode(ElemDef->getOpcode()) &&
697*e8d8bef9SDimitry Andric               RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
698*e8d8bef9SDimitry Andric                   RDA.getUniqueReachingMIDef(&*StartInsertPt,
699*e8d8bef9SDimitry Andric                                              Operand.getReg().asMCReg())) {
700*e8d8bef9SDimitry Andric             TPNumElements = Operand;
701*e8d8bef9SDimitry Andric             NumElements = TPNumElements.getReg();
702*e8d8bef9SDimitry Andric           } else {
703*e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs()
704*e8d8bef9SDimitry Andric                        << "ARM Loops: Unable to move element count to loop "
705480093f4SDimitry Andric                        << "start instruction.\n");
706480093f4SDimitry Andric             return false;
707480093f4SDimitry Andric           }
708480093f4SDimitry Andric         }
709480093f4SDimitry Andric       }
710*e8d8bef9SDimitry Andric     }
711480093f4SDimitry Andric 
712480093f4SDimitry Andric     // Especially in the case of while loops, InsertBB may not be the
713480093f4SDimitry Andric     // preheader, so we need to check that the register isn't redefined
714480093f4SDimitry Andric     // before entering the loop.
7155ffd83dbSDimitry Andric     auto CannotProvideElements = [this](MachineBasicBlock *MBB,
716*e8d8bef9SDimitry Andric                                         MCRegister NumElements) {
717*e8d8bef9SDimitry Andric       if (MBB->empty())
718*e8d8bef9SDimitry Andric         return false;
719480093f4SDimitry Andric       // NumElements is redefined in this block.
7205ffd83dbSDimitry Andric       if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
721480093f4SDimitry Andric         return true;
722480093f4SDimitry Andric 
723480093f4SDimitry Andric       // Don't continue searching up through multiple predecessors.
724480093f4SDimitry Andric       if (MBB->pred_size() > 1)
725480093f4SDimitry Andric         return true;
726480093f4SDimitry Andric 
727480093f4SDimitry Andric       return false;
728480093f4SDimitry Andric     };
729480093f4SDimitry Andric 
730*e8d8bef9SDimitry Andric     // Search backwards for a def, until we get to InsertBB.
7315ffd83dbSDimitry Andric     MachineBasicBlock *MBB = Preheader;
732*e8d8bef9SDimitry Andric     while (MBB && MBB != StartInsertBB) {
733480093f4SDimitry Andric       if (CannotProvideElements(MBB, NumElements)) {
734480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
735480093f4SDimitry Andric         return false;
736480093f4SDimitry Andric       }
737480093f4SDimitry Andric       MBB = *MBB->pred_begin();
738480093f4SDimitry Andric     }
739*e8d8bef9SDimitry Andric   }
740*e8d8bef9SDimitry Andric 
741*e8d8bef9SDimitry Andric   // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
742*e8d8bef9SDimitry Andric   // world the [w|d]lstp instruction would be last instruction in the preheader
743*e8d8bef9SDimitry Andric   // and so it would only affect instructions within the loop body. But due to
744*e8d8bef9SDimitry Andric   // scheduling, and/or the logic in this pass (above), the insertion point can
745*e8d8bef9SDimitry Andric   // be moved earlier. So if the Loop Start isn't the last instruction in the
746*e8d8bef9SDimitry Andric   // preheader, and if the initial element count is smaller than the vector
747*e8d8bef9SDimitry Andric   // width, the Loop Start instruction will immediately generate one or more
748*e8d8bef9SDimitry Andric   // false lane mask which can, incorrectly, affect the proceeding MVE
749*e8d8bef9SDimitry Andric   // instructions in the preheader.
750*e8d8bef9SDimitry Andric   if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
751*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
752*e8d8bef9SDimitry Andric     return false;
753*e8d8bef9SDimitry Andric   }
754480093f4SDimitry Andric 
7555ffd83dbSDimitry Andric   // Check that the value change of the element count is what we expect and
7565ffd83dbSDimitry Andric   // that the predication will be equivalent. For this we need:
7575ffd83dbSDimitry Andric   // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
7585ffd83dbSDimitry Andric   // and we can also allow register copies within the chain too.
7595ffd83dbSDimitry Andric   auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
7605ffd83dbSDimitry Andric     return -getAddSubImmediate(*MI) == ExpectedVecWidth;
7615ffd83dbSDimitry Andric   };
7625ffd83dbSDimitry Andric 
763*e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = VCTP->getParent();
764*e8d8bef9SDimitry Andric   // Remove modifications to the element count since they have no purpose in a
765*e8d8bef9SDimitry Andric   // tail predicated loop. Explicitly refer to the vctp operand no matter which
766*e8d8bef9SDimitry Andric   // register NumElements has been assigned to, since that is what the
767*e8d8bef9SDimitry Andric   // modifications will be using
768*e8d8bef9SDimitry Andric   if (auto *Def = RDA.getUniqueReachingMIDef(
769*e8d8bef9SDimitry Andric           &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
7705ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr*, 2> ElementChain;
771*e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr*, 2> Ignore;
7725ffd83dbSDimitry Andric     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
7735ffd83dbSDimitry Andric 
774*e8d8bef9SDimitry Andric     Ignore.insert(VCTPs.begin(), VCTPs.end());
7755ffd83dbSDimitry Andric 
776*e8d8bef9SDimitry Andric     if (TryRemove(Def, RDA, ElementChain, Ignore)) {
7775ffd83dbSDimitry Andric       bool FoundSub = false;
7785ffd83dbSDimitry Andric 
7795ffd83dbSDimitry Andric       for (auto *MI : ElementChain) {
7805ffd83dbSDimitry Andric         if (isMovRegOpcode(MI->getOpcode()))
7815ffd83dbSDimitry Andric           continue;
7825ffd83dbSDimitry Andric 
7835ffd83dbSDimitry Andric         if (isSubImmOpcode(MI->getOpcode())) {
784*e8d8bef9SDimitry Andric           if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
785*e8d8bef9SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
786*e8d8bef9SDimitry Andric                        " count: " << *MI);
7875ffd83dbSDimitry Andric             return false;
7885ffd83dbSDimitry Andric           }
789*e8d8bef9SDimitry Andric           FoundSub = true;
790*e8d8bef9SDimitry Andric         } else {
791*e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
792*e8d8bef9SDimitry Andric                      " count: " << *MI);
793*e8d8bef9SDimitry Andric           return false;
794*e8d8bef9SDimitry Andric         }
795*e8d8bef9SDimitry Andric       }
7965ffd83dbSDimitry Andric       ToRemove.insert(ElementChain.begin(), ElementChain.end());
7975ffd83dbSDimitry Andric     }
7985ffd83dbSDimitry Andric   }
799480093f4SDimitry Andric   return true;
800480093f4SDimitry Andric }
801480093f4SDimitry Andric 
8025ffd83dbSDimitry Andric static bool isRegInClass(const MachineOperand &MO,
8035ffd83dbSDimitry Andric                          const TargetRegisterClass *Class) {
8045ffd83dbSDimitry Andric   return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
8055ffd83dbSDimitry Andric }
8065ffd83dbSDimitry Andric 
8075ffd83dbSDimitry Andric // MVE 'narrowing' operate on half a lane, reading from half and writing
8085ffd83dbSDimitry Andric // to half, which are referred to has the top and bottom half. The other
8095ffd83dbSDimitry Andric // half retains its previous value.
8105ffd83dbSDimitry Andric static bool retainsPreviousHalfElement(const MachineInstr &MI) {
8115ffd83dbSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
8125ffd83dbSDimitry Andric   uint64_t Flags = MCID.TSFlags;
8135ffd83dbSDimitry Andric   return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
8145ffd83dbSDimitry Andric }
8155ffd83dbSDimitry Andric 
8165ffd83dbSDimitry Andric // Some MVE instructions read from the top/bottom halves of their operand(s)
8175ffd83dbSDimitry Andric // and generate a vector result with result elements that are double the
8185ffd83dbSDimitry Andric // width of the input.
8195ffd83dbSDimitry Andric static bool producesDoubleWidthResult(const MachineInstr &MI) {
8205ffd83dbSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
8215ffd83dbSDimitry Andric   uint64_t Flags = MCID.TSFlags;
8225ffd83dbSDimitry Andric   return (Flags & ARMII::DoubleWidthResult) != 0;
8235ffd83dbSDimitry Andric }
8245ffd83dbSDimitry Andric 
8255ffd83dbSDimitry Andric static bool isHorizontalReduction(const MachineInstr &MI) {
8265ffd83dbSDimitry Andric   const MCInstrDesc &MCID = MI.getDesc();
8275ffd83dbSDimitry Andric   uint64_t Flags = MCID.TSFlags;
8285ffd83dbSDimitry Andric   return (Flags & ARMII::HorizontalReduction) != 0;
8295ffd83dbSDimitry Andric }
8305ffd83dbSDimitry Andric 
8315ffd83dbSDimitry Andric // Can this instruction generate a non-zero result when given only zeroed
8325ffd83dbSDimitry Andric // operands? This allows us to know that, given operands with false bytes
8335ffd83dbSDimitry Andric // zeroed by masked loads, that the result will also contain zeros in those
8345ffd83dbSDimitry Andric // bytes.
8355ffd83dbSDimitry Andric static bool canGenerateNonZeros(const MachineInstr &MI) {
8365ffd83dbSDimitry Andric 
8375ffd83dbSDimitry Andric   // Check for instructions which can write into a larger element size,
8385ffd83dbSDimitry Andric   // possibly writing into a previous zero'd lane.
8395ffd83dbSDimitry Andric   if (producesDoubleWidthResult(MI))
8405ffd83dbSDimitry Andric     return true;
8415ffd83dbSDimitry Andric 
8425ffd83dbSDimitry Andric   switch (MI.getOpcode()) {
8435ffd83dbSDimitry Andric   default:
8445ffd83dbSDimitry Andric     break;
8455ffd83dbSDimitry Andric   // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
8465ffd83dbSDimitry Andric   // fp16 -> fp32 vector conversions.
8475ffd83dbSDimitry Andric   // Instructions that perform a NOT will generate 1s from 0s.
8485ffd83dbSDimitry Andric   case ARM::MVE_VMVN:
8495ffd83dbSDimitry Andric   case ARM::MVE_VORN:
8505ffd83dbSDimitry Andric   // Count leading zeros will do just that!
8515ffd83dbSDimitry Andric   case ARM::MVE_VCLZs8:
8525ffd83dbSDimitry Andric   case ARM::MVE_VCLZs16:
8535ffd83dbSDimitry Andric   case ARM::MVE_VCLZs32:
8545ffd83dbSDimitry Andric     return true;
8555ffd83dbSDimitry Andric   }
8565ffd83dbSDimitry Andric   return false;
8575ffd83dbSDimitry Andric }
8585ffd83dbSDimitry Andric 
8595ffd83dbSDimitry Andric // Look at its register uses to see if it only can only receive zeros
8605ffd83dbSDimitry Andric // into its false lanes which would then produce zeros. Also check that
8615ffd83dbSDimitry Andric // the output register is also defined by an FalseLanesZero instruction
8625ffd83dbSDimitry Andric // so that if tail-predication happens, the lanes that aren't updated will
8635ffd83dbSDimitry Andric // still be zeros.
8645ffd83dbSDimitry Andric static bool producesFalseLanesZero(MachineInstr &MI,
8655ffd83dbSDimitry Andric                                    const TargetRegisterClass *QPRs,
8665ffd83dbSDimitry Andric                                    const ReachingDefAnalysis &RDA,
8675ffd83dbSDimitry Andric                                    InstSet &FalseLanesZero) {
8685ffd83dbSDimitry Andric   if (canGenerateNonZeros(MI))
8695ffd83dbSDimitry Andric     return false;
8705ffd83dbSDimitry Andric 
871*e8d8bef9SDimitry Andric   bool isPredicated = isVectorPredicated(&MI);
872*e8d8bef9SDimitry Andric   // Predicated loads will write zeros to the falsely predicated bytes of the
873*e8d8bef9SDimitry Andric   // destination register.
874*e8d8bef9SDimitry Andric   if (MI.mayLoad())
875*e8d8bef9SDimitry Andric     return isPredicated;
876*e8d8bef9SDimitry Andric 
877*e8d8bef9SDimitry Andric   auto IsZeroInit = [](MachineInstr *Def) {
878*e8d8bef9SDimitry Andric     return !isVectorPredicated(Def) &&
879*e8d8bef9SDimitry Andric            Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
880*e8d8bef9SDimitry Andric            Def->getOperand(1).getImm() == 0;
881*e8d8bef9SDimitry Andric   };
882*e8d8bef9SDimitry Andric 
8835ffd83dbSDimitry Andric   bool AllowScalars = isHorizontalReduction(MI);
8845ffd83dbSDimitry Andric   for (auto &MO : MI.operands()) {
8855ffd83dbSDimitry Andric     if (!MO.isReg() || !MO.getReg())
8865ffd83dbSDimitry Andric       continue;
8875ffd83dbSDimitry Andric     if (!isRegInClass(MO, QPRs) && AllowScalars)
8885ffd83dbSDimitry Andric       continue;
889*e8d8bef9SDimitry Andric 
890*e8d8bef9SDimitry Andric     // Check that this instruction will produce zeros in its false lanes:
891*e8d8bef9SDimitry Andric     // - If it only consumes false lanes zero or constant 0 (vmov #0)
892*e8d8bef9SDimitry Andric     // - If it's predicated, it only matters that it's def register already has
893*e8d8bef9SDimitry Andric     //   false lane zeros, so we can ignore the uses.
894*e8d8bef9SDimitry Andric     SmallPtrSet<MachineInstr *, 2> Defs;
895*e8d8bef9SDimitry Andric     RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
896*e8d8bef9SDimitry Andric     for (auto *Def : Defs) {
897*e8d8bef9SDimitry Andric       if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
898*e8d8bef9SDimitry Andric         continue;
899*e8d8bef9SDimitry Andric       if (MO.isUse() && isPredicated)
9005ffd83dbSDimitry Andric         continue;
9015ffd83dbSDimitry Andric       return false;
9025ffd83dbSDimitry Andric     }
903*e8d8bef9SDimitry Andric   }
9045ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
9055ffd83dbSDimitry Andric   return true;
9065ffd83dbSDimitry Andric }
9075ffd83dbSDimitry Andric 
9085ffd83dbSDimitry Andric bool LowOverheadLoop::ValidateLiveOuts() {
9095ffd83dbSDimitry Andric   // We want to find out if the tail-predicated version of this loop will
9105ffd83dbSDimitry Andric   // produce the same values as the loop in its original form. For this to
9115ffd83dbSDimitry Andric   // be true, the newly inserted implicit predication must not change the
9125ffd83dbSDimitry Andric   // the (observable) results.
9135ffd83dbSDimitry Andric   // We're doing this because many instructions in the loop will not be
9145ffd83dbSDimitry Andric   // predicated and so the conversion from VPT predication to tail-predication
9155ffd83dbSDimitry Andric   // can result in different values being produced; due to the tail-predication
9165ffd83dbSDimitry Andric   // preventing many instructions from updating their falsely predicated
9175ffd83dbSDimitry Andric   // lanes. This analysis assumes that all the instructions perform lane-wise
9185ffd83dbSDimitry Andric   // operations and don't perform any exchanges.
9195ffd83dbSDimitry Andric   // A masked load, whether through VPT or tail predication, will write zeros
9205ffd83dbSDimitry Andric   // to any of the falsely predicated bytes. So, from the loads, we know that
9215ffd83dbSDimitry Andric   // the false lanes are zeroed and here we're trying to track that those false
9225ffd83dbSDimitry Andric   // lanes remain zero, or where they change, the differences are masked away
9235ffd83dbSDimitry Andric   // by their user(s).
924*e8d8bef9SDimitry Andric   // All MVE stores have to be predicated, so we know that any predicate load
9255ffd83dbSDimitry Andric   // operands, or stored results are equivalent already. Other explicitly
9265ffd83dbSDimitry Andric   // predicated instructions will perform the same operation in the original
9275ffd83dbSDimitry Andric   // loop and the tail-predicated form too. Because of this, we can insert
9285ffd83dbSDimitry Andric   // loads, stores and other predicated instructions into our Predicated
9295ffd83dbSDimitry Andric   // set and build from there.
9305ffd83dbSDimitry Andric   const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
9315ffd83dbSDimitry Andric   SetVector<MachineInstr *> FalseLanesUnknown;
9325ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
9335ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> Predicated;
9345ffd83dbSDimitry Andric   MachineBasicBlock *Header = ML.getHeader();
9355ffd83dbSDimitry Andric 
9365ffd83dbSDimitry Andric   for (auto &MI : *Header) {
937*e8d8bef9SDimitry Andric     if (!shouldInspect(MI))
9385ffd83dbSDimitry Andric       continue;
9395ffd83dbSDimitry Andric 
9405ffd83dbSDimitry Andric     if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
9415ffd83dbSDimitry Andric       continue;
9425ffd83dbSDimitry Andric 
943*e8d8bef9SDimitry Andric     bool isPredicated = isVectorPredicated(&MI);
944*e8d8bef9SDimitry Andric     bool retainsOrReduces =
945*e8d8bef9SDimitry Andric       retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
946*e8d8bef9SDimitry Andric 
947*e8d8bef9SDimitry Andric     if (isPredicated)
9485ffd83dbSDimitry Andric       Predicated.insert(&MI);
949*e8d8bef9SDimitry Andric     if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
9505ffd83dbSDimitry Andric       FalseLanesZero.insert(&MI);
951*e8d8bef9SDimitry Andric     else if (MI.getNumDefs() == 0)
952*e8d8bef9SDimitry Andric       continue;
953*e8d8bef9SDimitry Andric     else if (!isPredicated && retainsOrReduces)
954*e8d8bef9SDimitry Andric       return false;
955*e8d8bef9SDimitry Andric     else if (!isPredicated)
956*e8d8bef9SDimitry Andric       FalseLanesUnknown.insert(&MI);
9575ffd83dbSDimitry Andric   }
9585ffd83dbSDimitry Andric 
9595ffd83dbSDimitry Andric   auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
9605ffd83dbSDimitry Andric                               SmallPtrSetImpl<MachineInstr *> &Predicated) {
9615ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 2> Uses;
962*e8d8bef9SDimitry Andric     RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
9635ffd83dbSDimitry Andric     for (auto *Use : Uses) {
9645ffd83dbSDimitry Andric       if (Use != MI && !Predicated.count(Use))
9655ffd83dbSDimitry Andric         return false;
9665ffd83dbSDimitry Andric     }
9675ffd83dbSDimitry Andric     return true;
9685ffd83dbSDimitry Andric   };
9695ffd83dbSDimitry Andric 
9705ffd83dbSDimitry Andric   // Visit the unknowns in reverse so that we can start at the values being
9715ffd83dbSDimitry Andric   // stored and then we can work towards the leaves, hopefully adding more
9725ffd83dbSDimitry Andric   // instructions to Predicated. Successfully terminating the loop means that
9735ffd83dbSDimitry Andric   // all the unknown values have to found to be masked by predicated user(s).
9745ffd83dbSDimitry Andric   // For any unpredicated values, we store them in NonPredicated so that we
9755ffd83dbSDimitry Andric   // can later check whether these form a reduction.
9765ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 2> NonPredicated;
9775ffd83dbSDimitry Andric   for (auto *MI : reverse(FalseLanesUnknown)) {
9785ffd83dbSDimitry Andric     for (auto &MO : MI->operands()) {
9795ffd83dbSDimitry Andric       if (!isRegInClass(MO, QPRs) || !MO.isDef())
9805ffd83dbSDimitry Andric         continue;
9815ffd83dbSDimitry Andric       if (!HasPredicatedUsers(MI, MO, Predicated)) {
9825ffd83dbSDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
9835ffd83dbSDimitry Andric                           << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
9845ffd83dbSDimitry Andric         NonPredicated.insert(MI);
985*e8d8bef9SDimitry Andric         break;
9865ffd83dbSDimitry Andric       }
9875ffd83dbSDimitry Andric     }
9885ffd83dbSDimitry Andric     // Any unknown false lanes have been masked away by the user(s).
989*e8d8bef9SDimitry Andric     if (!NonPredicated.contains(MI))
9905ffd83dbSDimitry Andric       Predicated.insert(MI);
9915ffd83dbSDimitry Andric   }
9925ffd83dbSDimitry Andric 
9935ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
9945ffd83dbSDimitry Andric   SmallVector<MachineBasicBlock *, 2> ExitBlocks;
9955ffd83dbSDimitry Andric   ML.getExitBlocks(ExitBlocks);
9965ffd83dbSDimitry Andric   assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
9975ffd83dbSDimitry Andric   assert(ExitBlocks.size() == 1 && "Expected a single exit block");
9985ffd83dbSDimitry Andric   MachineBasicBlock *ExitBB = ExitBlocks.front();
9995ffd83dbSDimitry Andric   for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1000*e8d8bef9SDimitry Andric     // TODO: Instead of blocking predication, we could move the vctp to the exit
1001*e8d8bef9SDimitry Andric     // block and calculate it's operand there in or the preheader.
1002*e8d8bef9SDimitry Andric     if (RegMask.PhysReg == ARM::VPR)
1003*e8d8bef9SDimitry Andric       return false;
10045ffd83dbSDimitry Andric     // Check Q-regs that are live in the exit blocks. We don't collect scalars
10055ffd83dbSDimitry Andric     // because they won't be affected by lane predication.
1006*e8d8bef9SDimitry Andric     if (QPRs->contains(RegMask.PhysReg))
10075ffd83dbSDimitry Andric       if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
10085ffd83dbSDimitry Andric         LiveOutMIs.insert(MI);
10095ffd83dbSDimitry Andric   }
10105ffd83dbSDimitry Andric 
10115ffd83dbSDimitry Andric   // We've already validated that any VPT predication within the loop will be
10125ffd83dbSDimitry Andric   // equivalent when we perform the predication transformation; so we know that
10135ffd83dbSDimitry Andric   // any VPT predicated instruction is predicated upon VCTP. Any live-out
10145ffd83dbSDimitry Andric   // instruction needs to be predicated, so check this here. The instructions
10155ffd83dbSDimitry Andric   // in NonPredicated have been found to be a reduction that we can ensure its
10165ffd83dbSDimitry Andric   // legality.
1017*e8d8bef9SDimitry Andric   for (auto *MI : LiveOutMIs) {
1018*e8d8bef9SDimitry Andric     if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
1019*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI);
10205ffd83dbSDimitry Andric       return false;
1021*e8d8bef9SDimitry Andric     }
1022*e8d8bef9SDimitry Andric   }
10235ffd83dbSDimitry Andric 
10245ffd83dbSDimitry Andric   return true;
10255ffd83dbSDimitry Andric }
10265ffd83dbSDimitry Andric 
1027*e8d8bef9SDimitry Andric void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1028480093f4SDimitry Andric   if (Revert)
1029480093f4SDimitry Andric     return;
1030480093f4SDimitry Andric 
1031*e8d8bef9SDimitry Andric   // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1032*e8d8bef9SDimitry Andric   // can only jump back.
1033*e8d8bef9SDimitry Andric   auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
1034*e8d8bef9SDimitry Andric                            ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1035*e8d8bef9SDimitry Andric     MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1036*e8d8bef9SDimitry Andric                                    ? End->getOperand(1).getMBB()
1037*e8d8bef9SDimitry Andric                                    : End->getOperand(2).getMBB();
1038480093f4SDimitry Andric     // TODO Maybe there's cases where the target doesn't have to be the header,
1039480093f4SDimitry Andric     // but for now be safe and revert.
1040*e8d8bef9SDimitry Andric     if (TgtBB != ML.getHeader()) {
1041*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1042*e8d8bef9SDimitry Andric       return false;
1043480093f4SDimitry Andric     }
1044480093f4SDimitry Andric 
1045480093f4SDimitry Andric     // The WLS and LE instructions have 12-bits for the label offset. WLS
1046480093f4SDimitry Andric     // requires a positive offset, while LE uses negative.
10475ffd83dbSDimitry Andric     if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
10485ffd83dbSDimitry Andric         !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
1049480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1050*e8d8bef9SDimitry Andric       return false;
1051480093f4SDimitry Andric     }
1052480093f4SDimitry Andric 
1053480093f4SDimitry Andric     if (Start->getOpcode() == ARM::t2WhileLoopStart &&
1054480093f4SDimitry Andric         (BBUtils->getOffsetOf(Start) >
1055480093f4SDimitry Andric          BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
1056480093f4SDimitry Andric          !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
1057480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1058*e8d8bef9SDimitry Andric       return false;
1059*e8d8bef9SDimitry Andric     }
1060*e8d8bef9SDimitry Andric     return true;
1061*e8d8bef9SDimitry Andric   };
1062*e8d8bef9SDimitry Andric 
1063*e8d8bef9SDimitry Andric   // Find a suitable position to insert the loop start instruction. It needs to
1064*e8d8bef9SDimitry Andric   // be able to safely define LR.
1065*e8d8bef9SDimitry Andric   auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec,
1066*e8d8bef9SDimitry Andric                                     MachineBasicBlock::iterator &InsertPt,
1067*e8d8bef9SDimitry Andric                                     MachineBasicBlock *&InsertBB,
1068*e8d8bef9SDimitry Andric                                     ReachingDefAnalysis &RDA,
1069*e8d8bef9SDimitry Andric                                     InstSet &ToRemove) {
1070*e8d8bef9SDimitry Andric     // For a t2DoLoopStart it is always valid to use the start insertion point.
1071*e8d8bef9SDimitry Andric     // For WLS we can define LR if LR already contains the same value.
1072*e8d8bef9SDimitry Andric     if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) {
1073*e8d8bef9SDimitry Andric       InsertPt = MachineBasicBlock::iterator(Start);
1074*e8d8bef9SDimitry Andric       InsertBB = Start->getParent();
1075*e8d8bef9SDimitry Andric       return true;
1076480093f4SDimitry Andric     }
1077480093f4SDimitry Andric 
1078*e8d8bef9SDimitry Andric     // We've found no suitable LR def and Start doesn't use LR directly. Can we
1079*e8d8bef9SDimitry Andric     // just define LR anyway?
1080*e8d8bef9SDimitry Andric     if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
1081*e8d8bef9SDimitry Andric       return false;
1082*e8d8bef9SDimitry Andric 
1083*e8d8bef9SDimitry Andric     InsertPt = MachineBasicBlock::iterator(Start);
1084*e8d8bef9SDimitry Andric     InsertBB = Start->getParent();
1085*e8d8bef9SDimitry Andric     return true;
1086*e8d8bef9SDimitry Andric   };
1087*e8d8bef9SDimitry Andric 
1088*e8d8bef9SDimitry Andric   if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA,
1089*e8d8bef9SDimitry Andric                                ToRemove)) {
1090480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
1091480093f4SDimitry Andric     Revert = true;
1092480093f4SDimitry Andric     return;
1093*e8d8bef9SDimitry Andric   }
1094*e8d8bef9SDimitry Andric   LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end())
1095*e8d8bef9SDimitry Andric                dbgs() << "ARM Loops: Will insert LoopStart at end of block\n";
1096*e8d8bef9SDimitry Andric              else
1097*e8d8bef9SDimitry Andric                dbgs() << "ARM Loops: Will insert LoopStart at "
1098*e8d8bef9SDimitry Andric                       << *StartInsertPt
1099*e8d8bef9SDimitry Andric             );
1100480093f4SDimitry Andric 
1101*e8d8bef9SDimitry Andric   Revert = !ValidateRanges(Start, End, BBUtils, ML);
1102*e8d8bef9SDimitry Andric   CannotTailPredicate = !ValidateTailPredicate();
1103480093f4SDimitry Andric }
1104480093f4SDimitry Andric 
1105*e8d8bef9SDimitry Andric bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1106*e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1107*e8d8bef9SDimitry Andric   if (VCTPs.empty()) {
1108*e8d8bef9SDimitry Andric     VCTPs.push_back(MI);
1109*e8d8bef9SDimitry Andric     return true;
1110*e8d8bef9SDimitry Andric   }
1111*e8d8bef9SDimitry Andric 
1112*e8d8bef9SDimitry Andric   // If we find another VCTP, check whether it uses the same value as the main VCTP.
1113*e8d8bef9SDimitry Andric   // If it does, store it in the VCTPs set, else refuse it.
1114*e8d8bef9SDimitry Andric   MachineInstr *Prev = VCTPs.back();
1115*e8d8bef9SDimitry Andric   if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1116*e8d8bef9SDimitry Andric       !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1117*e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1118*e8d8bef9SDimitry Andric                          "definition from the main VCTP");
1119*e8d8bef9SDimitry Andric     return false;
1120*e8d8bef9SDimitry Andric   }
1121*e8d8bef9SDimitry Andric   VCTPs.push_back(MI);
1122*e8d8bef9SDimitry Andric   return true;
1123480093f4SDimitry Andric }
1124480093f4SDimitry Andric 
1125480093f4SDimitry Andric bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
1126480093f4SDimitry Andric   if (CannotTailPredicate)
1127480093f4SDimitry Andric     return false;
1128480093f4SDimitry Andric 
1129*e8d8bef9SDimitry Andric   if (!shouldInspect(*MI))
1130480093f4SDimitry Andric     return true;
1131*e8d8bef9SDimitry Andric 
1132*e8d8bef9SDimitry Andric   if (MI->getOpcode() == ARM::MVE_VPSEL ||
11335ffd83dbSDimitry Andric       MI->getOpcode() == ARM::MVE_VPNOT) {
1134480093f4SDimitry Andric     // TODO: Allow VPSEL and VPNOT, we currently cannot because:
1135480093f4SDimitry Andric     // 1) It will use the VPR as a predicate operand, but doesn't have to be
1136480093f4SDimitry Andric     //    instead a VPT block, which means we can assert while building up
11375ffd83dbSDimitry Andric     //    the VPT block because we don't find another VPT or VPST to being a new
1138480093f4SDimitry Andric     //    one.
1139480093f4SDimitry Andric     // 2) VPSEL still requires a VPR operand even after tail predicating,
1140480093f4SDimitry Andric     //    which means we can't remove it unless there is another
1141480093f4SDimitry Andric     //    instruction, such as vcmp, that can provide the VPR def.
11425ffd83dbSDimitry Andric     return false;
11435ffd83dbSDimitry Andric   }
1144480093f4SDimitry Andric 
1145*e8d8bef9SDimitry Andric   // Record all VCTPs and check that they're equivalent to one another.
1146*e8d8bef9SDimitry Andric   if (isVCTP(MI) && !AddVCTP(MI))
1147*e8d8bef9SDimitry Andric     return false;
1148*e8d8bef9SDimitry Andric 
1149*e8d8bef9SDimitry Andric   // Inspect uses first so that any instructions that alter the VPR don't
1150*e8d8bef9SDimitry Andric   // alter the predicate upon themselves.
1151480093f4SDimitry Andric   const MCInstrDesc &MCID = MI->getDesc();
1152*e8d8bef9SDimitry Andric   bool IsUse = false;
1153*e8d8bef9SDimitry Andric   unsigned LastOpIdx = MI->getNumOperands() - 1;
1154*e8d8bef9SDimitry Andric   for (auto &Op : enumerate(reverse(MCID.operands()))) {
1155*e8d8bef9SDimitry Andric     const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1156*e8d8bef9SDimitry Andric     if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
1157480093f4SDimitry Andric       continue;
1158480093f4SDimitry Andric 
1159*e8d8bef9SDimitry Andric     if (ARM::isVpred(Op.value().OperandType)) {
1160*e8d8bef9SDimitry Andric       VPTState::addInst(MI);
1161480093f4SDimitry Andric       IsUse = true;
1162*e8d8bef9SDimitry Andric     } else if (MI->getOpcode() != ARM::MVE_VPST) {
1163480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1164480093f4SDimitry Andric       return false;
1165480093f4SDimitry Andric     }
1166480093f4SDimitry Andric   }
1167480093f4SDimitry Andric 
1168480093f4SDimitry Andric   // If we find an instruction that has been marked as not valid for tail
1169480093f4SDimitry Andric   // predication, only allow the instruction if it's contained within a valid
1170480093f4SDimitry Andric   // VPT block.
1171*e8d8bef9SDimitry Andric   bool RequiresExplicitPredication =
1172*e8d8bef9SDimitry Andric     (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
1173*e8d8bef9SDimitry Andric   if (isDomainMVE(MI) && RequiresExplicitPredication) {
1174*e8d8bef9SDimitry Andric     LLVM_DEBUG(if (!IsUse)
1175*e8d8bef9SDimitry Andric                dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
1176*e8d8bef9SDimitry Andric     return IsUse;
1177480093f4SDimitry Andric   }
1178480093f4SDimitry Andric 
11795ffd83dbSDimitry Andric   // If the instruction is already explicitly predicated, then the conversion
1180*e8d8bef9SDimitry Andric   // will be fine, but ensure that all store operations are predicated.
1181*e8d8bef9SDimitry Andric   if (MI->mayStore())
1182*e8d8bef9SDimitry Andric     return IsUse;
1183*e8d8bef9SDimitry Andric 
1184*e8d8bef9SDimitry Andric   // If this instruction defines the VPR, update the predicate for the
1185*e8d8bef9SDimitry Andric   // proceeding instructions.
1186*e8d8bef9SDimitry Andric   if (isVectorPredicate(MI)) {
1187*e8d8bef9SDimitry Andric     // Clear the existing predicate when we're not in VPT Active state,
1188*e8d8bef9SDimitry Andric     // otherwise we add to it.
1189*e8d8bef9SDimitry Andric     if (!isVectorPredicated(MI))
1190*e8d8bef9SDimitry Andric       VPTState::resetPredicate(MI);
1191*e8d8bef9SDimitry Andric     else
1192*e8d8bef9SDimitry Andric       VPTState::addPredicate(MI);
1193*e8d8bef9SDimitry Andric   }
1194*e8d8bef9SDimitry Andric 
1195*e8d8bef9SDimitry Andric   // Finally once the predicate has been modified, we can start a new VPT
1196*e8d8bef9SDimitry Andric   // block if necessary.
1197*e8d8bef9SDimitry Andric   if (isVPTOpcode(MI->getOpcode()))
1198*e8d8bef9SDimitry Andric     VPTState::CreateVPTBlock(MI);
1199*e8d8bef9SDimitry Andric 
1200*e8d8bef9SDimitry Andric   return true;
1201480093f4SDimitry Andric }
1202480093f4SDimitry Andric 
12038bcb0991SDimitry Andric bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
12048bcb0991SDimitry Andric   const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
12058bcb0991SDimitry Andric   if (!ST.hasLOB())
12060b57cec5SDimitry Andric     return false;
12070b57cec5SDimitry Andric 
12088bcb0991SDimitry Andric   MF = &mf;
12098bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
12100b57cec5SDimitry Andric 
1211480093f4SDimitry Andric   MLI = &getAnalysis<MachineLoopInfo>();
1212480093f4SDimitry Andric   RDA = &getAnalysis<ReachingDefAnalysis>();
12138bcb0991SDimitry Andric   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
12148bcb0991SDimitry Andric   MRI = &MF->getRegInfo();
12158bcb0991SDimitry Andric   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1216480093f4SDimitry Andric   TRI = ST.getRegisterInfo();
12178bcb0991SDimitry Andric   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
12180b57cec5SDimitry Andric   BBUtils->computeAllBlockSizes();
12198bcb0991SDimitry Andric   BBUtils->adjustBBOffsetsAfter(&MF->front());
12200b57cec5SDimitry Andric 
12210b57cec5SDimitry Andric   bool Changed = false;
1222480093f4SDimitry Andric   for (auto ML : *MLI) {
1223*e8d8bef9SDimitry Andric     if (ML->isOutermost())
12240b57cec5SDimitry Andric       Changed |= ProcessLoop(ML);
12250b57cec5SDimitry Andric   }
12268bcb0991SDimitry Andric   Changed |= RevertNonLoops();
12270b57cec5SDimitry Andric   return Changed;
12280b57cec5SDimitry Andric }
12290b57cec5SDimitry Andric 
12300b57cec5SDimitry Andric bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
12310b57cec5SDimitry Andric 
12320b57cec5SDimitry Andric   bool Changed = false;
12330b57cec5SDimitry Andric 
12340b57cec5SDimitry Andric   // Process inner loops first.
12350b57cec5SDimitry Andric   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
12360b57cec5SDimitry Andric     Changed |= ProcessLoop(*I);
12370b57cec5SDimitry Andric 
1238480093f4SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
1239480093f4SDimitry Andric              if (auto *Preheader = ML->getLoopPreheader())
1240480093f4SDimitry Andric                dbgs() << " - " << Preheader->getName() << "\n";
1241480093f4SDimitry Andric              else if (auto *Preheader = MLI->findLoopPreheader(ML))
1242480093f4SDimitry Andric                dbgs() << " - " << Preheader->getName() << "\n";
12435ffd83dbSDimitry Andric              else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
12445ffd83dbSDimitry Andric                dbgs() << " - " << Preheader->getName() << "\n";
1245480093f4SDimitry Andric              for (auto *MBB : ML->getBlocks())
1246480093f4SDimitry Andric                dbgs() << " - " << MBB->getName() << "\n";
1247480093f4SDimitry Andric             );
12480b57cec5SDimitry Andric 
12490b57cec5SDimitry Andric   // Search the given block for a loop start instruction. If one isn't found,
12500b57cec5SDimitry Andric   // and there's only one predecessor block, search that one too.
12510b57cec5SDimitry Andric   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
12528bcb0991SDimitry Andric     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
12530b57cec5SDimitry Andric     for (auto &MI : *MBB) {
1254480093f4SDimitry Andric       if (isLoopStart(MI))
12550b57cec5SDimitry Andric         return &MI;
12560b57cec5SDimitry Andric     }
12570b57cec5SDimitry Andric     if (MBB->pred_size() == 1)
12580b57cec5SDimitry Andric       return SearchForStart(*MBB->pred_begin());
12590b57cec5SDimitry Andric     return nullptr;
12600b57cec5SDimitry Andric   };
12610b57cec5SDimitry Andric 
12625ffd83dbSDimitry Andric   LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
1263480093f4SDimitry Andric   // Search the preheader for the start intrinsic.
12640b57cec5SDimitry Andric   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
12650b57cec5SDimitry Andric   // with potentially multiple set.loop.iterations, so we need to enable this.
12665ffd83dbSDimitry Andric   if (LoLoop.Preheader)
12675ffd83dbSDimitry Andric     LoLoop.Start = SearchForStart(LoLoop.Preheader);
1268480093f4SDimitry Andric   else
1269480093f4SDimitry Andric     return false;
12700b57cec5SDimitry Andric 
12710b57cec5SDimitry Andric   // Find the low-overhead loop components and decide whether or not to fall
1272480093f4SDimitry Andric   // back to a normal loop. Also look for a vctp instructions and decide
1273480093f4SDimitry Andric   // whether we can convert that predicate using tail predication.
12740b57cec5SDimitry Andric   for (auto *MBB : reverse(ML->getBlocks())) {
12750b57cec5SDimitry Andric     for (auto &MI : *MBB) {
12765ffd83dbSDimitry Andric       if (MI.isDebugValue())
12775ffd83dbSDimitry Andric         continue;
12785ffd83dbSDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopDec)
1279480093f4SDimitry Andric         LoLoop.Dec = &MI;
12800b57cec5SDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopEnd)
1281480093f4SDimitry Andric         LoLoop.End = &MI;
1282*e8d8bef9SDimitry Andric       else if (MI.getOpcode() == ARM::t2LoopEndDec)
1283*e8d8bef9SDimitry Andric         LoLoop.End = LoLoop.Dec = &MI;
1284480093f4SDimitry Andric       else if (isLoopStart(MI))
1285480093f4SDimitry Andric         LoLoop.Start = &MI;
12868bcb0991SDimitry Andric       else if (MI.getDesc().isCall()) {
12870b57cec5SDimitry Andric         // TODO: Though the call will require LE to execute again, does this
12880b57cec5SDimitry Andric         // mean we should revert? Always executing LE hopefully should be
12890b57cec5SDimitry Andric         // faster than performing a sub,cmp,br or even subs,br.
1290480093f4SDimitry Andric         LoLoop.Revert = true;
12918bcb0991SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1292480093f4SDimitry Andric       } else {
1293480093f4SDimitry Andric         // Record VPR defs and build up their corresponding vpt blocks.
1294480093f4SDimitry Andric         // Check we know how to tail predicate any mve instructions.
1295480093f4SDimitry Andric         LoLoop.AnalyseMVEInst(&MI);
12968bcb0991SDimitry Andric       }
12970b57cec5SDimitry Andric     }
12980b57cec5SDimitry Andric   }
12990b57cec5SDimitry Andric 
1300480093f4SDimitry Andric   LLVM_DEBUG(LoLoop.dump());
1301480093f4SDimitry Andric   if (!LoLoop.FoundAllComponents()) {
1302480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
13038bcb0991SDimitry Andric     return false;
13040b57cec5SDimitry Andric   }
13050b57cec5SDimitry Andric 
1306*e8d8bef9SDimitry Andric   // Check that the only instruction using LoopDec is LoopEnd. This can only
1307*e8d8bef9SDimitry Andric   // happen when the Dec and End are separate, not a single t2LoopEndDec.
13085ffd83dbSDimitry Andric   // TODO: Check for copy chains that really have no effect.
1309*e8d8bef9SDimitry Andric   if (LoLoop.Dec != LoLoop.End) {
13105ffd83dbSDimitry Andric     SmallPtrSet<MachineInstr *, 2> Uses;
1311*e8d8bef9SDimitry Andric     RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
13125ffd83dbSDimitry Andric     if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
13135ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
13145ffd83dbSDimitry Andric       LoLoop.Revert = true;
13155ffd83dbSDimitry Andric     }
1316*e8d8bef9SDimitry Andric   }
1317*e8d8bef9SDimitry Andric   LoLoop.Validate(BBUtils.get());
1318480093f4SDimitry Andric   Expand(LoLoop);
13190b57cec5SDimitry Andric   return true;
13200b57cec5SDimitry Andric }
13210b57cec5SDimitry Andric 
13220b57cec5SDimitry Andric // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
13230b57cec5SDimitry Andric // beq that branches to the exit branch.
13248bcb0991SDimitry Andric // TODO: We could also try to generate a cbz if the value in LR is also in
13250b57cec5SDimitry Andric // another low register.
13260b57cec5SDimitry Andric void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
13270b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
13288bcb0991SDimitry Andric   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
13298bcb0991SDimitry Andric   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
13308bcb0991SDimitry Andric     ARM::tBcc : ARM::t2Bcc;
13318bcb0991SDimitry Andric 
1332*e8d8bef9SDimitry Andric   RevertWhileLoopStart(MI, TII, BrOpc);
1333*e8d8bef9SDimitry Andric }
1334*e8d8bef9SDimitry Andric 
1335*e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1336*e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1337*e8d8bef9SDimitry Andric   RevertDoLoopStart(MI, TII);
13380b57cec5SDimitry Andric }
13390b57cec5SDimitry Andric 
13405ffd83dbSDimitry Andric bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
13410b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
13420b57cec5SDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
13435ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 1> Ignore;
13445ffd83dbSDimitry Andric   for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
13455ffd83dbSDimitry Andric     if (I->getOpcode() == ARM::t2LoopEnd) {
13465ffd83dbSDimitry Andric       Ignore.insert(&*I);
13475ffd83dbSDimitry Andric       break;
13485ffd83dbSDimitry Andric     }
13495ffd83dbSDimitry Andric   }
13508bcb0991SDimitry Andric 
1351480093f4SDimitry Andric   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1352*e8d8bef9SDimitry Andric   bool SetFlags =
1353*e8d8bef9SDimitry Andric       RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
13548bcb0991SDimitry Andric 
1355*e8d8bef9SDimitry Andric   llvm::RevertLoopDec(MI, TII, SetFlags);
13568bcb0991SDimitry Andric   return SetFlags;
13570b57cec5SDimitry Andric }
13580b57cec5SDimitry Andric 
13590b57cec5SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
13608bcb0991SDimitry Andric void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
13610b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
13620b57cec5SDimitry Andric 
13638bcb0991SDimitry Andric   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
13648bcb0991SDimitry Andric   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
13658bcb0991SDimitry Andric     ARM::tBcc : ARM::t2Bcc;
13668bcb0991SDimitry Andric 
1367*e8d8bef9SDimitry Andric   llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1368*e8d8bef9SDimitry Andric }
1369*e8d8bef9SDimitry Andric 
1370*e8d8bef9SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
1371*e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
1372*e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1373*e8d8bef9SDimitry Andric   assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1374*e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
1375*e8d8bef9SDimitry Andric 
13768bcb0991SDimitry Andric   MachineInstrBuilder MIB =
1377*e8d8bef9SDimitry Andric       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1378*e8d8bef9SDimitry Andric   MIB.addDef(ARM::LR);
1379*e8d8bef9SDimitry Andric   MIB.add(MI->getOperand(1));
1380*e8d8bef9SDimitry Andric   MIB.addImm(1);
1381*e8d8bef9SDimitry Andric   MIB.addImm(ARMCC::AL);
1382*e8d8bef9SDimitry Andric   MIB.addReg(ARM::NoRegister);
1383*e8d8bef9SDimitry Andric   MIB.addReg(ARM::CPSR);
1384*e8d8bef9SDimitry Andric   MIB->getOperand(5).setIsDef(true);
1385*e8d8bef9SDimitry Andric 
1386*e8d8bef9SDimitry Andric   MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
1387*e8d8bef9SDimitry Andric   unsigned BrOpc =
1388*e8d8bef9SDimitry Andric       BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
1389*e8d8bef9SDimitry Andric 
1390*e8d8bef9SDimitry Andric   // Create bne
1391*e8d8bef9SDimitry Andric   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1392*e8d8bef9SDimitry Andric   MIB.add(MI->getOperand(2)); // branch target
13930b57cec5SDimitry Andric   MIB.addImm(ARMCC::NE);      // condition code
13940b57cec5SDimitry Andric   MIB.addReg(ARM::CPSR);
1395*e8d8bef9SDimitry Andric 
13960b57cec5SDimitry Andric   MI->eraseFromParent();
13970b57cec5SDimitry Andric }
13980b57cec5SDimitry Andric 
13995ffd83dbSDimitry Andric // Perform dead code elimation on the loop iteration count setup expression.
14005ffd83dbSDimitry Andric // If we are tail-predicating, the number of elements to be processed is the
14015ffd83dbSDimitry Andric // operand of the VCTP instruction in the vector body, see getCount(), which is
14025ffd83dbSDimitry Andric // register $r3 in this example:
14035ffd83dbSDimitry Andric //
14045ffd83dbSDimitry Andric //   $lr = big-itercount-expression
14055ffd83dbSDimitry Andric //   ..
1406*e8d8bef9SDimitry Andric //   $lr = t2DoLoopStart renamable $lr
14075ffd83dbSDimitry Andric //   vector.body:
14085ffd83dbSDimitry Andric //     ..
14095ffd83dbSDimitry Andric //     $vpr = MVE_VCTP32 renamable $r3
14105ffd83dbSDimitry Andric //     renamable $lr = t2LoopDec killed renamable $lr, 1
14115ffd83dbSDimitry Andric //     t2LoopEnd renamable $lr, %vector.body
14125ffd83dbSDimitry Andric //     tB %end
14135ffd83dbSDimitry Andric //
14145ffd83dbSDimitry Andric // What we would like achieve here is to replace the do-loop start pseudo
14155ffd83dbSDimitry Andric // instruction t2DoLoopStart with:
14165ffd83dbSDimitry Andric //
14175ffd83dbSDimitry Andric //    $lr = MVE_DLSTP_32 killed renamable $r3
14185ffd83dbSDimitry Andric //
14195ffd83dbSDimitry Andric // Thus, $r3 which defines the number of elements, is written to $lr,
14205ffd83dbSDimitry Andric // and then we want to delete the whole chain that used to define $lr,
14215ffd83dbSDimitry Andric // see the comment below how this chain could look like.
14225ffd83dbSDimitry Andric //
14235ffd83dbSDimitry Andric void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
14245ffd83dbSDimitry Andric   if (!LoLoop.IsTailPredicationLegal())
14255ffd83dbSDimitry Andric     return;
14265ffd83dbSDimitry Andric 
14275ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
14285ffd83dbSDimitry Andric 
1429*e8d8bef9SDimitry Andric   MachineInstr *Def =
1430*e8d8bef9SDimitry Andric       RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0);
14315ffd83dbSDimitry Andric   if (!Def) {
14325ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
14335ffd83dbSDimitry Andric     return;
14345ffd83dbSDimitry Andric   }
14355ffd83dbSDimitry Andric 
14365ffd83dbSDimitry Andric   // Collect and remove the users of iteration count.
14375ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr*, 4> Killed  = { LoLoop.Start, LoLoop.Dec,
1438*e8d8bef9SDimitry Andric                                             LoLoop.End };
1439*e8d8bef9SDimitry Andric   if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
14405ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
14415ffd83dbSDimitry Andric }
14425ffd83dbSDimitry Andric 
1443480093f4SDimitry Andric MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
14445ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
14455ffd83dbSDimitry Andric   // When using tail-predication, try to delete the dead code that was used to
14465ffd83dbSDimitry Andric   // calculate the number of loop iterations.
14475ffd83dbSDimitry Andric   IterationCountDCE(LoLoop);
14485ffd83dbSDimitry Andric 
1449*e8d8bef9SDimitry Andric   MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1450480093f4SDimitry Andric   MachineInstr *Start = LoLoop.Start;
1451*e8d8bef9SDimitry Andric   MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1452480093f4SDimitry Andric   unsigned Opc = LoLoop.getStartOpcode();
1453*e8d8bef9SDimitry Andric   MachineOperand &Count = LoLoop.getLoopStartOperand();
1454480093f4SDimitry Andric 
14550b57cec5SDimitry Andric   MachineInstrBuilder MIB =
1456*e8d8bef9SDimitry Andric     BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
14570b57cec5SDimitry Andric 
14580b57cec5SDimitry Andric   MIB.addDef(ARM::LR);
1459480093f4SDimitry Andric   MIB.add(Count);
1460*e8d8bef9SDimitry Andric   if (!isDo(Start))
14610b57cec5SDimitry Andric     MIB.add(Start->getOperand(1));
14620b57cec5SDimitry Andric 
14635ffd83dbSDimitry Andric   LoLoop.ToRemove.insert(Start);
14640b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
14650b57cec5SDimitry Andric   return &*MIB;
1466480093f4SDimitry Andric }
1467480093f4SDimitry Andric 
1468480093f4SDimitry Andric void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1469480093f4SDimitry Andric   auto RemovePredicate = [](MachineInstr *MI) {
1470480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1471480093f4SDimitry Andric     if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) {
1472480093f4SDimitry Andric       assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1473480093f4SDimitry Andric              "Expected Then predicate!");
1474480093f4SDimitry Andric       MI->getOperand(PIdx).setImm(ARMVCC::None);
1475480093f4SDimitry Andric       MI->getOperand(PIdx+1).setReg(0);
1476480093f4SDimitry Andric     } else
1477480093f4SDimitry Andric       llvm_unreachable("trying to unpredicate a non-predicated instruction");
14780b57cec5SDimitry Andric   };
14790b57cec5SDimitry Andric 
1480480093f4SDimitry Andric   for (auto &Block : LoLoop.getVPTBlocks()) {
1481*e8d8bef9SDimitry Andric     SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1482*e8d8bef9SDimitry Andric 
1483*e8d8bef9SDimitry Andric     auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1484*e8d8bef9SDimitry Andric       assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1485*e8d8bef9SDimitry Andric       // Replace the VCMP with a VPT
1486*e8d8bef9SDimitry Andric       MachineInstrBuilder MIB =
1487*e8d8bef9SDimitry Andric           BuildMI(*At->getParent(), At, At->getDebugLoc(),
1488*e8d8bef9SDimitry Andric                   TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1489*e8d8bef9SDimitry Andric       MIB.addImm(ARMVCC::Then);
1490*e8d8bef9SDimitry Andric       // Register one
1491*e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(1));
1492*e8d8bef9SDimitry Andric       // Register two
1493*e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(2));
1494*e8d8bef9SDimitry Andric       // The comparison code, e.g. ge, eq, lt
1495*e8d8bef9SDimitry Andric       MIB.add(TheVCMP->getOperand(3));
1496*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1497*e8d8bef9SDimitry Andric       LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1498*e8d8bef9SDimitry Andric       LoLoop.ToRemove.insert(TheVCMP);
1499*e8d8bef9SDimitry Andric       TheVCMP = nullptr;
1500*e8d8bef9SDimitry Andric     };
1501*e8d8bef9SDimitry Andric 
1502*e8d8bef9SDimitry Andric     if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1503*e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1504*e8d8bef9SDimitry Andric       if (VPTState::hasUniformPredicate(Block)) {
1505*e8d8bef9SDimitry Andric         // A vpt block starting with VPST, is only predicated upon vctp and has no
1506*e8d8bef9SDimitry Andric         // internal vpr defs:
1507*e8d8bef9SDimitry Andric         // - Remove vpst.
1508*e8d8bef9SDimitry Andric         // - Unpredicate the remaining instructions.
1509*e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1510*e8d8bef9SDimitry Andric         for (unsigned i = 1; i < Insts.size(); ++i)
1511*e8d8bef9SDimitry Andric           RemovePredicate(Insts[i]);
1512*e8d8bef9SDimitry Andric       } else {
15135ffd83dbSDimitry Andric         // The VPT block has a non-uniform predicate but it uses a vpst and its
15145ffd83dbSDimitry Andric         // entry is guarded only by a vctp, which means we:
1515480093f4SDimitry Andric         // - Need to remove the original vpst.
1516480093f4SDimitry Andric         // - Then need to unpredicate any following instructions, until
1517480093f4SDimitry Andric         //   we come across the divergent vpr def.
1518480093f4SDimitry Andric         // - Insert a new vpst to predicate the instruction(s) that following
1519480093f4SDimitry Andric         //   the divergent vpr def.
1520*e8d8bef9SDimitry Andric         MachineInstr *Divergent = VPTState::getDivergent(Block);
1521*e8d8bef9SDimitry Andric         auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
1522*e8d8bef9SDimitry Andric         bool DivergentNextIsPredicated =
1523*e8d8bef9SDimitry Andric             getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
1524*e8d8bef9SDimitry Andric 
1525*e8d8bef9SDimitry Andric         for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
1526*e8d8bef9SDimitry Andric              I != E; ++I)
1527480093f4SDimitry Andric           RemovePredicate(&*I);
1528480093f4SDimitry Andric 
1529*e8d8bef9SDimitry Andric         // Check if the instruction defining vpr is a vcmp so it can be combined
1530*e8d8bef9SDimitry Andric         // with the VPST This should be the divergent instruction
1531*e8d8bef9SDimitry Andric         MachineInstr *VCMP =
1532*e8d8bef9SDimitry Andric             VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1533*e8d8bef9SDimitry Andric 
1534*e8d8bef9SDimitry Andric         if (DivergentNextIsPredicated) {
1535*e8d8bef9SDimitry Andric           // Insert a VPST at the divergent only if the next instruction
1536*e8d8bef9SDimitry Andric           // would actually use it. A VCMP following a VPST can be
1537*e8d8bef9SDimitry Andric           // merged into a VPT so do that instead if the VCMP exists.
1538*e8d8bef9SDimitry Andric           if (!VCMP) {
1539*e8d8bef9SDimitry Andric             // Create a VPST (with a null mask for now, we'll recompute it
1540*e8d8bef9SDimitry Andric             // later)
1541*e8d8bef9SDimitry Andric             MachineInstrBuilder MIB =
1542*e8d8bef9SDimitry Andric                 BuildMI(*Divergent->getParent(), Divergent,
1543*e8d8bef9SDimitry Andric                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
15445ffd83dbSDimitry Andric             MIB.addImm(0);
1545480093f4SDimitry Andric             LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
15465ffd83dbSDimitry Andric             LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1547*e8d8bef9SDimitry Andric           } else {
1548*e8d8bef9SDimitry Andric             // No RDA checks are necessary here since the VPST would have been
1549*e8d8bef9SDimitry Andric             // directly after the VCMP
1550*e8d8bef9SDimitry Andric             ReplaceVCMPWithVPT(VCMP, VCMP);
15515ffd83dbSDimitry Andric           }
15525ffd83dbSDimitry Andric         }
15535ffd83dbSDimitry Andric       }
1554*e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1555*e8d8bef9SDimitry Andric       LoLoop.ToRemove.insert(VPST);
1556*e8d8bef9SDimitry Andric     } else if (Block.containsVCTP()) {
1557*e8d8bef9SDimitry Andric       // The vctp will be removed, so either the entire block will be dead or
1558*e8d8bef9SDimitry Andric       // the block mask of the vp(s)t will need to be recomputed.
1559*e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1560*e8d8bef9SDimitry Andric       if (Block.size() == 2) {
1561*e8d8bef9SDimitry Andric         assert(VPST->getOpcode() == ARM::MVE_VPST &&
1562*e8d8bef9SDimitry Andric                "Found a VPST in an otherwise empty vpt block");
1563*e8d8bef9SDimitry Andric         LoLoop.ToRemove.insert(VPST);
1564*e8d8bef9SDimitry Andric       } else
1565*e8d8bef9SDimitry Andric         LoLoop.BlockMasksToRecompute.insert(VPST);
1566*e8d8bef9SDimitry Andric     } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1567*e8d8bef9SDimitry Andric       // If this block starts with a VPST then attempt to merge it with the
1568*e8d8bef9SDimitry Andric       // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1569*e8d8bef9SDimitry Andric       // block that no longer exists
1570*e8d8bef9SDimitry Andric       MachineInstr *VPST = Insts.front();
1571*e8d8bef9SDimitry Andric       auto Next = ++MachineBasicBlock::iterator(VPST);
1572*e8d8bef9SDimitry Andric       assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1573*e8d8bef9SDimitry Andric              "The instruction after a VPST must be predicated");
1574*e8d8bef9SDimitry Andric       (void)Next;
1575*e8d8bef9SDimitry Andric       MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1576*e8d8bef9SDimitry Andric       if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1577*e8d8bef9SDimitry Andric           !LoLoop.ToRemove.contains(VprDef)) {
1578*e8d8bef9SDimitry Andric         MachineInstr *VCMP = VprDef;
1579*e8d8bef9SDimitry Andric         // The VCMP and VPST can only be merged if the VCMP's operands will have
1580*e8d8bef9SDimitry Andric         // the same values at the VPST.
1581*e8d8bef9SDimitry Andric         // If any of the instructions between the VCMP and VPST are predicated
1582*e8d8bef9SDimitry Andric         // then a different code path is expected to have merged the VCMP and
1583*e8d8bef9SDimitry Andric         // VPST already.
1584*e8d8bef9SDimitry Andric         if (!std::any_of(++MachineBasicBlock::iterator(VCMP),
1585*e8d8bef9SDimitry Andric                          MachineBasicBlock::iterator(VPST), hasVPRUse) &&
1586*e8d8bef9SDimitry Andric             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
1587*e8d8bef9SDimitry Andric             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1588*e8d8bef9SDimitry Andric           ReplaceVCMPWithVPT(VCMP, VPST);
1589*e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1590*e8d8bef9SDimitry Andric           LoLoop.ToRemove.insert(VPST);
1591480093f4SDimitry Andric         }
1592480093f4SDimitry Andric       }
15935ffd83dbSDimitry Andric     }
15945ffd83dbSDimitry Andric   }
1595*e8d8bef9SDimitry Andric 
1596*e8d8bef9SDimitry Andric   LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
1597480093f4SDimitry Andric }
1598480093f4SDimitry Andric 
1599480093f4SDimitry Andric void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1600480093f4SDimitry Andric 
16010b57cec5SDimitry Andric   // Combine the LoopDec and LoopEnd instructions into LE(TP).
1602480093f4SDimitry Andric   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1603480093f4SDimitry Andric     MachineInstr *End = LoLoop.End;
16040b57cec5SDimitry Andric     MachineBasicBlock *MBB = End->getParent();
1605480093f4SDimitry Andric     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1606480093f4SDimitry Andric       ARM::MVE_LETP : ARM::t2LEUpdate;
16070b57cec5SDimitry Andric     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1608480093f4SDimitry Andric                                       TII->get(Opc));
16090b57cec5SDimitry Andric     MIB.addDef(ARM::LR);
1610*e8d8bef9SDimitry Andric     unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
1611*e8d8bef9SDimitry Andric     MIB.add(End->getOperand(Off + 0));
1612*e8d8bef9SDimitry Andric     MIB.add(End->getOperand(Off + 1));
16130b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
16145ffd83dbSDimitry Andric     LoLoop.ToRemove.insert(LoLoop.Dec);
16155ffd83dbSDimitry Andric     LoLoop.ToRemove.insert(End);
16160b57cec5SDimitry Andric     return &*MIB;
16170b57cec5SDimitry Andric   };
16180b57cec5SDimitry Andric 
16190b57cec5SDimitry Andric   // TODO: We should be able to automatically remove these branches before we
16200b57cec5SDimitry Andric   // get here - probably by teaching analyzeBranch about the pseudo
16210b57cec5SDimitry Andric   // instructions.
16220b57cec5SDimitry Andric   // If there is an unconditional branch, after I, that just branches to the
16230b57cec5SDimitry Andric   // next block, remove it.
16240b57cec5SDimitry Andric   auto RemoveDeadBranch = [](MachineInstr *I) {
16250b57cec5SDimitry Andric     MachineBasicBlock *BB = I->getParent();
16260b57cec5SDimitry Andric     MachineInstr *Terminator = &BB->instr_back();
16270b57cec5SDimitry Andric     if (Terminator->isUnconditionalBranch() && I != Terminator) {
16280b57cec5SDimitry Andric       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
16290b57cec5SDimitry Andric       if (BB->isLayoutSuccessor(Succ)) {
16300b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
16310b57cec5SDimitry Andric         Terminator->eraseFromParent();
16320b57cec5SDimitry Andric       }
16330b57cec5SDimitry Andric     }
16340b57cec5SDimitry Andric   };
16350b57cec5SDimitry Andric 
1636480093f4SDimitry Andric   if (LoLoop.Revert) {
1637480093f4SDimitry Andric     if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
1638480093f4SDimitry Andric       RevertWhile(LoLoop.Start);
16390b57cec5SDimitry Andric     else
1640*e8d8bef9SDimitry Andric       RevertDo(LoLoop.Start);
1641*e8d8bef9SDimitry Andric     if (LoLoop.Dec == LoLoop.End)
1642*e8d8bef9SDimitry Andric       RevertLoopEndDec(LoLoop.End);
1643*e8d8bef9SDimitry Andric     else
1644*e8d8bef9SDimitry Andric       RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
16450b57cec5SDimitry Andric   } else {
1646480093f4SDimitry Andric     LoLoop.Start = ExpandLoopStart(LoLoop);
1647480093f4SDimitry Andric     RemoveDeadBranch(LoLoop.Start);
1648480093f4SDimitry Andric     LoLoop.End = ExpandLoopEnd(LoLoop);
1649480093f4SDimitry Andric     RemoveDeadBranch(LoLoop.End);
1650*e8d8bef9SDimitry Andric     if (LoLoop.IsTailPredicationLegal())
1651480093f4SDimitry Andric       ConvertVPTBlocks(LoLoop);
16525ffd83dbSDimitry Andric     for (auto *I : LoLoop.ToRemove) {
16535ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
16545ffd83dbSDimitry Andric       I->eraseFromParent();
16555ffd83dbSDimitry Andric     }
16565ffd83dbSDimitry Andric     for (auto *I : LoLoop.BlockMasksToRecompute) {
16575ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
16585ffd83dbSDimitry Andric       recomputeVPTBlockMask(*I);
16595ffd83dbSDimitry Andric       LLVM_DEBUG(dbgs() << "           ... done: " << *I);
1660480093f4SDimitry Andric     }
16610b57cec5SDimitry Andric   }
16625ffd83dbSDimitry Andric 
16635ffd83dbSDimitry Andric   PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
16645ffd83dbSDimitry Andric   DFS.ProcessLoop();
16655ffd83dbSDimitry Andric   const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
16665ffd83dbSDimitry Andric   for (auto *MBB : PostOrder) {
16675ffd83dbSDimitry Andric     recomputeLiveIns(*MBB);
16685ffd83dbSDimitry Andric     // FIXME: For some reason, the live-in print order is non-deterministic for
16695ffd83dbSDimitry Andric     // our tests and I can't out why... So just sort them.
16705ffd83dbSDimitry Andric     MBB->sortUniqueLiveIns();
16715ffd83dbSDimitry Andric   }
16725ffd83dbSDimitry Andric 
16735ffd83dbSDimitry Andric   for (auto *MBB : reverse(PostOrder))
16745ffd83dbSDimitry Andric     recomputeLivenessFlags(*MBB);
16755ffd83dbSDimitry Andric 
16765ffd83dbSDimitry Andric   // We've moved, removed and inserted new instructions, so update RDA.
16775ffd83dbSDimitry Andric   RDA->reset();
16780b57cec5SDimitry Andric }
16790b57cec5SDimitry Andric 
16808bcb0991SDimitry Andric bool ARMLowOverheadLoops::RevertNonLoops() {
16818bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
16828bcb0991SDimitry Andric   bool Changed = false;
16838bcb0991SDimitry Andric 
16848bcb0991SDimitry Andric   for (auto &MBB : *MF) {
16858bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Starts;
16868bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Decs;
16878bcb0991SDimitry Andric     SmallVector<MachineInstr*, 4> Ends;
1688*e8d8bef9SDimitry Andric     SmallVector<MachineInstr *, 4> EndDecs;
16898bcb0991SDimitry Andric 
16908bcb0991SDimitry Andric     for (auto &I : MBB) {
1691480093f4SDimitry Andric       if (isLoopStart(I))
16928bcb0991SDimitry Andric         Starts.push_back(&I);
16938bcb0991SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopDec)
16948bcb0991SDimitry Andric         Decs.push_back(&I);
16958bcb0991SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopEnd)
16968bcb0991SDimitry Andric         Ends.push_back(&I);
1697*e8d8bef9SDimitry Andric       else if (I.getOpcode() == ARM::t2LoopEndDec)
1698*e8d8bef9SDimitry Andric         EndDecs.push_back(&I);
16998bcb0991SDimitry Andric     }
17008bcb0991SDimitry Andric 
1701*e8d8bef9SDimitry Andric     if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty())
17028bcb0991SDimitry Andric       continue;
17038bcb0991SDimitry Andric 
17048bcb0991SDimitry Andric     Changed = true;
17058bcb0991SDimitry Andric 
17068bcb0991SDimitry Andric     for (auto *Start : Starts) {
17078bcb0991SDimitry Andric       if (Start->getOpcode() == ARM::t2WhileLoopStart)
17088bcb0991SDimitry Andric         RevertWhile(Start);
17098bcb0991SDimitry Andric       else
1710*e8d8bef9SDimitry Andric         RevertDo(Start);
17118bcb0991SDimitry Andric     }
17128bcb0991SDimitry Andric     for (auto *Dec : Decs)
17138bcb0991SDimitry Andric       RevertLoopDec(Dec);
17148bcb0991SDimitry Andric 
17158bcb0991SDimitry Andric     for (auto *End : Ends)
17168bcb0991SDimitry Andric       RevertLoopEnd(End);
1717*e8d8bef9SDimitry Andric     for (auto *End : EndDecs)
1718*e8d8bef9SDimitry Andric       RevertLoopEndDec(End);
17198bcb0991SDimitry Andric   }
17208bcb0991SDimitry Andric   return Changed;
17218bcb0991SDimitry Andric }
17228bcb0991SDimitry Andric 
17230b57cec5SDimitry Andric FunctionPass *llvm::createARMLowOverheadLoopsPass() {
17240b57cec5SDimitry Andric   return new ARMLowOverheadLoops();
17250b57cec5SDimitry Andric }
1726