10b57cec5SDimitry Andric //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
100b57cec5SDimitry Andric /// instructions into machine operations.
110b57cec5SDimitry Andric /// The expectation is that the loop contains three pseudo instructions:
120b57cec5SDimitry Andric /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
130b57cec5SDimitry Andric /// form should be in the preheader, whereas the while form should be in the
148bcb0991SDimitry Andric /// preheaders only predecessor.
150b57cec5SDimitry Andric /// - t2LoopDec - placed within in the loop body.
160b57cec5SDimitry Andric /// - t2LoopEnd - the loop latch terminator.
170b57cec5SDimitry Andric ///
18480093f4SDimitry Andric /// In addition to this, we also look for the presence of the VCTP instruction,
19480093f4SDimitry Andric /// which determines whether we can generated the tail-predicated low-overhead
20480093f4SDimitry Andric /// loop form.
21480093f4SDimitry Andric ///
22480093f4SDimitry Andric /// Assumptions and Dependencies:
23480093f4SDimitry Andric /// Low-overhead loops are constructed and executed using a setup instruction:
24480093f4SDimitry Andric /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25480093f4SDimitry Andric /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26480093f4SDimitry Andric /// but fixed polarity: WLS can only branch forwards and LE can only branch
27480093f4SDimitry Andric /// backwards. These restrictions mean that this pass is dependent upon block
28480093f4SDimitry Andric /// layout and block sizes, which is why it's the last pass to run. The same is
29480093f4SDimitry Andric /// true for ConstantIslands, but this pass does not increase the size of the
30480093f4SDimitry Andric /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31480093f4SDimitry Andric /// during the transform and pseudo instructions are replaced by real ones. In
32480093f4SDimitry Andric /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33480093f4SDimitry Andric /// multiple instructions for a single pseudo (see RevertWhile and
34fe6060f1SDimitry Andric /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35480093f4SDimitry Andric /// are defined to be as large as this maximum sequence of replacement
36480093f4SDimitry Andric /// instructions.
37480093f4SDimitry Andric ///
385ffd83dbSDimitry Andric /// A note on VPR.P0 (the lane mask):
395ffd83dbSDimitry Andric /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
405ffd83dbSDimitry Andric /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
415ffd83dbSDimitry Andric /// They will simply "and" the result of their calculation with the current
425ffd83dbSDimitry Andric /// value of VPR.P0. You can think of it like this:
435ffd83dbSDimitry Andric /// \verbatim
445ffd83dbSDimitry Andric /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
455ffd83dbSDimitry Andric /// VPR.P0 &= Value
465ffd83dbSDimitry Andric /// else
475ffd83dbSDimitry Andric /// VPR.P0 = Value
485ffd83dbSDimitry Andric /// \endverbatim
495ffd83dbSDimitry Andric /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
505ffd83dbSDimitry Andric /// fall in the "VPT active" case, so we can consider that all VPR writes by
515ffd83dbSDimitry Andric /// one of those instruction is actually a "and".
520b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
530b57cec5SDimitry Andric
540b57cec5SDimitry Andric #include "ARM.h"
550b57cec5SDimitry Andric #include "ARMBaseInstrInfo.h"
560b57cec5SDimitry Andric #include "ARMBaseRegisterInfo.h"
570b57cec5SDimitry Andric #include "ARMBasicBlockInfo.h"
580b57cec5SDimitry Andric #include "ARMSubtarget.h"
59e8d8bef9SDimitry Andric #include "MVETailPredUtils.h"
60480093f4SDimitry Andric #include "Thumb2InstrInfo.h"
61480093f4SDimitry Andric #include "llvm/ADT/SetOperations.h"
6281ad6265SDimitry Andric #include "llvm/ADT/SetVector.h"
635ffd83dbSDimitry Andric #include "llvm/CodeGen/LivePhysRegs.h"
6481ad6265SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h"
650b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
660b57cec5SDimitry Andric #include "llvm/CodeGen/MachineLoopInfo.h"
67480093f4SDimitry Andric #include "llvm/CodeGen/MachineLoopUtils.h"
680b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
69480093f4SDimitry Andric #include "llvm/CodeGen/Passes.h"
70480093f4SDimitry Andric #include "llvm/CodeGen/ReachingDefAnalysis.h"
71480093f4SDimitry Andric #include "llvm/MC/MCInstrDesc.h"
720b57cec5SDimitry Andric
730b57cec5SDimitry Andric using namespace llvm;
740b57cec5SDimitry Andric
750b57cec5SDimitry Andric #define DEBUG_TYPE "arm-low-overhead-loops"
760b57cec5SDimitry Andric #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
770b57cec5SDimitry Andric
78e8d8bef9SDimitry Andric static cl::opt<bool>
79e8d8bef9SDimitry Andric DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
80e8d8bef9SDimitry Andric cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
81e8d8bef9SDimitry Andric cl::init(false));
82e8d8bef9SDimitry Andric
83bdd1243dSDimitry Andric static cl::opt<bool>
84bdd1243dSDimitry Andric DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
85bdd1243dSDimitry Andric cl::desc("Disable omitting 'dls lr, lr' instructions"),
86bdd1243dSDimitry Andric cl::init(false));
87bdd1243dSDimitry Andric
isVectorPredicated(MachineInstr * MI)88e8d8bef9SDimitry Andric static bool isVectorPredicated(MachineInstr *MI) {
89e8d8bef9SDimitry Andric int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
90e8d8bef9SDimitry Andric return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
91e8d8bef9SDimitry Andric }
92e8d8bef9SDimitry Andric
isVectorPredicate(MachineInstr * MI)93e8d8bef9SDimitry Andric static bool isVectorPredicate(MachineInstr *MI) {
94*0fca6ea1SDimitry Andric return MI->findRegisterDefOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
95e8d8bef9SDimitry Andric }
96e8d8bef9SDimitry Andric
hasVPRUse(MachineInstr & MI)97e8d8bef9SDimitry Andric static bool hasVPRUse(MachineInstr &MI) {
98*0fca6ea1SDimitry Andric return MI.findRegisterUseOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;
99e8d8bef9SDimitry Andric }
100e8d8bef9SDimitry Andric
isDomainMVE(MachineInstr * MI)101e8d8bef9SDimitry Andric static bool isDomainMVE(MachineInstr *MI) {
102e8d8bef9SDimitry Andric uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
103e8d8bef9SDimitry Andric return Domain == ARMII::DomainMVE;
104e8d8bef9SDimitry Andric }
105e8d8bef9SDimitry Andric
getVecSize(const MachineInstr & MI)106349cc55cSDimitry Andric static int getVecSize(const MachineInstr &MI) {
107349cc55cSDimitry Andric const MCInstrDesc &MCID = MI.getDesc();
108349cc55cSDimitry Andric uint64_t Flags = MCID.TSFlags;
109349cc55cSDimitry Andric return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
110349cc55cSDimitry Andric }
111349cc55cSDimitry Andric
shouldInspect(MachineInstr & MI)112e8d8bef9SDimitry Andric static bool shouldInspect(MachineInstr &MI) {
113349cc55cSDimitry Andric if (MI.isDebugInstr())
114349cc55cSDimitry Andric return false;
115e8d8bef9SDimitry Andric return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
116e8d8bef9SDimitry Andric }
117e8d8bef9SDimitry Andric
isHorizontalReduction(const MachineInstr & MI)118*0fca6ea1SDimitry Andric static bool isHorizontalReduction(const MachineInstr &MI) {
119*0fca6ea1SDimitry Andric const MCInstrDesc &MCID = MI.getDesc();
120*0fca6ea1SDimitry Andric uint64_t Flags = MCID.TSFlags;
121*0fca6ea1SDimitry Andric return (Flags & ARMII::HorizontalReduction) != 0;
122*0fca6ea1SDimitry Andric }
123*0fca6ea1SDimitry Andric
1240b57cec5SDimitry Andric namespace {
1250b57cec5SDimitry Andric
1265ffd83dbSDimitry Andric using InstSet = SmallPtrSetImpl<MachineInstr *>;
1275ffd83dbSDimitry Andric
1285ffd83dbSDimitry Andric class PostOrderLoopTraversal {
1295ffd83dbSDimitry Andric MachineLoop &ML;
1305ffd83dbSDimitry Andric MachineLoopInfo &MLI;
1315ffd83dbSDimitry Andric SmallPtrSet<MachineBasicBlock*, 4> Visited;
1325ffd83dbSDimitry Andric SmallVector<MachineBasicBlock*, 4> Order;
1335ffd83dbSDimitry Andric
1345ffd83dbSDimitry Andric public:
PostOrderLoopTraversal(MachineLoop & ML,MachineLoopInfo & MLI)1355ffd83dbSDimitry Andric PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
1365ffd83dbSDimitry Andric : ML(ML), MLI(MLI) { }
1375ffd83dbSDimitry Andric
getOrder() const1385ffd83dbSDimitry Andric const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
1395ffd83dbSDimitry Andric return Order;
1405ffd83dbSDimitry Andric }
1415ffd83dbSDimitry Andric
1425ffd83dbSDimitry Andric // Visit all the blocks within the loop, as well as exit blocks and any
1435ffd83dbSDimitry Andric // blocks properly dominating the header.
ProcessLoop()1445ffd83dbSDimitry Andric void ProcessLoop() {
1455ffd83dbSDimitry Andric std::function<void(MachineBasicBlock*)> Search = [this, &Search]
1465ffd83dbSDimitry Andric (MachineBasicBlock *MBB) -> void {
1475ffd83dbSDimitry Andric if (Visited.count(MBB))
1485ffd83dbSDimitry Andric return;
1495ffd83dbSDimitry Andric
1505ffd83dbSDimitry Andric Visited.insert(MBB);
1515ffd83dbSDimitry Andric for (auto *Succ : MBB->successors()) {
1525ffd83dbSDimitry Andric if (!ML.contains(Succ))
1535ffd83dbSDimitry Andric continue;
1545ffd83dbSDimitry Andric Search(Succ);
1555ffd83dbSDimitry Andric }
1565ffd83dbSDimitry Andric Order.push_back(MBB);
1575ffd83dbSDimitry Andric };
1585ffd83dbSDimitry Andric
1595ffd83dbSDimitry Andric // Insert exit blocks.
1605ffd83dbSDimitry Andric SmallVector<MachineBasicBlock*, 2> ExitBlocks;
1615ffd83dbSDimitry Andric ML.getExitBlocks(ExitBlocks);
162e8d8bef9SDimitry Andric append_range(Order, ExitBlocks);
1635ffd83dbSDimitry Andric
1645ffd83dbSDimitry Andric // Then add the loop body.
1655ffd83dbSDimitry Andric Search(ML.getHeader());
1665ffd83dbSDimitry Andric
1675ffd83dbSDimitry Andric // Then try the preheader and its predecessors.
1685ffd83dbSDimitry Andric std::function<void(MachineBasicBlock*)> GetPredecessor =
1695ffd83dbSDimitry Andric [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
1705ffd83dbSDimitry Andric Order.push_back(MBB);
1715ffd83dbSDimitry Andric if (MBB->pred_size() == 1)
1725ffd83dbSDimitry Andric GetPredecessor(*MBB->pred_begin());
1735ffd83dbSDimitry Andric };
1745ffd83dbSDimitry Andric
1755ffd83dbSDimitry Andric if (auto *Preheader = ML.getLoopPreheader())
1765ffd83dbSDimitry Andric GetPredecessor(Preheader);
177fe6060f1SDimitry Andric else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
1785ffd83dbSDimitry Andric GetPredecessor(Preheader);
1795ffd83dbSDimitry Andric }
1805ffd83dbSDimitry Andric };
1815ffd83dbSDimitry Andric
182*0fca6ea1SDimitry Andric class VPTBlock {
183*0fca6ea1SDimitry Andric SmallVector<MachineInstr *, 4> Insts;
184480093f4SDimitry Andric
185480093f4SDimitry Andric public:
VPTBlock(MachineInstr * MI)186*0fca6ea1SDimitry Andric VPTBlock(MachineInstr *MI) { Insts.push_back(MI); }
187*0fca6ea1SDimitry Andric
188*0fca6ea1SDimitry Andric // Have we found an instruction within the block which defines the vpr? If
189*0fca6ea1SDimitry Andric // so, not all the instructions in the block will have the same predicate.
hasUniformPredicate()190*0fca6ea1SDimitry Andric bool hasUniformPredicate() { return getDivergent() == nullptr; }
191*0fca6ea1SDimitry Andric
192*0fca6ea1SDimitry Andric // If it exists, return the first internal instruction which modifies the
193*0fca6ea1SDimitry Andric // VPR.
getDivergent()194*0fca6ea1SDimitry Andric MachineInstr *getDivergent() {
195*0fca6ea1SDimitry Andric SmallVectorImpl<MachineInstr *> &Insts = getInsts();
196*0fca6ea1SDimitry Andric for (unsigned i = 1; i < Insts.size(); ++i) {
197*0fca6ea1SDimitry Andric MachineInstr *Next = Insts[i];
198*0fca6ea1SDimitry Andric if (isVectorPredicate(Next))
199*0fca6ea1SDimitry Andric return Next; // Found an instruction altering the vpr.
200480093f4SDimitry Andric }
201*0fca6ea1SDimitry Andric return nullptr;
202*0fca6ea1SDimitry Andric }
203*0fca6ea1SDimitry Andric
insert(MachineInstr * MI)204*0fca6ea1SDimitry Andric void insert(MachineInstr *MI) {
205*0fca6ea1SDimitry Andric Insts.push_back(MI);
206*0fca6ea1SDimitry Andric // VPT/VPST + 4 predicated instructions.
207*0fca6ea1SDimitry Andric assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
208*0fca6ea1SDimitry Andric }
209*0fca6ea1SDimitry Andric
containsVCTP() const210*0fca6ea1SDimitry Andric bool containsVCTP() const { return llvm::any_of(Insts, isVCTP); }
211*0fca6ea1SDimitry Andric
size() const212*0fca6ea1SDimitry Andric unsigned size() const { return Insts.size(); }
getInsts()213*0fca6ea1SDimitry Andric SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
214480093f4SDimitry Andric };
215480093f4SDimitry Andric
216e8d8bef9SDimitry Andric // Represent the current state of the VPR and hold all instances which
217e8d8bef9SDimitry Andric // represent a VPT block, which is a list of instructions that begins with a
218e8d8bef9SDimitry Andric // VPT/VPST and has a maximum of four proceeding instructions. All
219e8d8bef9SDimitry Andric // instructions within the block are predicated upon the vpr and we allow
220e8d8bef9SDimitry Andric // instructions to define the vpr within in the block too.
221e8d8bef9SDimitry Andric class VPTState {
222e8d8bef9SDimitry Andric friend struct LowOverheadLoop;
223e8d8bef9SDimitry Andric
224*0fca6ea1SDimitry Andric SmallVector<VPTBlock, 4> Blocks;
225*0fca6ea1SDimitry Andric SetVector<MachineInstr *> CurrentPredicates;
226*0fca6ea1SDimitry Andric std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;
227e8d8bef9SDimitry Andric
CreateVPTBlock(MachineInstr * MI)228*0fca6ea1SDimitry Andric void CreateVPTBlock(MachineInstr *MI) {
229e8d8bef9SDimitry Andric assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
230e8d8bef9SDimitry Andric && "Can't begin VPT without predicate");
231e8d8bef9SDimitry Andric Blocks.emplace_back(MI);
232e8d8bef9SDimitry Andric // The execution of MI is predicated upon the current set of instructions
233e8d8bef9SDimitry Andric // that are AND'ed together to form the VPR predicate value. In the case
234e8d8bef9SDimitry Andric // that MI is a VPT, CurrentPredicates will also just be MI.
235*0fca6ea1SDimitry Andric PredicatedInsts[MI] = CurrentPredicates;
236e8d8bef9SDimitry Andric }
237e8d8bef9SDimitry Andric
addInst(MachineInstr * MI)238*0fca6ea1SDimitry Andric void addInst(MachineInstr *MI) {
239e8d8bef9SDimitry Andric Blocks.back().insert(MI);
240*0fca6ea1SDimitry Andric PredicatedInsts[MI] = CurrentPredicates;
241e8d8bef9SDimitry Andric }
242e8d8bef9SDimitry Andric
addPredicate(MachineInstr * MI)243*0fca6ea1SDimitry Andric void addPredicate(MachineInstr *MI) {
244e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
245e8d8bef9SDimitry Andric CurrentPredicates.insert(MI);
246e8d8bef9SDimitry Andric }
247e8d8bef9SDimitry Andric
resetPredicate(MachineInstr * MI)248*0fca6ea1SDimitry Andric void resetPredicate(MachineInstr *MI) {
249e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
250e8d8bef9SDimitry Andric CurrentPredicates.clear();
251e8d8bef9SDimitry Andric CurrentPredicates.insert(MI);
252e8d8bef9SDimitry Andric }
253480093f4SDimitry Andric
254480093f4SDimitry Andric public:
255e8d8bef9SDimitry Andric // Return whether the given instruction is predicated upon a VCTP.
isPredicatedOnVCTP(MachineInstr * MI,bool Exclusive=false)256*0fca6ea1SDimitry Andric bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
257*0fca6ea1SDimitry Andric SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
258e8d8bef9SDimitry Andric if (Exclusive && Predicates.size() != 1)
259e8d8bef9SDimitry Andric return false;
260*0fca6ea1SDimitry Andric // We do not know how to convert an else predicate of a VCTP.
261*0fca6ea1SDimitry Andric if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
262*0fca6ea1SDimitry Andric return false;
2630eae32dcSDimitry Andric return llvm::any_of(Predicates, isVCTP);
264480093f4SDimitry Andric }
265480093f4SDimitry Andric
266e8d8bef9SDimitry Andric // Is the VPST, controlling the block entry, predicated upon a VCTP.
isEntryPredicatedOnVCTP(VPTBlock & Block,bool Exclusive=false)267*0fca6ea1SDimitry Andric bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
268e8d8bef9SDimitry Andric SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
269e8d8bef9SDimitry Andric return isPredicatedOnVCTP(Insts.front(), Exclusive);
270e8d8bef9SDimitry Andric }
271e8d8bef9SDimitry Andric
272e8d8bef9SDimitry Andric // If this block begins with a VPT, we can check whether it's using
273e8d8bef9SDimitry Andric // at least one predicated input(s), as well as possible loop invariant
274e8d8bef9SDimitry Andric // which would result in it being implicitly predicated.
hasImplicitlyValidVPT(VPTBlock & Block,ReachingDefAnalysis & RDA)275*0fca6ea1SDimitry Andric bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {
276e8d8bef9SDimitry Andric SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
277e8d8bef9SDimitry Andric MachineInstr *VPT = Insts.front();
278e8d8bef9SDimitry Andric assert(isVPTOpcode(VPT->getOpcode()) &&
279e8d8bef9SDimitry Andric "Expected VPT block to begin with VPT/VPST");
280e8d8bef9SDimitry Andric
281e8d8bef9SDimitry Andric if (VPT->getOpcode() == ARM::MVE_VPST)
282e8d8bef9SDimitry Andric return false;
283e8d8bef9SDimitry Andric
284*0fca6ea1SDimitry Andric // If the VPT block does not define something that is an "output", then
285*0fca6ea1SDimitry Andric // the tail-predicated version will just perform a subset of the original
286*0fca6ea1SDimitry Andric // vpt block, where the last lanes should not be used.
287*0fca6ea1SDimitry Andric if (isVPTOpcode(VPT->getOpcode()) &&
288*0fca6ea1SDimitry Andric all_of(Block.getInsts(), [](const MachineInstr *MI) {
289*0fca6ea1SDimitry Andric return !MI->mayStore() && !MI->mayLoad() &&
290*0fca6ea1SDimitry Andric !isHorizontalReduction(*MI) && !isVCTP(MI);
291*0fca6ea1SDimitry Andric }))
292*0fca6ea1SDimitry Andric return true;
293*0fca6ea1SDimitry Andric
294e8d8bef9SDimitry Andric auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
295e8d8bef9SDimitry Andric MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
296e8d8bef9SDimitry Andric return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
297e8d8bef9SDimitry Andric };
298e8d8bef9SDimitry Andric
299e8d8bef9SDimitry Andric auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
300e8d8bef9SDimitry Andric MachineOperand &MO = MI->getOperand(Idx);
301e8d8bef9SDimitry Andric if (!MO.isReg() || !MO.getReg())
302e8d8bef9SDimitry Andric return true;
303e8d8bef9SDimitry Andric
304e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr *, 2> Defs;
305e8d8bef9SDimitry Andric RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
306e8d8bef9SDimitry Andric if (Defs.empty())
307e8d8bef9SDimitry Andric return true;
308e8d8bef9SDimitry Andric
309e8d8bef9SDimitry Andric for (auto *Def : Defs)
310e8d8bef9SDimitry Andric if (Def->getParent() == VPT->getParent())
311e8d8bef9SDimitry Andric return false;
312e8d8bef9SDimitry Andric return true;
313e8d8bef9SDimitry Andric };
314e8d8bef9SDimitry Andric
315e8d8bef9SDimitry Andric // Check that at least one of the operands is directly predicated on a
316e8d8bef9SDimitry Andric // vctp and allow an invariant value too.
317e8d8bef9SDimitry Andric return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
318e8d8bef9SDimitry Andric (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
319e8d8bef9SDimitry Andric (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
320e8d8bef9SDimitry Andric }
321e8d8bef9SDimitry Andric
isValid(ReachingDefAnalysis & RDA)322*0fca6ea1SDimitry Andric bool isValid(ReachingDefAnalysis &RDA) {
323e8d8bef9SDimitry Andric // All predication within the loop should be based on vctp. If the block
324e8d8bef9SDimitry Andric // isn't predicated on entry, check whether the vctp is within the block
325e8d8bef9SDimitry Andric // and that all other instructions are then predicated on it.
326e8d8bef9SDimitry Andric for (auto &Block : Blocks) {
327*0fca6ea1SDimitry Andric if (isEntryPredicatedOnVCTP(Block, false) &&
328*0fca6ea1SDimitry Andric !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
329*0fca6ea1SDimitry Andric return getVPTInstrPredicate(*MI) == ARMVCC::Else;
330*0fca6ea1SDimitry Andric }))
331*0fca6ea1SDimitry Andric continue;
332*0fca6ea1SDimitry Andric if (hasImplicitlyValidVPT(Block, RDA))
333e8d8bef9SDimitry Andric continue;
334e8d8bef9SDimitry Andric
335e8d8bef9SDimitry Andric SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
336e8d8bef9SDimitry Andric // We don't know how to convert a block with just a VPT;VCTP into
337e8d8bef9SDimitry Andric // anything valid once we remove the VCTP. For now just bail out.
338e8d8bef9SDimitry Andric assert(isVPTOpcode(Insts.front()->getOpcode()) &&
339e8d8bef9SDimitry Andric "Expected VPT block to start with a VPST or VPT!");
340e8d8bef9SDimitry Andric if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
341e8d8bef9SDimitry Andric isVCTP(Insts.back()))
342e8d8bef9SDimitry Andric return false;
343e8d8bef9SDimitry Andric
344e8d8bef9SDimitry Andric for (auto *MI : Insts) {
345e8d8bef9SDimitry Andric // Check that any internal VCTPs are 'Then' predicated.
346e8d8bef9SDimitry Andric if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
347e8d8bef9SDimitry Andric return false;
348e8d8bef9SDimitry Andric // Skip other instructions that build up the predicate.
349e8d8bef9SDimitry Andric if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
350e8d8bef9SDimitry Andric continue;
351e8d8bef9SDimitry Andric // Check that any other instructions are predicated upon a vctp.
352e8d8bef9SDimitry Andric // TODO: We could infer when VPTs are implicitly predicated on the
353e8d8bef9SDimitry Andric // vctp (when the operands are predicated).
354e8d8bef9SDimitry Andric if (!isPredicatedOnVCTP(MI)) {
355e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
356e8d8bef9SDimitry Andric return false;
357e8d8bef9SDimitry Andric }
358e8d8bef9SDimitry Andric }
359e8d8bef9SDimitry Andric }
360e8d8bef9SDimitry Andric return true;
361e8d8bef9SDimitry Andric }
3625ffd83dbSDimitry Andric };
3635ffd83dbSDimitry Andric
364480093f4SDimitry Andric struct LowOverheadLoop {
365480093f4SDimitry Andric
3665ffd83dbSDimitry Andric MachineLoop &ML;
3675ffd83dbSDimitry Andric MachineBasicBlock *Preheader = nullptr;
3685ffd83dbSDimitry Andric MachineLoopInfo &MLI;
3695ffd83dbSDimitry Andric ReachingDefAnalysis &RDA;
3705ffd83dbSDimitry Andric const TargetRegisterInfo &TRI;
3715ffd83dbSDimitry Andric const ARMBaseInstrInfo &TII;
372480093f4SDimitry Andric MachineFunction *MF = nullptr;
373e8d8bef9SDimitry Andric MachineBasicBlock::iterator StartInsertPt;
374e8d8bef9SDimitry Andric MachineBasicBlock *StartInsertBB = nullptr;
375480093f4SDimitry Andric MachineInstr *Start = nullptr;
376480093f4SDimitry Andric MachineInstr *Dec = nullptr;
377480093f4SDimitry Andric MachineInstr *End = nullptr;
378e8d8bef9SDimitry Andric MachineOperand TPNumElements;
379e8d8bef9SDimitry Andric SmallVector<MachineInstr *, 4> VCTPs;
3805ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 4> ToRemove;
3815ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
382349cc55cSDimitry Andric SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
383349cc55cSDimitry Andric SmallPtrSet<MachineInstr *, 4> VMOVCopies;
384480093f4SDimitry Andric bool Revert = false;
385480093f4SDimitry Andric bool CannotTailPredicate = false;
386*0fca6ea1SDimitry Andric VPTState VPTstate;
387480093f4SDimitry Andric
LowOverheadLoop__anona50655620111::LowOverheadLoop3885ffd83dbSDimitry Andric LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
3895ffd83dbSDimitry Andric ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
3905ffd83dbSDimitry Andric const ARMBaseInstrInfo &TII)
391e8d8bef9SDimitry Andric : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
392e8d8bef9SDimitry Andric TPNumElements(MachineOperand::CreateImm(0)) {
3935ffd83dbSDimitry Andric MF = ML.getHeader()->getParent();
3945ffd83dbSDimitry Andric if (auto *MBB = ML.getLoopPreheader())
3955ffd83dbSDimitry Andric Preheader = MBB;
396fe6060f1SDimitry Andric else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
3975ffd83dbSDimitry Andric Preheader = MBB;
398480093f4SDimitry Andric }
399480093f4SDimitry Andric
400480093f4SDimitry Andric // If this is an MVE instruction, check that we know how to use tail
401480093f4SDimitry Andric // predication with it. Record VPT blocks and return whether the
402480093f4SDimitry Andric // instruction is valid for tail predication.
403480093f4SDimitry Andric bool ValidateMVEInst(MachineInstr *MI);
404480093f4SDimitry Andric
AnalyseMVEInst__anona50655620111::LowOverheadLoop405480093f4SDimitry Andric void AnalyseMVEInst(MachineInstr *MI) {
406480093f4SDimitry Andric CannotTailPredicate = !ValidateMVEInst(MI);
407480093f4SDimitry Andric }
408480093f4SDimitry Andric
IsTailPredicationLegal__anona50655620111::LowOverheadLoop409480093f4SDimitry Andric bool IsTailPredicationLegal() const {
410480093f4SDimitry Andric // For now, let's keep things really simple and only support a single
411480093f4SDimitry Andric // block for tail predication.
412e8d8bef9SDimitry Andric return !Revert && FoundAllComponents() && !VCTPs.empty() &&
4135ffd83dbSDimitry Andric !CannotTailPredicate && ML.getNumBlocks() == 1;
414480093f4SDimitry Andric }
415480093f4SDimitry Andric
416e8d8bef9SDimitry Andric // Given that MI is a VCTP, check that is equivalent to any other VCTPs
417e8d8bef9SDimitry Andric // found.
418e8d8bef9SDimitry Andric bool AddVCTP(MachineInstr *MI);
419e8d8bef9SDimitry Andric
4205ffd83dbSDimitry Andric // Check that the predication in the loop will be equivalent once we
4215ffd83dbSDimitry Andric // perform the conversion. Also ensure that we can provide the number
4225ffd83dbSDimitry Andric // of elements to the loop start instruction.
423e8d8bef9SDimitry Andric bool ValidateTailPredicate();
4245ffd83dbSDimitry Andric
4255ffd83dbSDimitry Andric // Check that any values available outside of the loop will be the same
4265ffd83dbSDimitry Andric // after tail predication conversion.
4275ffd83dbSDimitry Andric bool ValidateLiveOuts();
428480093f4SDimitry Andric
429480093f4SDimitry Andric // Check the branch targets are within range and we satisfy our
430480093f4SDimitry Andric // restrictions.
431e8d8bef9SDimitry Andric void Validate(ARMBasicBlockUtils *BBUtils);
432480093f4SDimitry Andric
FoundAllComponents__anona50655620111::LowOverheadLoop433480093f4SDimitry Andric bool FoundAllComponents() const {
434480093f4SDimitry Andric return Start && Dec && End;
435480093f4SDimitry Andric }
436480093f4SDimitry Andric
getVPTBlocks__anona50655620111::LowOverheadLoop437*0fca6ea1SDimitry Andric SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; }
438480093f4SDimitry Andric
439e8d8bef9SDimitry Andric // Return the operand for the loop start instruction. This will be the loop
440e8d8bef9SDimitry Andric // iteration count, or the number of elements if we're tail predicating.
getLoopStartOperand__anona50655620111::LowOverheadLoop441e8d8bef9SDimitry Andric MachineOperand &getLoopStartOperand() {
442e8d8bef9SDimitry Andric if (IsTailPredicationLegal())
443e8d8bef9SDimitry Andric return TPNumElements;
444fe6060f1SDimitry Andric return Start->getOperand(1);
445480093f4SDimitry Andric }
446480093f4SDimitry Andric
getStartOpcode__anona50655620111::LowOverheadLoop447480093f4SDimitry Andric unsigned getStartOpcode() const {
448fe6060f1SDimitry Andric bool IsDo = isDoLoopStart(*Start);
449480093f4SDimitry Andric if (!IsTailPredicationLegal())
450480093f4SDimitry Andric return IsDo ? ARM::t2DLS : ARM::t2WLS;
451480093f4SDimitry Andric
452e8d8bef9SDimitry Andric return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
453480093f4SDimitry Andric }
454480093f4SDimitry Andric
dump__anona50655620111::LowOverheadLoop455480093f4SDimitry Andric void dump() const {
456480093f4SDimitry Andric if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
457480093f4SDimitry Andric if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
458480093f4SDimitry Andric if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
459e8d8bef9SDimitry Andric if (!VCTPs.empty()) {
460e8d8bef9SDimitry Andric dbgs() << "ARM Loops: Found VCTP(s):\n";
461e8d8bef9SDimitry Andric for (auto *MI : VCTPs)
462e8d8bef9SDimitry Andric dbgs() << " - " << *MI;
463e8d8bef9SDimitry Andric }
464480093f4SDimitry Andric if (!FoundAllComponents())
465480093f4SDimitry Andric dbgs() << "ARM Loops: Not a low-overhead loop.\n";
466480093f4SDimitry Andric else if (!(Start && Dec && End))
467480093f4SDimitry Andric dbgs() << "ARM Loops: Failed to find all loop components.\n";
468480093f4SDimitry Andric }
469480093f4SDimitry Andric };
470480093f4SDimitry Andric
4710b57cec5SDimitry Andric class ARMLowOverheadLoops : public MachineFunctionPass {
4728bcb0991SDimitry Andric MachineFunction *MF = nullptr;
473480093f4SDimitry Andric MachineLoopInfo *MLI = nullptr;
474480093f4SDimitry Andric ReachingDefAnalysis *RDA = nullptr;
4750b57cec5SDimitry Andric const ARMBaseInstrInfo *TII = nullptr;
4760b57cec5SDimitry Andric MachineRegisterInfo *MRI = nullptr;
477480093f4SDimitry Andric const TargetRegisterInfo *TRI = nullptr;
4780b57cec5SDimitry Andric std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
4790b57cec5SDimitry Andric
4800b57cec5SDimitry Andric public:
4810b57cec5SDimitry Andric static char ID;
4820b57cec5SDimitry Andric
ARMLowOverheadLoops()4830b57cec5SDimitry Andric ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
4840b57cec5SDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const4850b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override {
4860b57cec5SDimitry Andric AU.setPreservesCFG();
487*0fca6ea1SDimitry Andric AU.addRequired<MachineLoopInfoWrapperPass>();
488480093f4SDimitry Andric AU.addRequired<ReachingDefAnalysis>();
4890b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU);
4900b57cec5SDimitry Andric }
4910b57cec5SDimitry Andric
4920b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override;
4930b57cec5SDimitry Andric
getRequiredProperties() const4940b57cec5SDimitry Andric MachineFunctionProperties getRequiredProperties() const override {
4950b57cec5SDimitry Andric return MachineFunctionProperties().set(
496480093f4SDimitry Andric MachineFunctionProperties::Property::NoVRegs).set(
497480093f4SDimitry Andric MachineFunctionProperties::Property::TracksLiveness);
4980b57cec5SDimitry Andric }
4990b57cec5SDimitry Andric
getPassName() const5000b57cec5SDimitry Andric StringRef getPassName() const override {
5010b57cec5SDimitry Andric return ARM_LOW_OVERHEAD_LOOPS_NAME;
5020b57cec5SDimitry Andric }
5038bcb0991SDimitry Andric
5048bcb0991SDimitry Andric private:
5058bcb0991SDimitry Andric bool ProcessLoop(MachineLoop *ML);
5068bcb0991SDimitry Andric
5078bcb0991SDimitry Andric bool RevertNonLoops();
5088bcb0991SDimitry Andric
5098bcb0991SDimitry Andric void RevertWhile(MachineInstr *MI) const;
510e8d8bef9SDimitry Andric void RevertDo(MachineInstr *MI) const;
5118bcb0991SDimitry Andric
5125ffd83dbSDimitry Andric bool RevertLoopDec(MachineInstr *MI) const;
5138bcb0991SDimitry Andric
5148bcb0991SDimitry Andric void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
5158bcb0991SDimitry Andric
516e8d8bef9SDimitry Andric void RevertLoopEndDec(MachineInstr *MI) const;
517480093f4SDimitry Andric
518e8d8bef9SDimitry Andric void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
5195ffd83dbSDimitry Andric
520480093f4SDimitry Andric MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
521480093f4SDimitry Andric
522480093f4SDimitry Andric void Expand(LowOverheadLoop &LoLoop);
5238bcb0991SDimitry Andric
5245ffd83dbSDimitry Andric void IterationCountDCE(LowOverheadLoop &LoLoop);
5250b57cec5SDimitry Andric };
5260b57cec5SDimitry Andric }
5270b57cec5SDimitry Andric
5280b57cec5SDimitry Andric char ARMLowOverheadLoops::ID = 0;
5290b57cec5SDimitry Andric
INITIALIZE_PASS(ARMLowOverheadLoops,DEBUG_TYPE,ARM_LOW_OVERHEAD_LOOPS_NAME,false,false)5300b57cec5SDimitry Andric INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
5310b57cec5SDimitry Andric false, false)
5320b57cec5SDimitry Andric
533e8d8bef9SDimitry Andric static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
534e8d8bef9SDimitry Andric InstSet &ToRemove, InstSet &Ignore) {
535480093f4SDimitry Andric
536e8d8bef9SDimitry Andric // Check that we can remove all of Killed without having to modify any IT
537e8d8bef9SDimitry Andric // blocks.
538e8d8bef9SDimitry Andric auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
539e8d8bef9SDimitry Andric // Collect the dead code and the MBBs in which they reside.
540e8d8bef9SDimitry Andric SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
541e8d8bef9SDimitry Andric for (auto *Dead : Killed)
542e8d8bef9SDimitry Andric BasicBlocks.insert(Dead->getParent());
543e8d8bef9SDimitry Andric
544e8d8bef9SDimitry Andric // Collect IT blocks in all affected basic blocks.
545e8d8bef9SDimitry Andric std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
546e8d8bef9SDimitry Andric for (auto *MBB : BasicBlocks) {
547e8d8bef9SDimitry Andric for (auto &IT : *MBB) {
548e8d8bef9SDimitry Andric if (IT.getOpcode() != ARM::t2IT)
549e8d8bef9SDimitry Andric continue;
550e8d8bef9SDimitry Andric RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
551e8d8bef9SDimitry Andric ITBlocks[&IT]);
552e8d8bef9SDimitry Andric }
553e8d8bef9SDimitry Andric }
554e8d8bef9SDimitry Andric
555e8d8bef9SDimitry Andric // If we're removing all of the instructions within an IT block, then
556e8d8bef9SDimitry Andric // also remove the IT instruction.
557e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr *, 2> ModifiedITs;
558e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr *, 2> RemoveITs;
559e8d8bef9SDimitry Andric for (auto *Dead : Killed) {
560*0fca6ea1SDimitry Andric if (MachineOperand *MO =
561*0fca6ea1SDimitry Andric Dead->findRegisterUseOperand(ARM::ITSTATE, /*TRI=*/nullptr)) {
562e8d8bef9SDimitry Andric MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
563e8d8bef9SDimitry Andric RemoveITs.insert(IT);
564e8d8bef9SDimitry Andric auto &CurrentBlock = ITBlocks[IT];
565e8d8bef9SDimitry Andric CurrentBlock.erase(Dead);
566e8d8bef9SDimitry Andric if (CurrentBlock.empty())
567e8d8bef9SDimitry Andric ModifiedITs.erase(IT);
568e8d8bef9SDimitry Andric else
569e8d8bef9SDimitry Andric ModifiedITs.insert(IT);
570e8d8bef9SDimitry Andric }
571e8d8bef9SDimitry Andric }
572e8d8bef9SDimitry Andric if (!ModifiedITs.empty())
573e8d8bef9SDimitry Andric return false;
574e8d8bef9SDimitry Andric Killed.insert(RemoveITs.begin(), RemoveITs.end());
575e8d8bef9SDimitry Andric return true;
576480093f4SDimitry Andric };
577480093f4SDimitry Andric
578e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr *, 2> Uses;
579e8d8bef9SDimitry Andric if (!RDA.isSafeToRemove(MI, Uses, Ignore))
580e8d8bef9SDimitry Andric return false;
581480093f4SDimitry Andric
582e8d8bef9SDimitry Andric if (WontCorruptITs(Uses, RDA)) {
583e8d8bef9SDimitry Andric ToRemove.insert(Uses.begin(), Uses.end());
584e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
585e8d8bef9SDimitry Andric << " - can also remove:\n";
586e8d8bef9SDimitry Andric for (auto *Use : Uses)
587e8d8bef9SDimitry Andric dbgs() << " - " << *Use);
588480093f4SDimitry Andric
589e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr*, 4> Killed;
590e8d8bef9SDimitry Andric RDA.collectKilledOperands(MI, Killed);
591e8d8bef9SDimitry Andric if (WontCorruptITs(Killed, RDA)) {
592e8d8bef9SDimitry Andric ToRemove.insert(Killed.begin(), Killed.end());
593e8d8bef9SDimitry Andric LLVM_DEBUG(for (auto *Dead : Killed)
594e8d8bef9SDimitry Andric dbgs() << " - " << *Dead);
595480093f4SDimitry Andric }
596e8d8bef9SDimitry Andric return true;
597e8d8bef9SDimitry Andric }
598480093f4SDimitry Andric return false;
599480093f4SDimitry Andric }
600e8d8bef9SDimitry Andric
ValidateTailPredicate()601e8d8bef9SDimitry Andric bool LowOverheadLoop::ValidateTailPredicate() {
602e8d8bef9SDimitry Andric if (!IsTailPredicationLegal()) {
603e8d8bef9SDimitry Andric LLVM_DEBUG(if (VCTPs.empty())
604e8d8bef9SDimitry Andric dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
605e8d8bef9SDimitry Andric dbgs() << "ARM Loops: Tail-predication is not valid.\n");
606e8d8bef9SDimitry Andric return false;
607e8d8bef9SDimitry Andric }
608e8d8bef9SDimitry Andric
609e8d8bef9SDimitry Andric assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
610e8d8bef9SDimitry Andric assert(ML.getBlocks().size() == 1 &&
611e8d8bef9SDimitry Andric "Shouldn't be processing a loop with more than one block");
612e8d8bef9SDimitry Andric
613e8d8bef9SDimitry Andric if (DisableTailPredication) {
614e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
615e8d8bef9SDimitry Andric return false;
616e8d8bef9SDimitry Andric }
617e8d8bef9SDimitry Andric
618*0fca6ea1SDimitry Andric if (!VPTstate.isValid(RDA)) {
619e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
620e8d8bef9SDimitry Andric return false;
621e8d8bef9SDimitry Andric }
622e8d8bef9SDimitry Andric
623e8d8bef9SDimitry Andric if (!ValidateLiveOuts()) {
624e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
625e8d8bef9SDimitry Andric return false;
626e8d8bef9SDimitry Andric }
627e8d8bef9SDimitry Andric
628480093f4SDimitry Andric // For tail predication, we need to provide the number of elements, instead
629480093f4SDimitry Andric // of the iteration count, to the loop start instruction. The number of
630480093f4SDimitry Andric // elements is provided to the vctp instruction, so we need to check that
631480093f4SDimitry Andric // we can use this register at InsertPt.
632e8d8bef9SDimitry Andric MachineInstr *VCTP = VCTPs.back();
633fe6060f1SDimitry Andric if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
634fe6060f1SDimitry Andric Start->getOpcode() == ARM::t2WhileLoopStartTP) {
635e8d8bef9SDimitry Andric TPNumElements = Start->getOperand(2);
636e8d8bef9SDimitry Andric StartInsertPt = Start;
637e8d8bef9SDimitry Andric StartInsertBB = Start->getParent();
638e8d8bef9SDimitry Andric } else {
639e8d8bef9SDimitry Andric TPNumElements = VCTP->getOperand(1);
640e8d8bef9SDimitry Andric MCRegister NumElements = TPNumElements.getReg().asMCReg();
641480093f4SDimitry Andric
642480093f4SDimitry Andric // If the register is defined within loop, then we can't perform TP.
643480093f4SDimitry Andric // TODO: Check whether this is just a mov of a register that would be
644480093f4SDimitry Andric // available.
6455ffd83dbSDimitry Andric if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
646480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
647480093f4SDimitry Andric return false;
648480093f4SDimitry Andric }
649480093f4SDimitry Andric
650480093f4SDimitry Andric // The element count register maybe defined after InsertPt, in which case we
651480093f4SDimitry Andric // need to try to move either InsertPt or the def so that the [w|d]lstp can
652480093f4SDimitry Andric // use the value.
653e8d8bef9SDimitry Andric
654e8d8bef9SDimitry Andric if (StartInsertPt != StartInsertBB->end() &&
655e8d8bef9SDimitry Andric !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
656e8d8bef9SDimitry Andric if (auto *ElemDef =
657e8d8bef9SDimitry Andric RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
658e8d8bef9SDimitry Andric if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
659480093f4SDimitry Andric ElemDef->removeFromParent();
660e8d8bef9SDimitry Andric StartInsertBB->insert(StartInsertPt, ElemDef);
661e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs()
662e8d8bef9SDimitry Andric << "ARM Loops: Moved element count def: " << *ElemDef);
663e8d8bef9SDimitry Andric } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
6645ffd83dbSDimitry Andric StartInsertPt->removeFromParent();
665e8d8bef9SDimitry Andric StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
666e8d8bef9SDimitry Andric &*StartInsertPt);
667480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
668480093f4SDimitry Andric } else {
669e8d8bef9SDimitry Andric // If we fail to move an instruction and the element count is provided
670e8d8bef9SDimitry Andric // by a mov, use the mov operand if it will have the same value at the
671e8d8bef9SDimitry Andric // insertion point
672e8d8bef9SDimitry Andric MachineOperand Operand = ElemDef->getOperand(1);
673e8d8bef9SDimitry Andric if (isMovRegOpcode(ElemDef->getOpcode()) &&
674e8d8bef9SDimitry Andric RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
675e8d8bef9SDimitry Andric RDA.getUniqueReachingMIDef(&*StartInsertPt,
676e8d8bef9SDimitry Andric Operand.getReg().asMCReg())) {
677e8d8bef9SDimitry Andric TPNumElements = Operand;
678e8d8bef9SDimitry Andric NumElements = TPNumElements.getReg();
679e8d8bef9SDimitry Andric } else {
680e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs()
681e8d8bef9SDimitry Andric << "ARM Loops: Unable to move element count to loop "
682480093f4SDimitry Andric << "start instruction.\n");
683480093f4SDimitry Andric return false;
684480093f4SDimitry Andric }
685480093f4SDimitry Andric }
686480093f4SDimitry Andric }
687e8d8bef9SDimitry Andric }
688480093f4SDimitry Andric
689480093f4SDimitry Andric // Especially in the case of while loops, InsertBB may not be the
690480093f4SDimitry Andric // preheader, so we need to check that the register isn't redefined
691480093f4SDimitry Andric // before entering the loop.
6925ffd83dbSDimitry Andric auto CannotProvideElements = [this](MachineBasicBlock *MBB,
693e8d8bef9SDimitry Andric MCRegister NumElements) {
694e8d8bef9SDimitry Andric if (MBB->empty())
695e8d8bef9SDimitry Andric return false;
696480093f4SDimitry Andric // NumElements is redefined in this block.
6975ffd83dbSDimitry Andric if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
698480093f4SDimitry Andric return true;
699480093f4SDimitry Andric
700480093f4SDimitry Andric // Don't continue searching up through multiple predecessors.
701480093f4SDimitry Andric if (MBB->pred_size() > 1)
702480093f4SDimitry Andric return true;
703480093f4SDimitry Andric
704480093f4SDimitry Andric return false;
705480093f4SDimitry Andric };
706480093f4SDimitry Andric
707e8d8bef9SDimitry Andric // Search backwards for a def, until we get to InsertBB.
7085ffd83dbSDimitry Andric MachineBasicBlock *MBB = Preheader;
709e8d8bef9SDimitry Andric while (MBB && MBB != StartInsertBB) {
710480093f4SDimitry Andric if (CannotProvideElements(MBB, NumElements)) {
711480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
712480093f4SDimitry Andric return false;
713480093f4SDimitry Andric }
714480093f4SDimitry Andric MBB = *MBB->pred_begin();
715480093f4SDimitry Andric }
716e8d8bef9SDimitry Andric }
717e8d8bef9SDimitry Andric
718e8d8bef9SDimitry Andric // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
719e8d8bef9SDimitry Andric // world the [w|d]lstp instruction would be last instruction in the preheader
720e8d8bef9SDimitry Andric // and so it would only affect instructions within the loop body. But due to
721e8d8bef9SDimitry Andric // scheduling, and/or the logic in this pass (above), the insertion point can
722e8d8bef9SDimitry Andric // be moved earlier. So if the Loop Start isn't the last instruction in the
723e8d8bef9SDimitry Andric // preheader, and if the initial element count is smaller than the vector
724e8d8bef9SDimitry Andric // width, the Loop Start instruction will immediately generate one or more
725e8d8bef9SDimitry Andric // false lane mask which can, incorrectly, affect the proceeding MVE
726e8d8bef9SDimitry Andric // instructions in the preheader.
727e8d8bef9SDimitry Andric if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
728e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
729e8d8bef9SDimitry Andric return false;
730e8d8bef9SDimitry Andric }
731480093f4SDimitry Andric
732349cc55cSDimitry Andric // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
733349cc55cSDimitry Andric // need to compute an output size that is smaller than the VCTP mask operates
734349cc55cSDimitry Andric // on. The VecSize of the DoubleWidthResult is the larger vector size - the
735349cc55cSDimitry Andric // size it extends into, so any VCTP VecSize <= is valid.
736349cc55cSDimitry Andric unsigned VCTPVecSize = getVecSize(*VCTP);
737349cc55cSDimitry Andric for (MachineInstr *MI : DoubleWidthResultInstrs) {
738349cc55cSDimitry Andric unsigned InstrVecSize = getVecSize(*MI);
739349cc55cSDimitry Andric if (InstrVecSize > VCTPVecSize) {
740349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
741349cc55cSDimitry Andric << "VecSize:\n" << *MI);
742349cc55cSDimitry Andric return false;
743349cc55cSDimitry Andric }
744349cc55cSDimitry Andric }
745349cc55cSDimitry Andric
7465ffd83dbSDimitry Andric // Check that the value change of the element count is what we expect and
7475ffd83dbSDimitry Andric // that the predication will be equivalent. For this we need:
7485ffd83dbSDimitry Andric // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
7495ffd83dbSDimitry Andric // and we can also allow register copies within the chain too.
7505ffd83dbSDimitry Andric auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
7515ffd83dbSDimitry Andric return -getAddSubImmediate(*MI) == ExpectedVecWidth;
7525ffd83dbSDimitry Andric };
7535ffd83dbSDimitry Andric
754e8d8bef9SDimitry Andric MachineBasicBlock *MBB = VCTP->getParent();
755e8d8bef9SDimitry Andric // Remove modifications to the element count since they have no purpose in a
756e8d8bef9SDimitry Andric // tail predicated loop. Explicitly refer to the vctp operand no matter which
757e8d8bef9SDimitry Andric // register NumElements has been assigned to, since that is what the
758e8d8bef9SDimitry Andric // modifications will be using
759e8d8bef9SDimitry Andric if (auto *Def = RDA.getUniqueReachingMIDef(
760e8d8bef9SDimitry Andric &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
7615ffd83dbSDimitry Andric SmallPtrSet<MachineInstr*, 2> ElementChain;
762e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr*, 2> Ignore;
7635ffd83dbSDimitry Andric unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
7645ffd83dbSDimitry Andric
765e8d8bef9SDimitry Andric Ignore.insert(VCTPs.begin(), VCTPs.end());
7665ffd83dbSDimitry Andric
767e8d8bef9SDimitry Andric if (TryRemove(Def, RDA, ElementChain, Ignore)) {
7685ffd83dbSDimitry Andric bool FoundSub = false;
7695ffd83dbSDimitry Andric
7705ffd83dbSDimitry Andric for (auto *MI : ElementChain) {
7715ffd83dbSDimitry Andric if (isMovRegOpcode(MI->getOpcode()))
7725ffd83dbSDimitry Andric continue;
7735ffd83dbSDimitry Andric
7745ffd83dbSDimitry Andric if (isSubImmOpcode(MI->getOpcode())) {
775e8d8bef9SDimitry Andric if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
776e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
777e8d8bef9SDimitry Andric " count: " << *MI);
7785ffd83dbSDimitry Andric return false;
7795ffd83dbSDimitry Andric }
780e8d8bef9SDimitry Andric FoundSub = true;
781e8d8bef9SDimitry Andric } else {
782e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
783e8d8bef9SDimitry Andric " count: " << *MI);
784e8d8bef9SDimitry Andric return false;
785e8d8bef9SDimitry Andric }
786e8d8bef9SDimitry Andric }
7875ffd83dbSDimitry Andric ToRemove.insert(ElementChain.begin(), ElementChain.end());
7885ffd83dbSDimitry Andric }
7895ffd83dbSDimitry Andric }
790fe6060f1SDimitry Andric
791fe6060f1SDimitry Andric // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
792fe6060f1SDimitry Andric // can also remove any extra instructions in the preheader, which often
793fe6060f1SDimitry Andric // includes a now unused MOV.
794fe6060f1SDimitry Andric if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
795fe6060f1SDimitry Andric Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
796fe6060f1SDimitry Andric Preheader && !Preheader->empty() &&
797fe6060f1SDimitry Andric !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
798fe6060f1SDimitry Andric if (auto *Def = RDA.getUniqueReachingMIDef(
799fe6060f1SDimitry Andric &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
800fe6060f1SDimitry Andric SmallPtrSet<MachineInstr*, 2> Ignore;
801fe6060f1SDimitry Andric Ignore.insert(VCTPs.begin(), VCTPs.end());
802fe6060f1SDimitry Andric TryRemove(Def, RDA, ToRemove, Ignore);
803fe6060f1SDimitry Andric }
804fe6060f1SDimitry Andric }
805fe6060f1SDimitry Andric
806480093f4SDimitry Andric return true;
807480093f4SDimitry Andric }
808480093f4SDimitry Andric
isRegInClass(const MachineOperand & MO,const TargetRegisterClass * Class)8095ffd83dbSDimitry Andric static bool isRegInClass(const MachineOperand &MO,
8105ffd83dbSDimitry Andric const TargetRegisterClass *Class) {
8115ffd83dbSDimitry Andric return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
8125ffd83dbSDimitry Andric }
8135ffd83dbSDimitry Andric
8145ffd83dbSDimitry Andric // MVE 'narrowing' operate on half a lane, reading from half and writing
8155ffd83dbSDimitry Andric // to half, which are referred to has the top and bottom half. The other
8165ffd83dbSDimitry Andric // half retains its previous value.
retainsPreviousHalfElement(const MachineInstr & MI)8175ffd83dbSDimitry Andric static bool retainsPreviousHalfElement(const MachineInstr &MI) {
8185ffd83dbSDimitry Andric const MCInstrDesc &MCID = MI.getDesc();
8195ffd83dbSDimitry Andric uint64_t Flags = MCID.TSFlags;
8205ffd83dbSDimitry Andric return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
8215ffd83dbSDimitry Andric }
8225ffd83dbSDimitry Andric
8235ffd83dbSDimitry Andric // Some MVE instructions read from the top/bottom halves of their operand(s)
8245ffd83dbSDimitry Andric // and generate a vector result with result elements that are double the
8255ffd83dbSDimitry Andric // width of the input.
producesDoubleWidthResult(const MachineInstr & MI)8265ffd83dbSDimitry Andric static bool producesDoubleWidthResult(const MachineInstr &MI) {
8275ffd83dbSDimitry Andric const MCInstrDesc &MCID = MI.getDesc();
8285ffd83dbSDimitry Andric uint64_t Flags = MCID.TSFlags;
8295ffd83dbSDimitry Andric return (Flags & ARMII::DoubleWidthResult) != 0;
8305ffd83dbSDimitry Andric }
8315ffd83dbSDimitry Andric
8325ffd83dbSDimitry Andric // Can this instruction generate a non-zero result when given only zeroed
8335ffd83dbSDimitry Andric // operands? This allows us to know that, given operands with false bytes
8345ffd83dbSDimitry Andric // zeroed by masked loads, that the result will also contain zeros in those
8355ffd83dbSDimitry Andric // bytes.
canGenerateNonZeros(const MachineInstr & MI)8365ffd83dbSDimitry Andric static bool canGenerateNonZeros(const MachineInstr &MI) {
8375ffd83dbSDimitry Andric
8385ffd83dbSDimitry Andric // Check for instructions which can write into a larger element size,
8395ffd83dbSDimitry Andric // possibly writing into a previous zero'd lane.
8405ffd83dbSDimitry Andric if (producesDoubleWidthResult(MI))
8415ffd83dbSDimitry Andric return true;
8425ffd83dbSDimitry Andric
8435ffd83dbSDimitry Andric switch (MI.getOpcode()) {
8445ffd83dbSDimitry Andric default:
8455ffd83dbSDimitry Andric break;
8465ffd83dbSDimitry Andric // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
8475ffd83dbSDimitry Andric // fp16 -> fp32 vector conversions.
8485ffd83dbSDimitry Andric // Instructions that perform a NOT will generate 1s from 0s.
8495ffd83dbSDimitry Andric case ARM::MVE_VMVN:
8505ffd83dbSDimitry Andric case ARM::MVE_VORN:
8515ffd83dbSDimitry Andric // Count leading zeros will do just that!
8525ffd83dbSDimitry Andric case ARM::MVE_VCLZs8:
8535ffd83dbSDimitry Andric case ARM::MVE_VCLZs16:
8545ffd83dbSDimitry Andric case ARM::MVE_VCLZs32:
8555ffd83dbSDimitry Andric return true;
8565ffd83dbSDimitry Andric }
8575ffd83dbSDimitry Andric return false;
8585ffd83dbSDimitry Andric }
8595ffd83dbSDimitry Andric
8605ffd83dbSDimitry Andric // Look at its register uses to see if it only can only receive zeros
8615ffd83dbSDimitry Andric // into its false lanes which would then produce zeros. Also check that
8625ffd83dbSDimitry Andric // the output register is also defined by an FalseLanesZero instruction
8635ffd83dbSDimitry Andric // so that if tail-predication happens, the lanes that aren't updated will
8645ffd83dbSDimitry Andric // still be zeros.
producesFalseLanesZero(MachineInstr & MI,const TargetRegisterClass * QPRs,const ReachingDefAnalysis & RDA,InstSet & FalseLanesZero)8655ffd83dbSDimitry Andric static bool producesFalseLanesZero(MachineInstr &MI,
8665ffd83dbSDimitry Andric const TargetRegisterClass *QPRs,
8675ffd83dbSDimitry Andric const ReachingDefAnalysis &RDA,
8685ffd83dbSDimitry Andric InstSet &FalseLanesZero) {
8695ffd83dbSDimitry Andric if (canGenerateNonZeros(MI))
8705ffd83dbSDimitry Andric return false;
8715ffd83dbSDimitry Andric
872e8d8bef9SDimitry Andric bool isPredicated = isVectorPredicated(&MI);
873e8d8bef9SDimitry Andric // Predicated loads will write zeros to the falsely predicated bytes of the
874e8d8bef9SDimitry Andric // destination register.
875e8d8bef9SDimitry Andric if (MI.mayLoad())
876e8d8bef9SDimitry Andric return isPredicated;
877e8d8bef9SDimitry Andric
878e8d8bef9SDimitry Andric auto IsZeroInit = [](MachineInstr *Def) {
879e8d8bef9SDimitry Andric return !isVectorPredicated(Def) &&
880e8d8bef9SDimitry Andric Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
881e8d8bef9SDimitry Andric Def->getOperand(1).getImm() == 0;
882e8d8bef9SDimitry Andric };
883e8d8bef9SDimitry Andric
8845ffd83dbSDimitry Andric bool AllowScalars = isHorizontalReduction(MI);
8855ffd83dbSDimitry Andric for (auto &MO : MI.operands()) {
8865ffd83dbSDimitry Andric if (!MO.isReg() || !MO.getReg())
8875ffd83dbSDimitry Andric continue;
8885ffd83dbSDimitry Andric if (!isRegInClass(MO, QPRs) && AllowScalars)
8895ffd83dbSDimitry Andric continue;
890349cc55cSDimitry Andric // Skip the lr predicate reg
891349cc55cSDimitry Andric int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
89206c3fb27SDimitry Andric if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
893349cc55cSDimitry Andric continue;
894e8d8bef9SDimitry Andric
895e8d8bef9SDimitry Andric // Check that this instruction will produce zeros in its false lanes:
896e8d8bef9SDimitry Andric // - If it only consumes false lanes zero or constant 0 (vmov #0)
897e8d8bef9SDimitry Andric // - If it's predicated, it only matters that it's def register already has
898e8d8bef9SDimitry Andric // false lane zeros, so we can ignore the uses.
899e8d8bef9SDimitry Andric SmallPtrSet<MachineInstr *, 2> Defs;
900e8d8bef9SDimitry Andric RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
901bdd1243dSDimitry Andric if (Defs.empty())
902bdd1243dSDimitry Andric return false;
903e8d8bef9SDimitry Andric for (auto *Def : Defs) {
904e8d8bef9SDimitry Andric if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
905e8d8bef9SDimitry Andric continue;
906e8d8bef9SDimitry Andric if (MO.isUse() && isPredicated)
9075ffd83dbSDimitry Andric continue;
9085ffd83dbSDimitry Andric return false;
9095ffd83dbSDimitry Andric }
910e8d8bef9SDimitry Andric }
9115ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
9125ffd83dbSDimitry Andric return true;
9135ffd83dbSDimitry Andric }
9145ffd83dbSDimitry Andric
ValidateLiveOuts()9155ffd83dbSDimitry Andric bool LowOverheadLoop::ValidateLiveOuts() {
9165ffd83dbSDimitry Andric // We want to find out if the tail-predicated version of this loop will
9175ffd83dbSDimitry Andric // produce the same values as the loop in its original form. For this to
9185ffd83dbSDimitry Andric // be true, the newly inserted implicit predication must not change the
9195ffd83dbSDimitry Andric // the (observable) results.
9205ffd83dbSDimitry Andric // We're doing this because many instructions in the loop will not be
9215ffd83dbSDimitry Andric // predicated and so the conversion from VPT predication to tail-predication
9225ffd83dbSDimitry Andric // can result in different values being produced; due to the tail-predication
9235ffd83dbSDimitry Andric // preventing many instructions from updating their falsely predicated
9245ffd83dbSDimitry Andric // lanes. This analysis assumes that all the instructions perform lane-wise
9255ffd83dbSDimitry Andric // operations and don't perform any exchanges.
9265ffd83dbSDimitry Andric // A masked load, whether through VPT or tail predication, will write zeros
9275ffd83dbSDimitry Andric // to any of the falsely predicated bytes. So, from the loads, we know that
9285ffd83dbSDimitry Andric // the false lanes are zeroed and here we're trying to track that those false
9295ffd83dbSDimitry Andric // lanes remain zero, or where they change, the differences are masked away
9305ffd83dbSDimitry Andric // by their user(s).
931e8d8bef9SDimitry Andric // All MVE stores have to be predicated, so we know that any predicate load
9325ffd83dbSDimitry Andric // operands, or stored results are equivalent already. Other explicitly
9335ffd83dbSDimitry Andric // predicated instructions will perform the same operation in the original
9345ffd83dbSDimitry Andric // loop and the tail-predicated form too. Because of this, we can insert
9355ffd83dbSDimitry Andric // loads, stores and other predicated instructions into our Predicated
9365ffd83dbSDimitry Andric // set and build from there.
9375ffd83dbSDimitry Andric const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
9385ffd83dbSDimitry Andric SetVector<MachineInstr *> FalseLanesUnknown;
9395ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
9405ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 4> Predicated;
9415ffd83dbSDimitry Andric MachineBasicBlock *Header = ML.getHeader();
9425ffd83dbSDimitry Andric
943349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
944349cc55cSDimitry Andric
9455ffd83dbSDimitry Andric for (auto &MI : *Header) {
946e8d8bef9SDimitry Andric if (!shouldInspect(MI))
9475ffd83dbSDimitry Andric continue;
9485ffd83dbSDimitry Andric
9495ffd83dbSDimitry Andric if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
9505ffd83dbSDimitry Andric continue;
9515ffd83dbSDimitry Andric
952e8d8bef9SDimitry Andric bool isPredicated = isVectorPredicated(&MI);
953e8d8bef9SDimitry Andric bool retainsOrReduces =
954e8d8bef9SDimitry Andric retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
955e8d8bef9SDimitry Andric
956e8d8bef9SDimitry Andric if (isPredicated)
9575ffd83dbSDimitry Andric Predicated.insert(&MI);
958e8d8bef9SDimitry Andric if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
9595ffd83dbSDimitry Andric FalseLanesZero.insert(&MI);
960e8d8bef9SDimitry Andric else if (MI.getNumDefs() == 0)
961e8d8bef9SDimitry Andric continue;
962349cc55cSDimitry Andric else if (!isPredicated && retainsOrReduces) {
963349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
964e8d8bef9SDimitry Andric return false;
965349cc55cSDimitry Andric } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
966e8d8bef9SDimitry Andric FalseLanesUnknown.insert(&MI);
9675ffd83dbSDimitry Andric }
9685ffd83dbSDimitry Andric
969349cc55cSDimitry Andric LLVM_DEBUG({
970349cc55cSDimitry Andric dbgs() << " Predicated:\n";
971349cc55cSDimitry Andric for (auto *I : Predicated)
972349cc55cSDimitry Andric dbgs() << " " << *I;
973349cc55cSDimitry Andric dbgs() << " FalseLanesZero:\n";
974349cc55cSDimitry Andric for (auto *I : FalseLanesZero)
975349cc55cSDimitry Andric dbgs() << " " << *I;
976349cc55cSDimitry Andric dbgs() << " FalseLanesUnknown:\n";
977349cc55cSDimitry Andric for (auto *I : FalseLanesUnknown)
978349cc55cSDimitry Andric dbgs() << " " << *I;
979349cc55cSDimitry Andric });
980349cc55cSDimitry Andric
9815ffd83dbSDimitry Andric auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
9825ffd83dbSDimitry Andric SmallPtrSetImpl<MachineInstr *> &Predicated) {
9835ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 2> Uses;
984e8d8bef9SDimitry Andric RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
9855ffd83dbSDimitry Andric for (auto *Use : Uses) {
9865ffd83dbSDimitry Andric if (Use != MI && !Predicated.count(Use))
9875ffd83dbSDimitry Andric return false;
9885ffd83dbSDimitry Andric }
9895ffd83dbSDimitry Andric return true;
9905ffd83dbSDimitry Andric };
9915ffd83dbSDimitry Andric
9925ffd83dbSDimitry Andric // Visit the unknowns in reverse so that we can start at the values being
9935ffd83dbSDimitry Andric // stored and then we can work towards the leaves, hopefully adding more
9945ffd83dbSDimitry Andric // instructions to Predicated. Successfully terminating the loop means that
9955ffd83dbSDimitry Andric // all the unknown values have to found to be masked by predicated user(s).
9965ffd83dbSDimitry Andric // For any unpredicated values, we store them in NonPredicated so that we
9975ffd83dbSDimitry Andric // can later check whether these form a reduction.
9985ffd83dbSDimitry Andric SmallPtrSet<MachineInstr*, 2> NonPredicated;
9995ffd83dbSDimitry Andric for (auto *MI : reverse(FalseLanesUnknown)) {
10005ffd83dbSDimitry Andric for (auto &MO : MI->operands()) {
10015ffd83dbSDimitry Andric if (!isRegInClass(MO, QPRs) || !MO.isDef())
10025ffd83dbSDimitry Andric continue;
10035ffd83dbSDimitry Andric if (!HasPredicatedUsers(MI, MO, Predicated)) {
1004349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " Found an unknown def of : "
10055ffd83dbSDimitry Andric << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
10065ffd83dbSDimitry Andric NonPredicated.insert(MI);
1007e8d8bef9SDimitry Andric break;
10085ffd83dbSDimitry Andric }
10095ffd83dbSDimitry Andric }
10105ffd83dbSDimitry Andric // Any unknown false lanes have been masked away by the user(s).
1011e8d8bef9SDimitry Andric if (!NonPredicated.contains(MI))
10125ffd83dbSDimitry Andric Predicated.insert(MI);
10135ffd83dbSDimitry Andric }
10145ffd83dbSDimitry Andric
10155ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
10165ffd83dbSDimitry Andric SmallVector<MachineBasicBlock *, 2> ExitBlocks;
10175ffd83dbSDimitry Andric ML.getExitBlocks(ExitBlocks);
10185ffd83dbSDimitry Andric assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
10195ffd83dbSDimitry Andric assert(ExitBlocks.size() == 1 && "Expected a single exit block");
10205ffd83dbSDimitry Andric MachineBasicBlock *ExitBB = ExitBlocks.front();
10215ffd83dbSDimitry Andric for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1022e8d8bef9SDimitry Andric // TODO: Instead of blocking predication, we could move the vctp to the exit
1023e8d8bef9SDimitry Andric // block and calculate it's operand there in or the preheader.
1024349cc55cSDimitry Andric if (RegMask.PhysReg == ARM::VPR) {
1025349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1026e8d8bef9SDimitry Andric return false;
1027349cc55cSDimitry Andric }
10285ffd83dbSDimitry Andric // Check Q-regs that are live in the exit blocks. We don't collect scalars
10295ffd83dbSDimitry Andric // because they won't be affected by lane predication.
1030e8d8bef9SDimitry Andric if (QPRs->contains(RegMask.PhysReg))
10315ffd83dbSDimitry Andric if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
10325ffd83dbSDimitry Andric LiveOutMIs.insert(MI);
10335ffd83dbSDimitry Andric }
10345ffd83dbSDimitry Andric
10355ffd83dbSDimitry Andric // We've already validated that any VPT predication within the loop will be
10365ffd83dbSDimitry Andric // equivalent when we perform the predication transformation; so we know that
10375ffd83dbSDimitry Andric // any VPT predicated instruction is predicated upon VCTP. Any live-out
10385ffd83dbSDimitry Andric // instruction needs to be predicated, so check this here. The instructions
10395ffd83dbSDimitry Andric // in NonPredicated have been found to be a reduction that we can ensure its
1040349cc55cSDimitry Andric // legality. Any MQPRCopy found will need to validate its input as if it was
1041349cc55cSDimitry Andric // live out.
1042349cc55cSDimitry Andric SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1043349cc55cSDimitry Andric while (!Worklist.empty()) {
1044349cc55cSDimitry Andric MachineInstr *MI = Worklist.pop_back_val();
1045349cc55cSDimitry Andric if (MI->getOpcode() == ARM::MQPRCopy) {
1046349cc55cSDimitry Andric VMOVCopies.insert(MI);
1047349cc55cSDimitry Andric MachineInstr *CopySrc =
1048349cc55cSDimitry Andric RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
1049349cc55cSDimitry Andric if (CopySrc)
1050349cc55cSDimitry Andric Worklist.push_back(CopySrc);
1051349cc55cSDimitry Andric } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
1052349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1053349cc55cSDimitry Andric VMOVCopies.clear();
10545ffd83dbSDimitry Andric return false;
1055e8d8bef9SDimitry Andric }
1056e8d8bef9SDimitry Andric }
10575ffd83dbSDimitry Andric
10585ffd83dbSDimitry Andric return true;
10595ffd83dbSDimitry Andric }
10605ffd83dbSDimitry Andric
Validate(ARMBasicBlockUtils * BBUtils)1061e8d8bef9SDimitry Andric void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1062480093f4SDimitry Andric if (Revert)
1063480093f4SDimitry Andric return;
1064480093f4SDimitry Andric
1065e8d8bef9SDimitry Andric // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1066e8d8bef9SDimitry Andric // can only jump back.
1067e8d8bef9SDimitry Andric auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
1068e8d8bef9SDimitry Andric ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1069e8d8bef9SDimitry Andric MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1070e8d8bef9SDimitry Andric ? End->getOperand(1).getMBB()
1071e8d8bef9SDimitry Andric : End->getOperand(2).getMBB();
1072480093f4SDimitry Andric // TODO Maybe there's cases where the target doesn't have to be the header,
1073480093f4SDimitry Andric // but for now be safe and revert.
1074e8d8bef9SDimitry Andric if (TgtBB != ML.getHeader()) {
1075e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1076e8d8bef9SDimitry Andric return false;
1077480093f4SDimitry Andric }
1078480093f4SDimitry Andric
1079480093f4SDimitry Andric // The WLS and LE instructions have 12-bits for the label offset. WLS
1080480093f4SDimitry Andric // requires a positive offset, while LE uses negative.
10815ffd83dbSDimitry Andric if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
10825ffd83dbSDimitry Andric !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
1083480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1084e8d8bef9SDimitry Andric return false;
1085480093f4SDimitry Andric }
1086480093f4SDimitry Andric
1087fe6060f1SDimitry Andric if (isWhileLoopStart(*Start)) {
1088fe6060f1SDimitry Andric MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1089fe6060f1SDimitry Andric if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1090fe6060f1SDimitry Andric !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
1091480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1092e8d8bef9SDimitry Andric return false;
1093e8d8bef9SDimitry Andric }
1094fe6060f1SDimitry Andric }
1095e8d8bef9SDimitry Andric return true;
1096e8d8bef9SDimitry Andric };
1097e8d8bef9SDimitry Andric
1098fe6060f1SDimitry Andric StartInsertPt = MachineBasicBlock::iterator(Start);
1099fe6060f1SDimitry Andric StartInsertBB = Start->getParent();
1100fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1101fe6060f1SDimitry Andric << *StartInsertPt);
1102480093f4SDimitry Andric
1103e8d8bef9SDimitry Andric Revert = !ValidateRanges(Start, End, BBUtils, ML);
1104e8d8bef9SDimitry Andric CannotTailPredicate = !ValidateTailPredicate();
1105480093f4SDimitry Andric }
1106480093f4SDimitry Andric
AddVCTP(MachineInstr * MI)1107e8d8bef9SDimitry Andric bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1108e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1109e8d8bef9SDimitry Andric if (VCTPs.empty()) {
1110e8d8bef9SDimitry Andric VCTPs.push_back(MI);
1111e8d8bef9SDimitry Andric return true;
1112e8d8bef9SDimitry Andric }
1113e8d8bef9SDimitry Andric
1114e8d8bef9SDimitry Andric // If we find another VCTP, check whether it uses the same value as the main VCTP.
1115e8d8bef9SDimitry Andric // If it does, store it in the VCTPs set, else refuse it.
1116e8d8bef9SDimitry Andric MachineInstr *Prev = VCTPs.back();
1117e8d8bef9SDimitry Andric if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1118e8d8bef9SDimitry Andric !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1119e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1120e8d8bef9SDimitry Andric "definition from the main VCTP");
1121e8d8bef9SDimitry Andric return false;
1122e8d8bef9SDimitry Andric }
1123e8d8bef9SDimitry Andric VCTPs.push_back(MI);
1124e8d8bef9SDimitry Andric return true;
1125480093f4SDimitry Andric }
1126480093f4SDimitry Andric
ValidateMVEStore(MachineInstr * MI,MachineLoop * ML)1127fe6060f1SDimitry Andric static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
1128fe6060f1SDimitry Andric
1129fe6060f1SDimitry Andric auto GetFrameIndex = [](MachineMemOperand *Operand) {
1130fe6060f1SDimitry Andric const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1131fe6060f1SDimitry Andric if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1132fe6060f1SDimitry Andric if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1133fe6060f1SDimitry Andric return FS->getFrameIndex();
1134fe6060f1SDimitry Andric }
1135fe6060f1SDimitry Andric }
1136fe6060f1SDimitry Andric return -1;
1137fe6060f1SDimitry Andric };
1138fe6060f1SDimitry Andric
1139fe6060f1SDimitry Andric auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1140fe6060f1SDimitry Andric switch (I->getOpcode()) {
1141fe6060f1SDimitry Andric case ARM::MVE_VSTRWU32:
1142fe6060f1SDimitry Andric case ARM::MVE_VLDRWU32: {
1143fe6060f1SDimitry Andric return I->getOperand(1).getReg() == ARM::SP &&
1144fe6060f1SDimitry Andric I->memoperands().size() == 1 &&
1145fe6060f1SDimitry Andric GetFrameIndex(I->memoperands().front()) >= 0;
1146fe6060f1SDimitry Andric }
1147fe6060f1SDimitry Andric default:
1148fe6060f1SDimitry Andric return false;
1149fe6060f1SDimitry Andric }
1150fe6060f1SDimitry Andric };
1151fe6060f1SDimitry Andric
1152fe6060f1SDimitry Andric // An unpredicated vector register spill is allowed if all of the uses of the
1153fe6060f1SDimitry Andric // stack slot are within the loop
1154fe6060f1SDimitry Andric if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1155fe6060f1SDimitry Andric return false;
1156fe6060f1SDimitry Andric
1157fe6060f1SDimitry Andric // Search all blocks after the loop for accesses to the same stack slot.
1158fe6060f1SDimitry Andric // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1159fe6060f1SDimitry Andric // live-out (which sp never is) to know what blocks to look in
1160fe6060f1SDimitry Andric if (MI->memoperands().size() == 0)
1161fe6060f1SDimitry Andric return false;
1162fe6060f1SDimitry Andric int FI = GetFrameIndex(MI->memoperands().front());
1163fe6060f1SDimitry Andric
1164349cc55cSDimitry Andric auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1165fe6060f1SDimitry Andric if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1166fe6060f1SDimitry Andric return false;
1167fe6060f1SDimitry Andric
1168fe6060f1SDimitry Andric SmallVector<MachineBasicBlock *> Frontier;
1169fe6060f1SDimitry Andric ML->getExitBlocks(Frontier);
1170fe6060f1SDimitry Andric SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1171fe6060f1SDimitry Andric unsigned Idx = 0;
1172fe6060f1SDimitry Andric while (Idx < Frontier.size()) {
1173fe6060f1SDimitry Andric MachineBasicBlock *BB = Frontier[Idx];
1174fe6060f1SDimitry Andric bool LookAtSuccessors = true;
1175fe6060f1SDimitry Andric for (auto &I : *BB) {
1176fe6060f1SDimitry Andric if (!IsStackOp(&I) || I.memoperands().size() == 0)
1177fe6060f1SDimitry Andric continue;
1178fe6060f1SDimitry Andric if (GetFrameIndex(I.memoperands().front()) != FI)
1179fe6060f1SDimitry Andric continue;
1180fe6060f1SDimitry Andric // If this block has a store to the stack slot before any loads then we
1181fe6060f1SDimitry Andric // can ignore the block
1182fe6060f1SDimitry Andric if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1183fe6060f1SDimitry Andric LookAtSuccessors = false;
1184fe6060f1SDimitry Andric break;
1185fe6060f1SDimitry Andric }
1186fe6060f1SDimitry Andric // If the store and the load are using the same stack slot then the
1187fe6060f1SDimitry Andric // store isn't valid for tail predication
1188fe6060f1SDimitry Andric if (I.getOpcode() == ARM::MVE_VLDRWU32)
1189fe6060f1SDimitry Andric return false;
1190fe6060f1SDimitry Andric }
1191fe6060f1SDimitry Andric
1192fe6060f1SDimitry Andric if (LookAtSuccessors) {
1193bdd1243dSDimitry Andric for (auto *Succ : BB->successors()) {
1194fe6060f1SDimitry Andric if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1195fe6060f1SDimitry Andric Frontier.push_back(Succ);
1196fe6060f1SDimitry Andric }
1197fe6060f1SDimitry Andric }
1198fe6060f1SDimitry Andric Visited.insert(BB);
1199fe6060f1SDimitry Andric Idx++;
1200fe6060f1SDimitry Andric }
1201fe6060f1SDimitry Andric
1202fe6060f1SDimitry Andric return true;
1203fe6060f1SDimitry Andric }
1204fe6060f1SDimitry Andric
ValidateMVEInst(MachineInstr * MI)1205480093f4SDimitry Andric bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1206480093f4SDimitry Andric if (CannotTailPredicate)
1207480093f4SDimitry Andric return false;
1208480093f4SDimitry Andric
1209e8d8bef9SDimitry Andric if (!shouldInspect(*MI))
1210480093f4SDimitry Andric return true;
1211e8d8bef9SDimitry Andric
1212e8d8bef9SDimitry Andric if (MI->getOpcode() == ARM::MVE_VPSEL ||
12135ffd83dbSDimitry Andric MI->getOpcode() == ARM::MVE_VPNOT) {
1214480093f4SDimitry Andric // TODO: Allow VPSEL and VPNOT, we currently cannot because:
1215480093f4SDimitry Andric // 1) It will use the VPR as a predicate operand, but doesn't have to be
1216480093f4SDimitry Andric // instead a VPT block, which means we can assert while building up
12175ffd83dbSDimitry Andric // the VPT block because we don't find another VPT or VPST to being a new
1218480093f4SDimitry Andric // one.
1219480093f4SDimitry Andric // 2) VPSEL still requires a VPR operand even after tail predicating,
1220480093f4SDimitry Andric // which means we can't remove it unless there is another
1221480093f4SDimitry Andric // instruction, such as vcmp, that can provide the VPR def.
12225ffd83dbSDimitry Andric return false;
12235ffd83dbSDimitry Andric }
1224480093f4SDimitry Andric
1225e8d8bef9SDimitry Andric // Record all VCTPs and check that they're equivalent to one another.
1226e8d8bef9SDimitry Andric if (isVCTP(MI) && !AddVCTP(MI))
1227e8d8bef9SDimitry Andric return false;
1228e8d8bef9SDimitry Andric
1229e8d8bef9SDimitry Andric // Inspect uses first so that any instructions that alter the VPR don't
1230e8d8bef9SDimitry Andric // alter the predicate upon themselves.
1231480093f4SDimitry Andric const MCInstrDesc &MCID = MI->getDesc();
1232e8d8bef9SDimitry Andric bool IsUse = false;
1233e8d8bef9SDimitry Andric unsigned LastOpIdx = MI->getNumOperands() - 1;
123406c3fb27SDimitry Andric for (const auto &Op : enumerate(reverse(MCID.operands()))) {
1235e8d8bef9SDimitry Andric const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1236e8d8bef9SDimitry Andric if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
1237480093f4SDimitry Andric continue;
1238480093f4SDimitry Andric
1239e8d8bef9SDimitry Andric if (ARM::isVpred(Op.value().OperandType)) {
1240*0fca6ea1SDimitry Andric VPTstate.addInst(MI);
1241480093f4SDimitry Andric IsUse = true;
1242e8d8bef9SDimitry Andric } else if (MI->getOpcode() != ARM::MVE_VPST) {
1243480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1244480093f4SDimitry Andric return false;
1245480093f4SDimitry Andric }
1246480093f4SDimitry Andric }
1247480093f4SDimitry Andric
1248480093f4SDimitry Andric // If we find an instruction that has been marked as not valid for tail
1249480093f4SDimitry Andric // predication, only allow the instruction if it's contained within a valid
1250480093f4SDimitry Andric // VPT block.
1251e8d8bef9SDimitry Andric bool RequiresExplicitPredication =
1252e8d8bef9SDimitry Andric (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
1253e8d8bef9SDimitry Andric if (isDomainMVE(MI) && RequiresExplicitPredication) {
1254349cc55cSDimitry Andric if (MI->getOpcode() == ARM::MQPRCopy)
1255349cc55cSDimitry Andric return true;
1256349cc55cSDimitry Andric if (!IsUse && producesDoubleWidthResult(*MI)) {
1257349cc55cSDimitry Andric DoubleWidthResultInstrs.insert(MI);
1258349cc55cSDimitry Andric return true;
1259349cc55cSDimitry Andric }
1260349cc55cSDimitry Andric
1261349cc55cSDimitry Andric LLVM_DEBUG(if (!IsUse) dbgs()
1262349cc55cSDimitry Andric << "ARM Loops: Can't tail predicate: " << *MI);
1263e8d8bef9SDimitry Andric return IsUse;
1264480093f4SDimitry Andric }
1265480093f4SDimitry Andric
12665ffd83dbSDimitry Andric // If the instruction is already explicitly predicated, then the conversion
1267e8d8bef9SDimitry Andric // will be fine, but ensure that all store operations are predicated.
1268fe6060f1SDimitry Andric if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1269e8d8bef9SDimitry Andric return IsUse;
1270e8d8bef9SDimitry Andric
1271e8d8bef9SDimitry Andric // If this instruction defines the VPR, update the predicate for the
1272e8d8bef9SDimitry Andric // proceeding instructions.
1273e8d8bef9SDimitry Andric if (isVectorPredicate(MI)) {
1274e8d8bef9SDimitry Andric // Clear the existing predicate when we're not in VPT Active state,
1275e8d8bef9SDimitry Andric // otherwise we add to it.
1276e8d8bef9SDimitry Andric if (!isVectorPredicated(MI))
1277*0fca6ea1SDimitry Andric VPTstate.resetPredicate(MI);
1278e8d8bef9SDimitry Andric else
1279*0fca6ea1SDimitry Andric VPTstate.addPredicate(MI);
1280e8d8bef9SDimitry Andric }
1281e8d8bef9SDimitry Andric
1282e8d8bef9SDimitry Andric // Finally once the predicate has been modified, we can start a new VPT
1283e8d8bef9SDimitry Andric // block if necessary.
1284e8d8bef9SDimitry Andric if (isVPTOpcode(MI->getOpcode()))
1285*0fca6ea1SDimitry Andric VPTstate.CreateVPTBlock(MI);
1286e8d8bef9SDimitry Andric
1287e8d8bef9SDimitry Andric return true;
1288480093f4SDimitry Andric }
1289480093f4SDimitry Andric
runOnMachineFunction(MachineFunction & mf)12908bcb0991SDimitry Andric bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
129181ad6265SDimitry Andric const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
12928bcb0991SDimitry Andric if (!ST.hasLOB())
12930b57cec5SDimitry Andric return false;
12940b57cec5SDimitry Andric
12958bcb0991SDimitry Andric MF = &mf;
12968bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
12970b57cec5SDimitry Andric
1298*0fca6ea1SDimitry Andric MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
1299480093f4SDimitry Andric RDA = &getAnalysis<ReachingDefAnalysis>();
13008bcb0991SDimitry Andric MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
13018bcb0991SDimitry Andric MRI = &MF->getRegInfo();
13028bcb0991SDimitry Andric TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1303480093f4SDimitry Andric TRI = ST.getRegisterInfo();
1304*0fca6ea1SDimitry Andric BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);
13050b57cec5SDimitry Andric BBUtils->computeAllBlockSizes();
13068bcb0991SDimitry Andric BBUtils->adjustBBOffsetsAfter(&MF->front());
13070b57cec5SDimitry Andric
13080b57cec5SDimitry Andric bool Changed = false;
1309bdd1243dSDimitry Andric for (auto *ML : *MLI) {
1310e8d8bef9SDimitry Andric if (ML->isOutermost())
13110b57cec5SDimitry Andric Changed |= ProcessLoop(ML);
13120b57cec5SDimitry Andric }
13138bcb0991SDimitry Andric Changed |= RevertNonLoops();
13140b57cec5SDimitry Andric return Changed;
13150b57cec5SDimitry Andric }
13160b57cec5SDimitry Andric
ProcessLoop(MachineLoop * ML)13170b57cec5SDimitry Andric bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
13180b57cec5SDimitry Andric bool Changed = false;
13190b57cec5SDimitry Andric
13200b57cec5SDimitry Andric // Process inner loops first.
13210eae32dcSDimitry Andric for (MachineLoop *L : *ML)
13220eae32dcSDimitry Andric Changed |= ProcessLoop(L);
13230b57cec5SDimitry Andric
1324fe6060f1SDimitry Andric LLVM_DEBUG({
1325fe6060f1SDimitry Andric dbgs() << "ARM Loops: Processing loop containing:\n";
1326480093f4SDimitry Andric if (auto *Preheader = ML->getLoopPreheader())
1327fe6060f1SDimitry Andric dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1328fe6060f1SDimitry Andric else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1329fe6060f1SDimitry Andric dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1330480093f4SDimitry Andric for (auto *MBB : ML->getBlocks())
1331fe6060f1SDimitry Andric dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1332fe6060f1SDimitry Andric });
13330b57cec5SDimitry Andric
13340b57cec5SDimitry Andric // Search the given block for a loop start instruction. If one isn't found,
13350b57cec5SDimitry Andric // and there's only one predecessor block, search that one too.
13360b57cec5SDimitry Andric std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
13378bcb0991SDimitry Andric [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
13380b57cec5SDimitry Andric for (auto &MI : *MBB) {
1339480093f4SDimitry Andric if (isLoopStart(MI))
13400b57cec5SDimitry Andric return &MI;
13410b57cec5SDimitry Andric }
13420b57cec5SDimitry Andric if (MBB->pred_size() == 1)
13430b57cec5SDimitry Andric return SearchForStart(*MBB->pred_begin());
13440b57cec5SDimitry Andric return nullptr;
13450b57cec5SDimitry Andric };
13460b57cec5SDimitry Andric
13475ffd83dbSDimitry Andric LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
1348480093f4SDimitry Andric // Search the preheader for the start intrinsic.
13490b57cec5SDimitry Andric // FIXME: I don't see why we shouldn't be supporting multiple predecessors
13500b57cec5SDimitry Andric // with potentially multiple set.loop.iterations, so we need to enable this.
13515ffd83dbSDimitry Andric if (LoLoop.Preheader)
13525ffd83dbSDimitry Andric LoLoop.Start = SearchForStart(LoLoop.Preheader);
1353480093f4SDimitry Andric else
1354fe6060f1SDimitry Andric return Changed;
13550b57cec5SDimitry Andric
13560b57cec5SDimitry Andric // Find the low-overhead loop components and decide whether or not to fall
1357480093f4SDimitry Andric // back to a normal loop. Also look for a vctp instructions and decide
1358480093f4SDimitry Andric // whether we can convert that predicate using tail predication.
13590b57cec5SDimitry Andric for (auto *MBB : reverse(ML->getBlocks())) {
13600b57cec5SDimitry Andric for (auto &MI : *MBB) {
13615ffd83dbSDimitry Andric if (MI.isDebugValue())
13625ffd83dbSDimitry Andric continue;
13635ffd83dbSDimitry Andric else if (MI.getOpcode() == ARM::t2LoopDec)
1364480093f4SDimitry Andric LoLoop.Dec = &MI;
13650b57cec5SDimitry Andric else if (MI.getOpcode() == ARM::t2LoopEnd)
1366480093f4SDimitry Andric LoLoop.End = &MI;
1367e8d8bef9SDimitry Andric else if (MI.getOpcode() == ARM::t2LoopEndDec)
1368e8d8bef9SDimitry Andric LoLoop.End = LoLoop.Dec = &MI;
1369480093f4SDimitry Andric else if (isLoopStart(MI))
1370480093f4SDimitry Andric LoLoop.Start = &MI;
13718bcb0991SDimitry Andric else if (MI.getDesc().isCall()) {
13720b57cec5SDimitry Andric // TODO: Though the call will require LE to execute again, does this
13730b57cec5SDimitry Andric // mean we should revert? Always executing LE hopefully should be
13740b57cec5SDimitry Andric // faster than performing a sub,cmp,br or even subs,br.
1375480093f4SDimitry Andric LoLoop.Revert = true;
13768bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1377480093f4SDimitry Andric } else {
1378480093f4SDimitry Andric // Record VPR defs and build up their corresponding vpt blocks.
1379480093f4SDimitry Andric // Check we know how to tail predicate any mve instructions.
1380480093f4SDimitry Andric LoLoop.AnalyseMVEInst(&MI);
13818bcb0991SDimitry Andric }
13820b57cec5SDimitry Andric }
13830b57cec5SDimitry Andric }
13840b57cec5SDimitry Andric
1385480093f4SDimitry Andric LLVM_DEBUG(LoLoop.dump());
1386480093f4SDimitry Andric if (!LoLoop.FoundAllComponents()) {
1387480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1388fe6060f1SDimitry Andric return Changed;
13890b57cec5SDimitry Andric }
13900b57cec5SDimitry Andric
1391fe6060f1SDimitry Andric assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1392fe6060f1SDimitry Andric "Expected t2WhileLoopStart to be removed before regalloc!");
1393fe6060f1SDimitry Andric
1394e8d8bef9SDimitry Andric // Check that the only instruction using LoopDec is LoopEnd. This can only
1395e8d8bef9SDimitry Andric // happen when the Dec and End are separate, not a single t2LoopEndDec.
13965ffd83dbSDimitry Andric // TODO: Check for copy chains that really have no effect.
1397e8d8bef9SDimitry Andric if (LoLoop.Dec != LoLoop.End) {
13985ffd83dbSDimitry Andric SmallPtrSet<MachineInstr *, 2> Uses;
1399e8d8bef9SDimitry Andric RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
14005ffd83dbSDimitry Andric if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
14015ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
14025ffd83dbSDimitry Andric LoLoop.Revert = true;
14035ffd83dbSDimitry Andric }
1404e8d8bef9SDimitry Andric }
1405e8d8bef9SDimitry Andric LoLoop.Validate(BBUtils.get());
1406480093f4SDimitry Andric Expand(LoLoop);
14070b57cec5SDimitry Andric return true;
14080b57cec5SDimitry Andric }
14090b57cec5SDimitry Andric
14100b57cec5SDimitry Andric // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
14110b57cec5SDimitry Andric // beq that branches to the exit branch.
14128bcb0991SDimitry Andric // TODO: We could also try to generate a cbz if the value in LR is also in
14130b57cec5SDimitry Andric // another low register.
RevertWhile(MachineInstr * MI) const14140b57cec5SDimitry Andric void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
14150b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1416fe6060f1SDimitry Andric MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
14178bcb0991SDimitry Andric unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
14188bcb0991SDimitry Andric ARM::tBcc : ARM::t2Bcc;
14198bcb0991SDimitry Andric
1420fe6060f1SDimitry Andric RevertWhileLoopStartLR(MI, TII, BrOpc);
1421e8d8bef9SDimitry Andric }
1422e8d8bef9SDimitry Andric
RevertDo(MachineInstr * MI) const1423e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1424e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1425e8d8bef9SDimitry Andric RevertDoLoopStart(MI, TII);
14260b57cec5SDimitry Andric }
14270b57cec5SDimitry Andric
RevertLoopDec(MachineInstr * MI) const14285ffd83dbSDimitry Andric bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
14290b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
14300b57cec5SDimitry Andric MachineBasicBlock *MBB = MI->getParent();
14315ffd83dbSDimitry Andric SmallPtrSet<MachineInstr*, 1> Ignore;
14325ffd83dbSDimitry Andric for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
14335ffd83dbSDimitry Andric if (I->getOpcode() == ARM::t2LoopEnd) {
14345ffd83dbSDimitry Andric Ignore.insert(&*I);
14355ffd83dbSDimitry Andric break;
14365ffd83dbSDimitry Andric }
14375ffd83dbSDimitry Andric }
14388bcb0991SDimitry Andric
1439480093f4SDimitry Andric // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1440e8d8bef9SDimitry Andric bool SetFlags =
1441e8d8bef9SDimitry Andric RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
14428bcb0991SDimitry Andric
1443e8d8bef9SDimitry Andric llvm::RevertLoopDec(MI, TII, SetFlags);
14448bcb0991SDimitry Andric return SetFlags;
14450b57cec5SDimitry Andric }
14460b57cec5SDimitry Andric
14470b57cec5SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEnd(MachineInstr * MI,bool SkipCmp) const14488bcb0991SDimitry Andric void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
14490b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
14500b57cec5SDimitry Andric
14518bcb0991SDimitry Andric MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
14528bcb0991SDimitry Andric unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
14538bcb0991SDimitry Andric ARM::tBcc : ARM::t2Bcc;
14548bcb0991SDimitry Andric
1455e8d8bef9SDimitry Andric llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1456e8d8bef9SDimitry Andric }
1457e8d8bef9SDimitry Andric
1458e8d8bef9SDimitry Andric // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEndDec(MachineInstr * MI) const1459e8d8bef9SDimitry Andric void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
1460e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1461e8d8bef9SDimitry Andric assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1462e8d8bef9SDimitry Andric MachineBasicBlock *MBB = MI->getParent();
1463e8d8bef9SDimitry Andric
14648bcb0991SDimitry Andric MachineInstrBuilder MIB =
1465e8d8bef9SDimitry Andric BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1466e8d8bef9SDimitry Andric MIB.addDef(ARM::LR);
1467e8d8bef9SDimitry Andric MIB.add(MI->getOperand(1));
1468e8d8bef9SDimitry Andric MIB.addImm(1);
1469e8d8bef9SDimitry Andric MIB.addImm(ARMCC::AL);
1470e8d8bef9SDimitry Andric MIB.addReg(ARM::NoRegister);
1471e8d8bef9SDimitry Andric MIB.addReg(ARM::CPSR);
1472e8d8bef9SDimitry Andric MIB->getOperand(5).setIsDef(true);
1473e8d8bef9SDimitry Andric
1474e8d8bef9SDimitry Andric MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
1475e8d8bef9SDimitry Andric unsigned BrOpc =
1476e8d8bef9SDimitry Andric BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
1477e8d8bef9SDimitry Andric
1478e8d8bef9SDimitry Andric // Create bne
1479e8d8bef9SDimitry Andric MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1480e8d8bef9SDimitry Andric MIB.add(MI->getOperand(2)); // branch target
14810b57cec5SDimitry Andric MIB.addImm(ARMCC::NE); // condition code
14820b57cec5SDimitry Andric MIB.addReg(ARM::CPSR);
1483e8d8bef9SDimitry Andric
14840b57cec5SDimitry Andric MI->eraseFromParent();
14850b57cec5SDimitry Andric }
14860b57cec5SDimitry Andric
14875ffd83dbSDimitry Andric // Perform dead code elimation on the loop iteration count setup expression.
14885ffd83dbSDimitry Andric // If we are tail-predicating, the number of elements to be processed is the
14895ffd83dbSDimitry Andric // operand of the VCTP instruction in the vector body, see getCount(), which is
14905ffd83dbSDimitry Andric // register $r3 in this example:
14915ffd83dbSDimitry Andric //
14925ffd83dbSDimitry Andric // $lr = big-itercount-expression
14935ffd83dbSDimitry Andric // ..
1494e8d8bef9SDimitry Andric // $lr = t2DoLoopStart renamable $lr
14955ffd83dbSDimitry Andric // vector.body:
14965ffd83dbSDimitry Andric // ..
14975ffd83dbSDimitry Andric // $vpr = MVE_VCTP32 renamable $r3
14985ffd83dbSDimitry Andric // renamable $lr = t2LoopDec killed renamable $lr, 1
14995ffd83dbSDimitry Andric // t2LoopEnd renamable $lr, %vector.body
15005ffd83dbSDimitry Andric // tB %end
15015ffd83dbSDimitry Andric //
15025ffd83dbSDimitry Andric // What we would like achieve here is to replace the do-loop start pseudo
15035ffd83dbSDimitry Andric // instruction t2DoLoopStart with:
15045ffd83dbSDimitry Andric //
15055ffd83dbSDimitry Andric // $lr = MVE_DLSTP_32 killed renamable $r3
15065ffd83dbSDimitry Andric //
15075ffd83dbSDimitry Andric // Thus, $r3 which defines the number of elements, is written to $lr,
15085ffd83dbSDimitry Andric // and then we want to delete the whole chain that used to define $lr,
15095ffd83dbSDimitry Andric // see the comment below how this chain could look like.
15105ffd83dbSDimitry Andric //
IterationCountDCE(LowOverheadLoop & LoLoop)15115ffd83dbSDimitry Andric void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
15125ffd83dbSDimitry Andric if (!LoLoop.IsTailPredicationLegal())
15135ffd83dbSDimitry Andric return;
15145ffd83dbSDimitry Andric
15155ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
15165ffd83dbSDimitry Andric
1517fe6060f1SDimitry Andric MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
15185ffd83dbSDimitry Andric if (!Def) {
15195ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
15205ffd83dbSDimitry Andric return;
15215ffd83dbSDimitry Andric }
15225ffd83dbSDimitry Andric
15235ffd83dbSDimitry Andric // Collect and remove the users of iteration count.
15245ffd83dbSDimitry Andric SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
1525e8d8bef9SDimitry Andric LoLoop.End };
1526e8d8bef9SDimitry Andric if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
15275ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
15285ffd83dbSDimitry Andric }
15295ffd83dbSDimitry Andric
ExpandLoopStart(LowOverheadLoop & LoLoop)1530480093f4SDimitry Andric MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
15315ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
15325ffd83dbSDimitry Andric // When using tail-predication, try to delete the dead code that was used to
15335ffd83dbSDimitry Andric // calculate the number of loop iterations.
15345ffd83dbSDimitry Andric IterationCountDCE(LoLoop);
15355ffd83dbSDimitry Andric
1536e8d8bef9SDimitry Andric MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1537480093f4SDimitry Andric MachineInstr *Start = LoLoop.Start;
1538e8d8bef9SDimitry Andric MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1539480093f4SDimitry Andric unsigned Opc = LoLoop.getStartOpcode();
1540e8d8bef9SDimitry Andric MachineOperand &Count = LoLoop.getLoopStartOperand();
1541480093f4SDimitry Andric
1542fe6060f1SDimitry Andric // A DLS lr, lr we needn't emit
1543fe6060f1SDimitry Andric MachineInstr* NewStart;
1544bdd1243dSDimitry Andric if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
1545bdd1243dSDimitry Andric Count.getReg() == ARM::LR) {
1546fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1547fe6060f1SDimitry Andric NewStart = nullptr;
1548fe6060f1SDimitry Andric } else {
15490b57cec5SDimitry Andric MachineInstrBuilder MIB =
1550e8d8bef9SDimitry Andric BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
15510b57cec5SDimitry Andric
15520b57cec5SDimitry Andric MIB.addDef(ARM::LR);
1553480093f4SDimitry Andric MIB.add(Count);
1554fe6060f1SDimitry Andric if (isWhileLoopStart(*Start))
1555fe6060f1SDimitry Andric MIB.addMBB(getWhileLoopStartTargetBB(*Start));
1556fe6060f1SDimitry Andric
1557fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1558fe6060f1SDimitry Andric NewStart = &*MIB;
1559fe6060f1SDimitry Andric }
15600b57cec5SDimitry Andric
15615ffd83dbSDimitry Andric LoLoop.ToRemove.insert(Start);
1562fe6060f1SDimitry Andric return NewStart;
1563480093f4SDimitry Andric }
1564480093f4SDimitry Andric
ConvertVPTBlocks(LowOverheadLoop & LoLoop)1565480093f4SDimitry Andric void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1566480093f4SDimitry Andric auto RemovePredicate = [](MachineInstr *MI) {
15674652422eSDimitry Andric if (MI->isDebugInstr())
15684652422eSDimitry Andric return;
1569480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
15704652422eSDimitry Andric int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
15714652422eSDimitry Andric assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1572480093f4SDimitry Andric assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1573480093f4SDimitry Andric "Expected Then predicate!");
1574480093f4SDimitry Andric MI->getOperand(PIdx).setImm(ARMVCC::None);
1575480093f4SDimitry Andric MI->getOperand(PIdx + 1).setReg(0);
15760b57cec5SDimitry Andric };
15770b57cec5SDimitry Andric
1578480093f4SDimitry Andric for (auto &Block : LoLoop.getVPTBlocks()) {
1579e8d8bef9SDimitry Andric SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1580e8d8bef9SDimitry Andric
1581e8d8bef9SDimitry Andric auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1582e8d8bef9SDimitry Andric assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1583e8d8bef9SDimitry Andric // Replace the VCMP with a VPT
1584e8d8bef9SDimitry Andric MachineInstrBuilder MIB =
1585e8d8bef9SDimitry Andric BuildMI(*At->getParent(), At, At->getDebugLoc(),
1586e8d8bef9SDimitry Andric TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1587e8d8bef9SDimitry Andric MIB.addImm(ARMVCC::Then);
1588e8d8bef9SDimitry Andric // Register one
1589e8d8bef9SDimitry Andric MIB.add(TheVCMP->getOperand(1));
1590e8d8bef9SDimitry Andric // Register two
1591e8d8bef9SDimitry Andric MIB.add(TheVCMP->getOperand(2));
1592e8d8bef9SDimitry Andric // The comparison code, e.g. ge, eq, lt
1593e8d8bef9SDimitry Andric MIB.add(TheVCMP->getOperand(3));
1594e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1595e8d8bef9SDimitry Andric LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1596e8d8bef9SDimitry Andric LoLoop.ToRemove.insert(TheVCMP);
1597e8d8bef9SDimitry Andric TheVCMP = nullptr;
1598e8d8bef9SDimitry Andric };
1599e8d8bef9SDimitry Andric
1600*0fca6ea1SDimitry Andric if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1601e8d8bef9SDimitry Andric MachineInstr *VPST = Insts.front();
1602*0fca6ea1SDimitry Andric if (Block.hasUniformPredicate()) {
1603e8d8bef9SDimitry Andric // A vpt block starting with VPST, is only predicated upon vctp and has no
1604e8d8bef9SDimitry Andric // internal vpr defs:
1605e8d8bef9SDimitry Andric // - Remove vpst.
1606e8d8bef9SDimitry Andric // - Unpredicate the remaining instructions.
1607e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1608e8d8bef9SDimitry Andric for (unsigned i = 1; i < Insts.size(); ++i)
1609e8d8bef9SDimitry Andric RemovePredicate(Insts[i]);
1610e8d8bef9SDimitry Andric } else {
16115ffd83dbSDimitry Andric // The VPT block has a non-uniform predicate but it uses a vpst and its
16125ffd83dbSDimitry Andric // entry is guarded only by a vctp, which means we:
1613480093f4SDimitry Andric // - Need to remove the original vpst.
1614480093f4SDimitry Andric // - Then need to unpredicate any following instructions, until
1615480093f4SDimitry Andric // we come across the divergent vpr def.
1616480093f4SDimitry Andric // - Insert a new vpst to predicate the instruction(s) that following
1617480093f4SDimitry Andric // the divergent vpr def.
1618*0fca6ea1SDimitry Andric MachineInstr *Divergent = Block.getDivergent();
16194652422eSDimitry Andric MachineBasicBlock *MBB = Divergent->getParent();
1620e8d8bef9SDimitry Andric auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
16214652422eSDimitry Andric while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
16224652422eSDimitry Andric ++DivergentNext;
16234652422eSDimitry Andric
1624e8d8bef9SDimitry Andric bool DivergentNextIsPredicated =
16254652422eSDimitry Andric DivergentNext != MBB->end() &&
1626e8d8bef9SDimitry Andric getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
1627e8d8bef9SDimitry Andric
1628e8d8bef9SDimitry Andric for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
1629e8d8bef9SDimitry Andric I != E; ++I)
1630480093f4SDimitry Andric RemovePredicate(&*I);
1631480093f4SDimitry Andric
1632e8d8bef9SDimitry Andric // Check if the instruction defining vpr is a vcmp so it can be combined
1633e8d8bef9SDimitry Andric // with the VPST This should be the divergent instruction
1634e8d8bef9SDimitry Andric MachineInstr *VCMP =
1635e8d8bef9SDimitry Andric VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1636e8d8bef9SDimitry Andric
1637e8d8bef9SDimitry Andric if (DivergentNextIsPredicated) {
1638e8d8bef9SDimitry Andric // Insert a VPST at the divergent only if the next instruction
1639e8d8bef9SDimitry Andric // would actually use it. A VCMP following a VPST can be
1640e8d8bef9SDimitry Andric // merged into a VPT so do that instead if the VCMP exists.
1641e8d8bef9SDimitry Andric if (!VCMP) {
1642e8d8bef9SDimitry Andric // Create a VPST (with a null mask for now, we'll recompute it
1643e8d8bef9SDimitry Andric // later)
1644e8d8bef9SDimitry Andric MachineInstrBuilder MIB =
1645e8d8bef9SDimitry Andric BuildMI(*Divergent->getParent(), Divergent,
1646e8d8bef9SDimitry Andric Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
16475ffd83dbSDimitry Andric MIB.addImm(0);
1648480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
16495ffd83dbSDimitry Andric LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1650e8d8bef9SDimitry Andric } else {
1651e8d8bef9SDimitry Andric // No RDA checks are necessary here since the VPST would have been
1652e8d8bef9SDimitry Andric // directly after the VCMP
1653e8d8bef9SDimitry Andric ReplaceVCMPWithVPT(VCMP, VCMP);
16545ffd83dbSDimitry Andric }
16555ffd83dbSDimitry Andric }
16565ffd83dbSDimitry Andric }
1657e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1658e8d8bef9SDimitry Andric LoLoop.ToRemove.insert(VPST);
1659e8d8bef9SDimitry Andric } else if (Block.containsVCTP()) {
1660e8d8bef9SDimitry Andric // The vctp will be removed, so either the entire block will be dead or
1661e8d8bef9SDimitry Andric // the block mask of the vp(s)t will need to be recomputed.
1662e8d8bef9SDimitry Andric MachineInstr *VPST = Insts.front();
1663e8d8bef9SDimitry Andric if (Block.size() == 2) {
1664e8d8bef9SDimitry Andric assert(VPST->getOpcode() == ARM::MVE_VPST &&
1665e8d8bef9SDimitry Andric "Found a VPST in an otherwise empty vpt block");
1666e8d8bef9SDimitry Andric LoLoop.ToRemove.insert(VPST);
1667e8d8bef9SDimitry Andric } else
1668e8d8bef9SDimitry Andric LoLoop.BlockMasksToRecompute.insert(VPST);
1669e8d8bef9SDimitry Andric } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1670e8d8bef9SDimitry Andric // If this block starts with a VPST then attempt to merge it with the
1671e8d8bef9SDimitry Andric // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1672e8d8bef9SDimitry Andric // block that no longer exists
1673e8d8bef9SDimitry Andric MachineInstr *VPST = Insts.front();
1674e8d8bef9SDimitry Andric auto Next = ++MachineBasicBlock::iterator(VPST);
1675e8d8bef9SDimitry Andric assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1676e8d8bef9SDimitry Andric "The instruction after a VPST must be predicated");
1677e8d8bef9SDimitry Andric (void)Next;
1678e8d8bef9SDimitry Andric MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1679e8d8bef9SDimitry Andric if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1680e8d8bef9SDimitry Andric !LoLoop.ToRemove.contains(VprDef)) {
1681e8d8bef9SDimitry Andric MachineInstr *VCMP = VprDef;
1682e8d8bef9SDimitry Andric // The VCMP and VPST can only be merged if the VCMP's operands will have
1683e8d8bef9SDimitry Andric // the same values at the VPST.
1684e8d8bef9SDimitry Andric // If any of the instructions between the VCMP and VPST are predicated
1685e8d8bef9SDimitry Andric // then a different code path is expected to have merged the VCMP and
1686e8d8bef9SDimitry Andric // VPST already.
16870eae32dcSDimitry Andric if (std::none_of(++MachineBasicBlock::iterator(VCMP),
1688e8d8bef9SDimitry Andric MachineBasicBlock::iterator(VPST), hasVPRUse) &&
1689e8d8bef9SDimitry Andric RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
1690e8d8bef9SDimitry Andric RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1691e8d8bef9SDimitry Andric ReplaceVCMPWithVPT(VCMP, VPST);
1692e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1693e8d8bef9SDimitry Andric LoLoop.ToRemove.insert(VPST);
1694480093f4SDimitry Andric }
1695480093f4SDimitry Andric }
16965ffd83dbSDimitry Andric }
16975ffd83dbSDimitry Andric }
1698e8d8bef9SDimitry Andric
1699e8d8bef9SDimitry Andric LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
1700480093f4SDimitry Andric }
1701480093f4SDimitry Andric
Expand(LowOverheadLoop & LoLoop)1702480093f4SDimitry Andric void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1703480093f4SDimitry Andric
17040b57cec5SDimitry Andric // Combine the LoopDec and LoopEnd instructions into LE(TP).
1705480093f4SDimitry Andric auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1706480093f4SDimitry Andric MachineInstr *End = LoLoop.End;
17070b57cec5SDimitry Andric MachineBasicBlock *MBB = End->getParent();
1708480093f4SDimitry Andric unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1709480093f4SDimitry Andric ARM::MVE_LETP : ARM::t2LEUpdate;
17100b57cec5SDimitry Andric MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1711480093f4SDimitry Andric TII->get(Opc));
17120b57cec5SDimitry Andric MIB.addDef(ARM::LR);
1713e8d8bef9SDimitry Andric unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
1714e8d8bef9SDimitry Andric MIB.add(End->getOperand(Off + 0));
1715e8d8bef9SDimitry Andric MIB.add(End->getOperand(Off + 1));
17160b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
17175ffd83dbSDimitry Andric LoLoop.ToRemove.insert(LoLoop.Dec);
17185ffd83dbSDimitry Andric LoLoop.ToRemove.insert(End);
17190b57cec5SDimitry Andric return &*MIB;
17200b57cec5SDimitry Andric };
17210b57cec5SDimitry Andric
17220b57cec5SDimitry Andric // TODO: We should be able to automatically remove these branches before we
17230b57cec5SDimitry Andric // get here - probably by teaching analyzeBranch about the pseudo
17240b57cec5SDimitry Andric // instructions.
17250b57cec5SDimitry Andric // If there is an unconditional branch, after I, that just branches to the
17260b57cec5SDimitry Andric // next block, remove it.
17270b57cec5SDimitry Andric auto RemoveDeadBranch = [](MachineInstr *I) {
17280b57cec5SDimitry Andric MachineBasicBlock *BB = I->getParent();
17290b57cec5SDimitry Andric MachineInstr *Terminator = &BB->instr_back();
17300b57cec5SDimitry Andric if (Terminator->isUnconditionalBranch() && I != Terminator) {
17310b57cec5SDimitry Andric MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
17320b57cec5SDimitry Andric if (BB->isLayoutSuccessor(Succ)) {
17330b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
17340b57cec5SDimitry Andric Terminator->eraseFromParent();
17350b57cec5SDimitry Andric }
17360b57cec5SDimitry Andric }
17370b57cec5SDimitry Andric };
17380b57cec5SDimitry Andric
1739349cc55cSDimitry Andric // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1740349cc55cSDimitry Andric // Anything other MQPRCopy can be converted to MVE_VORR later on.
1741349cc55cSDimitry Andric auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
1742349cc55cSDimitry Andric for (auto *MI : VMOVCopies) {
1743349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1744349cc55cSDimitry Andric assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1745349cc55cSDimitry Andric MachineBasicBlock *MBB = MI->getParent();
1746349cc55cSDimitry Andric Register Dst = MI->getOperand(0).getReg();
1747349cc55cSDimitry Andric Register Src = MI->getOperand(1).getReg();
1748349cc55cSDimitry Andric auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1749349cc55cSDimitry Andric ARM::D0 + (Dst - ARM::Q0) * 2)
1750349cc55cSDimitry Andric .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
1751349cc55cSDimitry Andric .add(predOps(ARMCC::AL));
1752349cc55cSDimitry Andric (void)MIB1;
1753349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " into " << *MIB1);
1754349cc55cSDimitry Andric auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1755349cc55cSDimitry Andric ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
1756349cc55cSDimitry Andric .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
1757349cc55cSDimitry Andric .add(predOps(ARMCC::AL));
1758349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " and " << *MIB2);
1759349cc55cSDimitry Andric (void)MIB2;
1760349cc55cSDimitry Andric MI->eraseFromParent();
1761349cc55cSDimitry Andric }
1762349cc55cSDimitry Andric };
1763349cc55cSDimitry Andric
1764480093f4SDimitry Andric if (LoLoop.Revert) {
1765fe6060f1SDimitry Andric if (isWhileLoopStart(*LoLoop.Start))
1766480093f4SDimitry Andric RevertWhile(LoLoop.Start);
17670b57cec5SDimitry Andric else
1768e8d8bef9SDimitry Andric RevertDo(LoLoop.Start);
1769e8d8bef9SDimitry Andric if (LoLoop.Dec == LoLoop.End)
1770e8d8bef9SDimitry Andric RevertLoopEndDec(LoLoop.End);
1771e8d8bef9SDimitry Andric else
1772e8d8bef9SDimitry Andric RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
17730b57cec5SDimitry Andric } else {
1774349cc55cSDimitry Andric ExpandVMOVCopies(LoLoop.VMOVCopies);
1775480093f4SDimitry Andric LoLoop.Start = ExpandLoopStart(LoLoop);
1776fe6060f1SDimitry Andric if (LoLoop.Start)
1777480093f4SDimitry Andric RemoveDeadBranch(LoLoop.Start);
1778480093f4SDimitry Andric LoLoop.End = ExpandLoopEnd(LoLoop);
1779480093f4SDimitry Andric RemoveDeadBranch(LoLoop.End);
1780e8d8bef9SDimitry Andric if (LoLoop.IsTailPredicationLegal())
1781480093f4SDimitry Andric ConvertVPTBlocks(LoLoop);
17825ffd83dbSDimitry Andric for (auto *I : LoLoop.ToRemove) {
17835ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
17845ffd83dbSDimitry Andric I->eraseFromParent();
17855ffd83dbSDimitry Andric }
17865ffd83dbSDimitry Andric for (auto *I : LoLoop.BlockMasksToRecompute) {
17875ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
17885ffd83dbSDimitry Andric recomputeVPTBlockMask(*I);
17895ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << " ... done: " << *I);
1790480093f4SDimitry Andric }
17910b57cec5SDimitry Andric }
17925ffd83dbSDimitry Andric
17935ffd83dbSDimitry Andric PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
17945ffd83dbSDimitry Andric DFS.ProcessLoop();
17955ffd83dbSDimitry Andric const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1796*0fca6ea1SDimitry Andric fullyRecomputeLiveIns(PostOrder);
17975ffd83dbSDimitry Andric
17985ffd83dbSDimitry Andric for (auto *MBB : reverse(PostOrder))
17995ffd83dbSDimitry Andric recomputeLivenessFlags(*MBB);
18005ffd83dbSDimitry Andric
18015ffd83dbSDimitry Andric // We've moved, removed and inserted new instructions, so update RDA.
18025ffd83dbSDimitry Andric RDA->reset();
18030b57cec5SDimitry Andric }
18040b57cec5SDimitry Andric
RevertNonLoops()18058bcb0991SDimitry Andric bool ARMLowOverheadLoops::RevertNonLoops() {
18068bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
18078bcb0991SDimitry Andric bool Changed = false;
18088bcb0991SDimitry Andric
18098bcb0991SDimitry Andric for (auto &MBB : *MF) {
18108bcb0991SDimitry Andric SmallVector<MachineInstr*, 4> Starts;
18118bcb0991SDimitry Andric SmallVector<MachineInstr*, 4> Decs;
18128bcb0991SDimitry Andric SmallVector<MachineInstr*, 4> Ends;
1813e8d8bef9SDimitry Andric SmallVector<MachineInstr *, 4> EndDecs;
1814349cc55cSDimitry Andric SmallVector<MachineInstr *, 4> MQPRCopies;
18158bcb0991SDimitry Andric
18168bcb0991SDimitry Andric for (auto &I : MBB) {
1817480093f4SDimitry Andric if (isLoopStart(I))
18188bcb0991SDimitry Andric Starts.push_back(&I);
18198bcb0991SDimitry Andric else if (I.getOpcode() == ARM::t2LoopDec)
18208bcb0991SDimitry Andric Decs.push_back(&I);
18218bcb0991SDimitry Andric else if (I.getOpcode() == ARM::t2LoopEnd)
18228bcb0991SDimitry Andric Ends.push_back(&I);
1823e8d8bef9SDimitry Andric else if (I.getOpcode() == ARM::t2LoopEndDec)
1824e8d8bef9SDimitry Andric EndDecs.push_back(&I);
1825349cc55cSDimitry Andric else if (I.getOpcode() == ARM::MQPRCopy)
1826349cc55cSDimitry Andric MQPRCopies.push_back(&I);
18278bcb0991SDimitry Andric }
18288bcb0991SDimitry Andric
1829349cc55cSDimitry Andric if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1830349cc55cSDimitry Andric MQPRCopies.empty())
18318bcb0991SDimitry Andric continue;
18328bcb0991SDimitry Andric
18338bcb0991SDimitry Andric Changed = true;
18348bcb0991SDimitry Andric
18358bcb0991SDimitry Andric for (auto *Start : Starts) {
1836fe6060f1SDimitry Andric if (isWhileLoopStart(*Start))
18378bcb0991SDimitry Andric RevertWhile(Start);
18388bcb0991SDimitry Andric else
1839e8d8bef9SDimitry Andric RevertDo(Start);
18408bcb0991SDimitry Andric }
18418bcb0991SDimitry Andric for (auto *Dec : Decs)
18428bcb0991SDimitry Andric RevertLoopDec(Dec);
18438bcb0991SDimitry Andric
18448bcb0991SDimitry Andric for (auto *End : Ends)
18458bcb0991SDimitry Andric RevertLoopEnd(End);
1846e8d8bef9SDimitry Andric for (auto *End : EndDecs)
1847e8d8bef9SDimitry Andric RevertLoopEndDec(End);
1848349cc55cSDimitry Andric for (auto *MI : MQPRCopies) {
1849349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1850349cc55cSDimitry Andric assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1851349cc55cSDimitry Andric MachineBasicBlock *MBB = MI->getParent();
1852349cc55cSDimitry Andric auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
1853349cc55cSDimitry Andric MI->getOperand(0).getReg())
1854349cc55cSDimitry Andric .add(MI->getOperand(1))
1855349cc55cSDimitry Andric .add(MI->getOperand(1));
1856349cc55cSDimitry Andric addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
1857349cc55cSDimitry Andric MI->eraseFromParent();
1858349cc55cSDimitry Andric }
18598bcb0991SDimitry Andric }
18608bcb0991SDimitry Andric return Changed;
18618bcb0991SDimitry Andric }
18628bcb0991SDimitry Andric
createARMLowOverheadLoopsPass()18630b57cec5SDimitry Andric FunctionPass *llvm::createARMLowOverheadLoopsPass() {
18640b57cec5SDimitry Andric return new ARMLowOverheadLoops();
18650b57cec5SDimitry Andric }
1866