xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (revision ee0fe82ee2892f5ece189db0eab38913aaab5f0f)
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 ///   form should be in the preheader, whereas the while form should be in the
14 ///   preheaders only predecessor. TODO: Could DoLoopStart get moved into the
15 ///   pre-preheader?
16 /// - t2LoopDec - placed within in the loop body.
17 /// - t2LoopEnd - the loop latch terminator.
18 ///
19 //===----------------------------------------------------------------------===//
20 
21 #include "ARM.h"
22 #include "ARMBaseInstrInfo.h"
23 #include "ARMBaseRegisterInfo.h"
24 #include "ARMBasicBlockInfo.h"
25 #include "ARMSubtarget.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineLoopInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "arm-low-overhead-loops"
33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
34 
35 namespace {
36 
37   class ARMLowOverheadLoops : public MachineFunctionPass {
38     const ARMBaseInstrInfo    *TII = nullptr;
39     MachineRegisterInfo       *MRI = nullptr;
40     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
41 
42   public:
43     static char ID;
44 
45     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
46 
47     void getAnalysisUsage(AnalysisUsage &AU) const override {
48       AU.setPreservesCFG();
49       AU.addRequired<MachineLoopInfo>();
50       MachineFunctionPass::getAnalysisUsage(AU);
51     }
52 
53     bool runOnMachineFunction(MachineFunction &MF) override;
54 
55     bool ProcessLoop(MachineLoop *ML);
56 
57     void RevertWhile(MachineInstr *MI) const;
58 
59     void RevertLoopDec(MachineInstr *MI) const;
60 
61     void RevertLoopEnd(MachineInstr *MI) const;
62 
63     void Expand(MachineLoop *ML, MachineInstr *Start,
64                 MachineInstr *Dec, MachineInstr *End, bool Revert);
65 
66     MachineFunctionProperties getRequiredProperties() const override {
67       return MachineFunctionProperties().set(
68           MachineFunctionProperties::Property::NoVRegs);
69     }
70 
71     StringRef getPassName() const override {
72       return ARM_LOW_OVERHEAD_LOOPS_NAME;
73     }
74   };
75 }
76 
77 char ARMLowOverheadLoops::ID = 0;
78 
79 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
80                 false, false)
81 
82 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
83   if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
84     return false;
85 
86   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
87 
88   auto &MLI = getAnalysis<MachineLoopInfo>();
89   MRI = &MF.getRegInfo();
90   TII = static_cast<const ARMBaseInstrInfo*>(
91     MF.getSubtarget().getInstrInfo());
92   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
93   BBUtils->computeAllBlockSizes();
94   BBUtils->adjustBBOffsetsAfter(&MF.front());
95 
96   bool Changed = false;
97   for (auto ML : MLI) {
98     if (!ML->getParentLoop())
99       Changed |= ProcessLoop(ML);
100   }
101   return Changed;
102 }
103 
104 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
105 
106   bool Changed = false;
107 
108   // Process inner loops first.
109   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
110     Changed |= ProcessLoop(*I);
111 
112   LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
113 
114   auto IsLoopStart = [](MachineInstr &MI) {
115     return MI.getOpcode() == ARM::t2DoLoopStart ||
116            MI.getOpcode() == ARM::t2WhileLoopStart;
117   };
118 
119   // Search the given block for a loop start instruction. If one isn't found,
120   // and there's only one predecessor block, search that one too.
121   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
122     [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
123     for (auto &MI : *MBB) {
124       if (IsLoopStart(MI))
125         return &MI;
126     }
127     if (MBB->pred_size() == 1)
128       return SearchForStart(*MBB->pred_begin());
129     return nullptr;
130   };
131 
132   MachineInstr *Start = nullptr;
133   MachineInstr *Dec = nullptr;
134   MachineInstr *End = nullptr;
135   bool Revert = false;
136 
137   // Search the preheader for the start intrinsic, or look through the
138   // predecessors of the header to find exactly one set.iterations intrinsic.
139   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
140   // with potentially multiple set.loop.iterations, so we need to enable this.
141   if (auto *Preheader = ML->getLoopPreheader()) {
142     Start = SearchForStart(Preheader);
143   } else {
144     LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
145                << " - Performing manual predecessor search.\n");
146     MachineBasicBlock *Pred = nullptr;
147     for (auto *MBB : ML->getHeader()->predecessors()) {
148       if (!ML->contains(MBB)) {
149         if (Pred) {
150           LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
151           Start = nullptr;
152           break;
153         }
154         Pred = MBB;
155         Start = SearchForStart(MBB);
156       }
157     }
158   }
159 
160   // Find the low-overhead loop components and decide whether or not to fall
161   // back to a normal loop.
162   for (auto *MBB : reverse(ML->getBlocks())) {
163     for (auto &MI : *MBB) {
164       if (MI.getOpcode() == ARM::t2LoopDec)
165         Dec = &MI;
166       else if (MI.getOpcode() == ARM::t2LoopEnd)
167         End = &MI;
168       else if (MI.getDesc().isCall())
169         // TODO: Though the call will require LE to execute again, does this
170         // mean we should revert? Always executing LE hopefully should be
171         // faster than performing a sub,cmp,br or even subs,br.
172         Revert = true;
173 
174       if (!Dec)
175         continue;
176 
177       // If we find that we load/store LR between LoopDec and LoopEnd, expect
178       // that the decremented value has been spilled to the stack. Because
179       // this value isn't actually going to be produced until the latch, by LE,
180       // we would need to generate a real sub. The value is also likely to be
181       // reloaded for use of LoopEnd - in which in case we'd need to perform
182       // an add because it gets negated again by LE! The other option is to
183       // then generate the other form of LE which doesn't perform the sub.
184       if (MI.mayLoad() || MI.mayStore())
185         Revert =
186           MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
187     }
188 
189     if (Dec && End && Revert)
190       break;
191   }
192 
193   if (!Start && !Dec && !End) {
194     LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
195     return Changed;
196   } if (!(Start && Dec && End)) {
197     report_fatal_error("Failed to find all loop components");
198   }
199 
200   if (!End->getOperand(1).isMBB() ||
201       End->getOperand(1).getMBB() != ML->getHeader())
202     report_fatal_error("Expected LoopEnd to target Loop Header");
203 
204   // The WLS and LE instructions have 12-bits for the label offset. WLS
205   // requires a positive offset, while LE uses negative.
206   if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
207       !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
208     LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
209     Revert = true;
210   }
211   if (Start->getOpcode() == ARM::t2WhileLoopStart &&
212       (BBUtils->getOffsetOf(Start) >
213        BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
214        !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
215     LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
216     Revert = true;
217   }
218 
219   LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
220                     << " - Found Loop Dec: " << *Dec
221                     << " - Found Loop End: " << *End);
222 
223   Expand(ML, Start, Dec, End, Revert);
224   return true;
225 }
226 
227 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
228 // beq that branches to the exit branch.
229 // FIXME: Need to check that we're not trashing the CPSR when generating the
230 // cmp. We could also try to generate a cbz if the value in LR is also in
231 // another low register.
232 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
233   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
234   MachineBasicBlock *MBB = MI->getParent();
235   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
236                                     TII->get(ARM::t2CMPri));
237   MIB.addReg(ARM::LR);
238   MIB.addImm(0);
239   MIB.addImm(ARMCC::AL);
240   MIB.addReg(ARM::CPSR);
241 
242   // TODO: Try to use tBcc instead
243   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
244   MIB.add(MI->getOperand(1));   // branch target
245   MIB.addImm(ARMCC::EQ);        // condition code
246   MIB.addReg(ARM::CPSR);
247   MI->eraseFromParent();
248 }
249 
250 // TODO: Check flags so that we can possibly generate a tSubs or tSub.
251 void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
252   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
253   MachineBasicBlock *MBB = MI->getParent();
254   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
255                                     TII->get(ARM::t2SUBri));
256   MIB.addDef(ARM::LR);
257   MIB.add(MI->getOperand(1));
258   MIB.add(MI->getOperand(2));
259   MIB.addImm(ARMCC::AL);
260   MIB.addReg(0);
261   MIB.addReg(0);
262   MI->eraseFromParent();
263 }
264 
265 // Generate a subs, or sub and cmp, and a branch instead of an LE.
266 // FIXME: Need to check that we're not trashing the CPSR when generating
267 // the cmp.
268 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
269   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
270 
271   // Create cmp
272   MachineBasicBlock *MBB = MI->getParent();
273   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
274                                     TII->get(ARM::t2CMPri));
275   MIB.addReg(ARM::LR);
276   MIB.addImm(0);
277   MIB.addImm(ARMCC::AL);
278   MIB.addReg(ARM::CPSR);
279 
280   // TODO Try to use tBcc instead.
281   // Create bne
282   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
283   MIB.add(MI->getOperand(1));   // branch target
284   MIB.addImm(ARMCC::NE);        // condition code
285   MIB.addReg(ARM::CPSR);
286   MI->eraseFromParent();
287 }
288 
289 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
290                                  MachineInstr *Dec, MachineInstr *End,
291                                  bool Revert) {
292 
293   auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
294     // The trip count should already been held in LR since the instructions
295     // within the loop can only read and write to LR. So, there should be a
296     // mov to setup the count. WLS/DLS perform this move, so find the original
297     // and delete it - inserting WLS/DLS in its place.
298     MachineBasicBlock *MBB = Start->getParent();
299     MachineInstr *InsertPt = Start;
300     for (auto &I : MRI->def_instructions(ARM::LR)) {
301       if (I.getParent() != MBB)
302         continue;
303 
304       // Always execute.
305       if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
306         continue;
307 
308       // Only handle move reg, if the trip count it will need moving into a reg
309       // before the setup instruction anyway.
310       if (!I.getDesc().isMoveReg() ||
311           !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
312         continue;
313       InsertPt = &I;
314       break;
315     }
316 
317     unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
318       ARM::t2DLS : ARM::t2WLS;
319     MachineInstrBuilder MIB =
320       BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
321 
322     MIB.addDef(ARM::LR);
323     MIB.add(Start->getOperand(0));
324     if (Opc == ARM::t2WLS)
325       MIB.add(Start->getOperand(1));
326 
327     if (InsertPt != Start)
328       InsertPt->eraseFromParent();
329     Start->eraseFromParent();
330     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
331     return &*MIB;
332   };
333 
334   // Combine the LoopDec and LoopEnd instructions into LE(TP).
335   auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
336                               MachineInstr *End) {
337     MachineBasicBlock *MBB = End->getParent();
338     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
339                                       TII->get(ARM::t2LEUpdate));
340     MIB.addDef(ARM::LR);
341     MIB.add(End->getOperand(0));
342     MIB.add(End->getOperand(1));
343     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
344 
345     End->eraseFromParent();
346     Dec->eraseFromParent();
347     return &*MIB;
348   };
349 
350   // TODO: We should be able to automatically remove these branches before we
351   // get here - probably by teaching analyzeBranch about the pseudo
352   // instructions.
353   // If there is an unconditional branch, after I, that just branches to the
354   // next block, remove it.
355   auto RemoveDeadBranch = [](MachineInstr *I) {
356     MachineBasicBlock *BB = I->getParent();
357     MachineInstr *Terminator = &BB->instr_back();
358     if (Terminator->isUnconditionalBranch() && I != Terminator) {
359       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
360       if (BB->isLayoutSuccessor(Succ)) {
361         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
362         Terminator->eraseFromParent();
363       }
364     }
365   };
366 
367   if (Revert) {
368     if (Start->getOpcode() == ARM::t2WhileLoopStart)
369       RevertWhile(Start);
370     else
371       Start->eraseFromParent();
372     RevertLoopDec(Dec);
373     RevertLoopEnd(End);
374   } else {
375     Start = ExpandLoopStart(ML, Start);
376     RemoveDeadBranch(Start);
377     End = ExpandLoopEnd(ML, Dec, End);
378     RemoveDeadBranch(End);
379   }
380 }
381 
382 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
383   return new ARMLowOverheadLoops();
384 }
385