1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. TODO: Could DoLoopStart get moved into the 15 /// pre-preheader? 16 /// - t2LoopDec - placed within in the loop body. 17 /// - t2LoopEnd - the loop latch terminator. 18 /// 19 //===----------------------------------------------------------------------===// 20 21 #include "ARM.h" 22 #include "ARMBaseInstrInfo.h" 23 #include "ARMBaseRegisterInfo.h" 24 #include "ARMBasicBlockInfo.h" 25 #include "ARMSubtarget.h" 26 #include "llvm/CodeGen/MachineFunctionPass.h" 27 #include "llvm/CodeGen/MachineLoopInfo.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "arm-low-overhead-loops" 33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 34 35 namespace { 36 37 class ARMLowOverheadLoops : public MachineFunctionPass { 38 const ARMBaseInstrInfo *TII = nullptr; 39 MachineRegisterInfo *MRI = nullptr; 40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 41 42 public: 43 static char ID; 44 45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 46 47 void getAnalysisUsage(AnalysisUsage &AU) const override { 48 AU.setPreservesCFG(); 49 AU.addRequired<MachineLoopInfo>(); 50 MachineFunctionPass::getAnalysisUsage(AU); 51 } 52 53 bool runOnMachineFunction(MachineFunction &MF) override; 54 55 bool ProcessLoop(MachineLoop *ML); 56 57 void RevertWhile(MachineInstr *MI) const; 58 59 void RevertLoopDec(MachineInstr *MI) const; 60 61 void RevertLoopEnd(MachineInstr *MI) const; 62 63 void Expand(MachineLoop *ML, MachineInstr *Start, 64 MachineInstr *Dec, MachineInstr *End, bool Revert); 65 66 MachineFunctionProperties getRequiredProperties() const override { 67 return MachineFunctionProperties().set( 68 MachineFunctionProperties::Property::NoVRegs); 69 } 70 71 StringRef getPassName() const override { 72 return ARM_LOW_OVERHEAD_LOOPS_NAME; 73 } 74 }; 75 } 76 77 char ARMLowOverheadLoops::ID = 0; 78 79 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 80 false, false) 81 82 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { 83 if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) 84 return false; 85 86 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); 87 88 auto &MLI = getAnalysis<MachineLoopInfo>(); 89 MRI = &MF.getRegInfo(); 90 TII = static_cast<const ARMBaseInstrInfo*>( 91 MF.getSubtarget().getInstrInfo()); 92 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); 93 BBUtils->computeAllBlockSizes(); 94 BBUtils->adjustBBOffsetsAfter(&MF.front()); 95 96 bool Changed = false; 97 for (auto ML : MLI) { 98 if (!ML->getParentLoop()) 99 Changed |= ProcessLoop(ML); 100 } 101 return Changed; 102 } 103 104 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 105 106 bool Changed = false; 107 108 // Process inner loops first. 109 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 110 Changed |= ProcessLoop(*I); 111 112 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); 113 114 auto IsLoopStart = [](MachineInstr &MI) { 115 return MI.getOpcode() == ARM::t2DoLoopStart || 116 MI.getOpcode() == ARM::t2WhileLoopStart; 117 }; 118 119 // Search the given block for a loop start instruction. If one isn't found, 120 // and there's only one predecessor block, search that one too. 121 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 122 [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 123 for (auto &MI : *MBB) { 124 if (IsLoopStart(MI)) 125 return &MI; 126 } 127 if (MBB->pred_size() == 1) 128 return SearchForStart(*MBB->pred_begin()); 129 return nullptr; 130 }; 131 132 MachineInstr *Start = nullptr; 133 MachineInstr *Dec = nullptr; 134 MachineInstr *End = nullptr; 135 bool Revert = false; 136 137 // Search the preheader for the start intrinsic, or look through the 138 // predecessors of the header to find exactly one set.iterations intrinsic. 139 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 140 // with potentially multiple set.loop.iterations, so we need to enable this. 141 if (auto *Preheader = ML->getLoopPreheader()) { 142 Start = SearchForStart(Preheader); 143 } else { 144 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" 145 << " - Performing manual predecessor search.\n"); 146 MachineBasicBlock *Pred = nullptr; 147 for (auto *MBB : ML->getHeader()->predecessors()) { 148 if (!ML->contains(MBB)) { 149 if (Pred) { 150 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); 151 Start = nullptr; 152 break; 153 } 154 Pred = MBB; 155 Start = SearchForStart(MBB); 156 } 157 } 158 } 159 160 // Find the low-overhead loop components and decide whether or not to fall 161 // back to a normal loop. 162 for (auto *MBB : reverse(ML->getBlocks())) { 163 for (auto &MI : *MBB) { 164 if (MI.getOpcode() == ARM::t2LoopDec) 165 Dec = &MI; 166 else if (MI.getOpcode() == ARM::t2LoopEnd) 167 End = &MI; 168 else if (MI.getDesc().isCall()) 169 // TODO: Though the call will require LE to execute again, does this 170 // mean we should revert? Always executing LE hopefully should be 171 // faster than performing a sub,cmp,br or even subs,br. 172 Revert = true; 173 174 if (!Dec) 175 continue; 176 177 // If we find that we load/store LR between LoopDec and LoopEnd, expect 178 // that the decremented value has been spilled to the stack. Because 179 // this value isn't actually going to be produced until the latch, by LE, 180 // we would need to generate a real sub. The value is also likely to be 181 // reloaded for use of LoopEnd - in which in case we'd need to perform 182 // an add because it gets negated again by LE! The other option is to 183 // then generate the other form of LE which doesn't perform the sub. 184 if (MI.mayLoad() || MI.mayStore()) 185 Revert = 186 MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; 187 } 188 189 if (Dec && End && Revert) 190 break; 191 } 192 193 if (!Start && !Dec && !End) { 194 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); 195 return Changed; 196 } if (!(Start && Dec && End)) { 197 report_fatal_error("Failed to find all loop components"); 198 } 199 200 if (!End->getOperand(1).isMBB() || 201 End->getOperand(1).getMBB() != ML->getHeader()) 202 report_fatal_error("Expected LoopEnd to target Loop Header"); 203 204 // The WLS and LE instructions have 12-bits for the label offset. WLS 205 // requires a positive offset, while LE uses negative. 206 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || 207 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { 208 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); 209 Revert = true; 210 } 211 if (Start->getOpcode() == ARM::t2WhileLoopStart && 212 (BBUtils->getOffsetOf(Start) > 213 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || 214 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { 215 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); 216 Revert = true; 217 } 218 219 LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start 220 << " - Found Loop Dec: " << *Dec 221 << " - Found Loop End: " << *End); 222 223 Expand(ML, Start, Dec, End, Revert); 224 return true; 225 } 226 227 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 228 // beq that branches to the exit branch. 229 // FIXME: Need to check that we're not trashing the CPSR when generating the 230 // cmp. We could also try to generate a cbz if the value in LR is also in 231 // another low register. 232 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { 233 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); 234 MachineBasicBlock *MBB = MI->getParent(); 235 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 236 TII->get(ARM::t2CMPri)); 237 MIB.addReg(ARM::LR); 238 MIB.addImm(0); 239 MIB.addImm(ARMCC::AL); 240 MIB.addReg(ARM::CPSR); 241 242 // TODO: Try to use tBcc instead 243 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); 244 MIB.add(MI->getOperand(1)); // branch target 245 MIB.addImm(ARMCC::EQ); // condition code 246 MIB.addReg(ARM::CPSR); 247 MI->eraseFromParent(); 248 } 249 250 // TODO: Check flags so that we can possibly generate a tSubs or tSub. 251 void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { 252 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); 253 MachineBasicBlock *MBB = MI->getParent(); 254 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 255 TII->get(ARM::t2SUBri)); 256 MIB.addDef(ARM::LR); 257 MIB.add(MI->getOperand(1)); 258 MIB.add(MI->getOperand(2)); 259 MIB.addImm(ARMCC::AL); 260 MIB.addReg(0); 261 MIB.addReg(0); 262 MI->eraseFromParent(); 263 } 264 265 // Generate a subs, or sub and cmp, and a branch instead of an LE. 266 // FIXME: Need to check that we're not trashing the CPSR when generating 267 // the cmp. 268 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { 269 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); 270 271 // Create cmp 272 MachineBasicBlock *MBB = MI->getParent(); 273 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 274 TII->get(ARM::t2CMPri)); 275 MIB.addReg(ARM::LR); 276 MIB.addImm(0); 277 MIB.addImm(ARMCC::AL); 278 MIB.addReg(ARM::CPSR); 279 280 // TODO Try to use tBcc instead. 281 // Create bne 282 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); 283 MIB.add(MI->getOperand(1)); // branch target 284 MIB.addImm(ARMCC::NE); // condition code 285 MIB.addReg(ARM::CPSR); 286 MI->eraseFromParent(); 287 } 288 289 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, 290 MachineInstr *Dec, MachineInstr *End, 291 bool Revert) { 292 293 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { 294 // The trip count should already been held in LR since the instructions 295 // within the loop can only read and write to LR. So, there should be a 296 // mov to setup the count. WLS/DLS perform this move, so find the original 297 // and delete it - inserting WLS/DLS in its place. 298 MachineBasicBlock *MBB = Start->getParent(); 299 MachineInstr *InsertPt = Start; 300 for (auto &I : MRI->def_instructions(ARM::LR)) { 301 if (I.getParent() != MBB) 302 continue; 303 304 // Always execute. 305 if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) 306 continue; 307 308 // Only handle move reg, if the trip count it will need moving into a reg 309 // before the setup instruction anyway. 310 if (!I.getDesc().isMoveReg() || 311 !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) 312 continue; 313 InsertPt = &I; 314 break; 315 } 316 317 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 318 ARM::t2DLS : ARM::t2WLS; 319 MachineInstrBuilder MIB = 320 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 321 322 MIB.addDef(ARM::LR); 323 MIB.add(Start->getOperand(0)); 324 if (Opc == ARM::t2WLS) 325 MIB.add(Start->getOperand(1)); 326 327 if (InsertPt != Start) 328 InsertPt->eraseFromParent(); 329 Start->eraseFromParent(); 330 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 331 return &*MIB; 332 }; 333 334 // Combine the LoopDec and LoopEnd instructions into LE(TP). 335 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, 336 MachineInstr *End) { 337 MachineBasicBlock *MBB = End->getParent(); 338 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 339 TII->get(ARM::t2LEUpdate)); 340 MIB.addDef(ARM::LR); 341 MIB.add(End->getOperand(0)); 342 MIB.add(End->getOperand(1)); 343 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 344 345 End->eraseFromParent(); 346 Dec->eraseFromParent(); 347 return &*MIB; 348 }; 349 350 // TODO: We should be able to automatically remove these branches before we 351 // get here - probably by teaching analyzeBranch about the pseudo 352 // instructions. 353 // If there is an unconditional branch, after I, that just branches to the 354 // next block, remove it. 355 auto RemoveDeadBranch = [](MachineInstr *I) { 356 MachineBasicBlock *BB = I->getParent(); 357 MachineInstr *Terminator = &BB->instr_back(); 358 if (Terminator->isUnconditionalBranch() && I != Terminator) { 359 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 360 if (BB->isLayoutSuccessor(Succ)) { 361 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 362 Terminator->eraseFromParent(); 363 } 364 } 365 }; 366 367 if (Revert) { 368 if (Start->getOpcode() == ARM::t2WhileLoopStart) 369 RevertWhile(Start); 370 else 371 Start->eraseFromParent(); 372 RevertLoopDec(Dec); 373 RevertLoopEnd(End); 374 } else { 375 Start = ExpandLoopStart(ML, Start); 376 RemoveDeadBranch(Start); 377 End = ExpandLoopEnd(ML, Dec, End); 378 RemoveDeadBranch(End); 379 } 380 } 381 382 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 383 return new ARMLowOverheadLoops(); 384 } 385