1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. 15 /// - t2LoopDec - placed within in the loop body. 16 /// - t2LoopEnd - the loop latch terminator. 17 /// 18 //===----------------------------------------------------------------------===// 19 20 #include "ARM.h" 21 #include "ARMBaseInstrInfo.h" 22 #include "ARMBaseRegisterInfo.h" 23 #include "ARMBasicBlockInfo.h" 24 #include "ARMSubtarget.h" 25 #include "llvm/CodeGen/MachineFunctionPass.h" 26 #include "llvm/CodeGen/MachineLoopInfo.h" 27 #include "llvm/CodeGen/MachineRegisterInfo.h" 28 29 using namespace llvm; 30 31 #define DEBUG_TYPE "arm-low-overhead-loops" 32 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 33 34 namespace { 35 36 class ARMLowOverheadLoops : public MachineFunctionPass { 37 MachineFunction *MF = nullptr; 38 const ARMBaseInstrInfo *TII = nullptr; 39 MachineRegisterInfo *MRI = nullptr; 40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 41 42 public: 43 static char ID; 44 45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 46 47 void getAnalysisUsage(AnalysisUsage &AU) const override { 48 AU.setPreservesCFG(); 49 AU.addRequired<MachineLoopInfo>(); 50 MachineFunctionPass::getAnalysisUsage(AU); 51 } 52 53 bool runOnMachineFunction(MachineFunction &MF) override; 54 55 MachineFunctionProperties getRequiredProperties() const override { 56 return MachineFunctionProperties().set( 57 MachineFunctionProperties::Property::NoVRegs); 58 } 59 60 StringRef getPassName() const override { 61 return ARM_LOW_OVERHEAD_LOOPS_NAME; 62 } 63 64 private: 65 bool ProcessLoop(MachineLoop *ML); 66 67 MachineInstr * IsSafeToDefineLR(MachineInstr *MI); 68 69 bool RevertNonLoops(); 70 71 void RevertWhile(MachineInstr *MI) const; 72 73 bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; 74 75 void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; 76 77 void Expand(MachineLoop *ML, MachineInstr *Start, 78 MachineInstr *InsertPt, MachineInstr *Dec, 79 MachineInstr *End, bool Revert); 80 81 }; 82 } 83 84 char ARMLowOverheadLoops::ID = 0; 85 86 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 87 false, false) 88 89 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { 90 const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); 91 if (!ST.hasLOB()) 92 return false; 93 94 MF = &mf; 95 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); 96 97 auto &MLI = getAnalysis<MachineLoopInfo>(); 98 MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); 99 MRI = &MF->getRegInfo(); 100 TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); 101 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); 102 BBUtils->computeAllBlockSizes(); 103 BBUtils->adjustBBOffsetsAfter(&MF->front()); 104 105 bool Changed = false; 106 for (auto ML : MLI) { 107 if (!ML->getParentLoop()) 108 Changed |= ProcessLoop(ML); 109 } 110 Changed |= RevertNonLoops(); 111 return Changed; 112 } 113 114 static bool IsLoopStart(MachineInstr &MI) { 115 return MI.getOpcode() == ARM::t2DoLoopStart || 116 MI.getOpcode() == ARM::t2WhileLoopStart; 117 } 118 119 template<typename T> 120 static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { 121 for(auto &MI : make_range(T(Begin), End)) { 122 for (auto &MO : MI.operands()) { 123 if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) 124 continue; 125 return &MI; 126 } 127 } 128 return nullptr; 129 } 130 131 static MachineInstr* SearchForUse(MachineInstr *Begin, 132 MachineBasicBlock::iterator End, 133 unsigned Reg) { 134 for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { 135 for (auto &MO : MI.operands()) { 136 if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) 137 continue; 138 return &MI; 139 } 140 } 141 return nullptr; 142 } 143 144 // Is it safe to define LR with DLS/WLS? 145 // LR can defined if it is the operand to start, because it's the same value, 146 // or if it's going to be equivalent to the operand to Start. 147 MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { 148 149 auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { 150 return MI->getOpcode() == ARM::tMOVr && 151 MI->getOperand(0).getReg() == ARM::LR && 152 MI->getOperand(1).getReg() == Reg && 153 MI->getOperand(2).getImm() == ARMCC::AL; 154 }; 155 156 MachineBasicBlock *MBB = Start->getParent(); 157 unsigned CountReg = Start->getOperand(0).getReg(); 158 // Walk forward and backward in the block to find the closest instructions 159 // that define LR. Then also filter them out if they're not a mov lr. 160 MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); 161 if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) 162 PredLRDef = nullptr; 163 164 MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); 165 if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) 166 SuccLRDef = nullptr; 167 168 // We've either found one, two or none mov lr instructions... Now figure out 169 // if they are performing the equilvant mov that the Start instruction will. 170 // Do this by scanning forward and backward to see if there's a def of the 171 // register holding the count value. If we find a suitable def, return it as 172 // the insert point. Later, if InsertPt != Start, then we can remove the 173 // redundant instruction. 174 if (SuccLRDef) { 175 MachineBasicBlock::iterator End(SuccLRDef); 176 if (!SearchForDef(Start, End, CountReg)) { 177 return SuccLRDef; 178 } else 179 SuccLRDef = nullptr; 180 } 181 if (PredLRDef) { 182 MachineBasicBlock::reverse_iterator End(PredLRDef); 183 if (!SearchForDef(Start, End, CountReg)) { 184 return PredLRDef; 185 } else 186 PredLRDef = nullptr; 187 } 188 189 // We can define LR because LR already contains the same value. 190 if (Start->getOperand(0).getReg() == ARM::LR) 191 return Start; 192 193 // We've found no suitable LR def and Start doesn't use LR directly. Can we 194 // just define LR anyway? 195 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 196 LivePhysRegs LiveRegs(*TRI); 197 LiveRegs.addLiveOuts(*MBB); 198 199 // Not if we've haven't found a suitable mov and LR is live out. 200 if (LiveRegs.contains(ARM::LR)) 201 return nullptr; 202 203 // If LR is not live out, we can insert the instruction if nothing else 204 // uses LR after it. 205 if (!SearchForUse(Start, MBB->end(), ARM::LR)) 206 return Start; 207 208 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" 209 << " LR\n"); 210 return nullptr; 211 } 212 213 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 214 215 bool Changed = false; 216 217 // Process inner loops first. 218 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 219 Changed |= ProcessLoop(*I); 220 221 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); 222 223 // Search the given block for a loop start instruction. If one isn't found, 224 // and there's only one predecessor block, search that one too. 225 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 226 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 227 for (auto &MI : *MBB) { 228 if (IsLoopStart(MI)) 229 return &MI; 230 } 231 if (MBB->pred_size() == 1) 232 return SearchForStart(*MBB->pred_begin()); 233 return nullptr; 234 }; 235 236 MachineInstr *Start = nullptr; 237 MachineInstr *Dec = nullptr; 238 MachineInstr *End = nullptr; 239 bool Revert = false; 240 241 // Search the preheader for the start intrinsic, or look through the 242 // predecessors of the header to find exactly one set.iterations intrinsic. 243 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 244 // with potentially multiple set.loop.iterations, so we need to enable this. 245 if (auto *Preheader = ML->getLoopPreheader()) { 246 Start = SearchForStart(Preheader); 247 } else { 248 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" 249 << " - Performing manual predecessor search.\n"); 250 MachineBasicBlock *Pred = nullptr; 251 for (auto *MBB : ML->getHeader()->predecessors()) { 252 if (!ML->contains(MBB)) { 253 if (Pred) { 254 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); 255 Start = nullptr; 256 break; 257 } 258 Pred = MBB; 259 Start = SearchForStart(MBB); 260 } 261 } 262 } 263 264 // Find the low-overhead loop components and decide whether or not to fall 265 // back to a normal loop. 266 for (auto *MBB : reverse(ML->getBlocks())) { 267 for (auto &MI : *MBB) { 268 if (MI.getOpcode() == ARM::t2LoopDec) 269 Dec = &MI; 270 else if (MI.getOpcode() == ARM::t2LoopEnd) 271 End = &MI; 272 else if (IsLoopStart(MI)) 273 Start = &MI; 274 else if (MI.getDesc().isCall()) { 275 // TODO: Though the call will require LE to execute again, does this 276 // mean we should revert? Always executing LE hopefully should be 277 // faster than performing a sub,cmp,br or even subs,br. 278 Revert = true; 279 LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); 280 } 281 282 if (!Dec || End) 283 continue; 284 285 // If we find that LR has been written or read between LoopDec and 286 // LoopEnd, expect that the decremented value is being used else where. 287 // Because this value isn't actually going to be produced until the 288 // latch, by LE, we would need to generate a real sub. The value is also 289 // likely to be copied/reloaded for use of LoopEnd - in which in case 290 // we'd need to perform an add because it gets subtracted again by LE! 291 // The other option is to then generate the other form of LE which doesn't 292 // perform the sub. 293 for (auto &MO : MI.operands()) { 294 if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && 295 MO.getReg() == ARM::LR) { 296 LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); 297 Revert = true; 298 break; 299 } 300 } 301 } 302 303 if (Dec && End && Revert) 304 break; 305 } 306 307 LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; 308 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; 309 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); 310 311 if (!Start && !Dec && !End) { 312 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); 313 return Changed; 314 } else if (!(Start && Dec && End)) { 315 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); 316 return false; 317 } 318 319 if (!End->getOperand(1).isMBB()) 320 report_fatal_error("Expected LoopEnd to target basic block"); 321 322 // TODO Maybe there's cases where the target doesn't have to be the header, 323 // but for now be safe and revert. 324 if (End->getOperand(1).getMBB() != ML->getHeader()) { 325 LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); 326 Revert = true; 327 } 328 329 // The WLS and LE instructions have 12-bits for the label offset. WLS 330 // requires a positive offset, while LE uses negative. 331 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || 332 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { 333 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); 334 Revert = true; 335 } 336 if (Start->getOpcode() == ARM::t2WhileLoopStart && 337 (BBUtils->getOffsetOf(Start) > 338 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || 339 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { 340 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); 341 Revert = true; 342 } 343 344 MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start); 345 if (!InsertPt) { 346 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); 347 Revert = true; 348 } else 349 LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); 350 351 Expand(ML, Start, InsertPt, Dec, End, Revert); 352 return true; 353 } 354 355 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 356 // beq that branches to the exit branch. 357 // TODO: We could also try to generate a cbz if the value in LR is also in 358 // another low register. 359 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { 360 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); 361 MachineBasicBlock *MBB = MI->getParent(); 362 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 363 TII->get(ARM::t2CMPri)); 364 MIB.add(MI->getOperand(0)); 365 MIB.addImm(0); 366 MIB.addImm(ARMCC::AL); 367 MIB.addReg(ARM::NoRegister); 368 369 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 370 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 371 ARM::tBcc : ARM::t2Bcc; 372 373 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 374 MIB.add(MI->getOperand(1)); // branch target 375 MIB.addImm(ARMCC::EQ); // condition code 376 MIB.addReg(ARM::CPSR); 377 MI->eraseFromParent(); 378 } 379 380 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, 381 bool AllowFlags) const { 382 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); 383 MachineBasicBlock *MBB = MI->getParent(); 384 385 // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. 386 bool SetFlags = false; 387 if (AllowFlags) { 388 if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { 389 if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && 390 Def->getOpcode() == ARM::t2LoopEnd) 391 SetFlags = true; 392 } 393 } 394 395 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 396 TII->get(ARM::t2SUBri)); 397 MIB.addDef(ARM::LR); 398 MIB.add(MI->getOperand(1)); 399 MIB.add(MI->getOperand(2)); 400 MIB.addImm(ARMCC::AL); 401 MIB.addReg(0); 402 403 if (SetFlags) { 404 MIB.addReg(ARM::CPSR); 405 MIB->getOperand(5).setIsDef(true); 406 } else 407 MIB.addReg(0); 408 409 MI->eraseFromParent(); 410 return SetFlags; 411 } 412 413 // Generate a subs, or sub and cmp, and a branch instead of an LE. 414 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { 415 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); 416 417 MachineBasicBlock *MBB = MI->getParent(); 418 // Create cmp 419 if (!SkipCmp) { 420 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 421 TII->get(ARM::t2CMPri)); 422 MIB.addReg(ARM::LR); 423 MIB.addImm(0); 424 MIB.addImm(ARMCC::AL); 425 MIB.addReg(ARM::NoRegister); 426 } 427 428 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 429 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 430 ARM::tBcc : ARM::t2Bcc; 431 432 // Create bne 433 MachineInstrBuilder MIB = 434 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 435 MIB.add(MI->getOperand(1)); // branch target 436 MIB.addImm(ARMCC::NE); // condition code 437 MIB.addReg(ARM::CPSR); 438 MI->eraseFromParent(); 439 } 440 441 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, 442 MachineInstr *InsertPt, 443 MachineInstr *Dec, MachineInstr *End, 444 bool Revert) { 445 446 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, 447 MachineInstr *InsertPt) { 448 MachineBasicBlock *MBB = InsertPt->getParent(); 449 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 450 ARM::t2DLS : ARM::t2WLS; 451 MachineInstrBuilder MIB = 452 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 453 454 MIB.addDef(ARM::LR); 455 MIB.add(Start->getOperand(0)); 456 if (Opc == ARM::t2WLS) 457 MIB.add(Start->getOperand(1)); 458 459 if (InsertPt != Start) 460 InsertPt->eraseFromParent(); 461 Start->eraseFromParent(); 462 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 463 return &*MIB; 464 }; 465 466 // Combine the LoopDec and LoopEnd instructions into LE(TP). 467 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, 468 MachineInstr *End) { 469 MachineBasicBlock *MBB = End->getParent(); 470 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 471 TII->get(ARM::t2LEUpdate)); 472 MIB.addDef(ARM::LR); 473 MIB.add(End->getOperand(0)); 474 MIB.add(End->getOperand(1)); 475 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 476 477 End->eraseFromParent(); 478 Dec->eraseFromParent(); 479 return &*MIB; 480 }; 481 482 // TODO: We should be able to automatically remove these branches before we 483 // get here - probably by teaching analyzeBranch about the pseudo 484 // instructions. 485 // If there is an unconditional branch, after I, that just branches to the 486 // next block, remove it. 487 auto RemoveDeadBranch = [](MachineInstr *I) { 488 MachineBasicBlock *BB = I->getParent(); 489 MachineInstr *Terminator = &BB->instr_back(); 490 if (Terminator->isUnconditionalBranch() && I != Terminator) { 491 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 492 if (BB->isLayoutSuccessor(Succ)) { 493 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 494 Terminator->eraseFromParent(); 495 } 496 } 497 }; 498 499 if (Revert) { 500 if (Start->getOpcode() == ARM::t2WhileLoopStart) 501 RevertWhile(Start); 502 else 503 Start->eraseFromParent(); 504 bool FlagsAlreadySet = RevertLoopDec(Dec, true); 505 RevertLoopEnd(End, FlagsAlreadySet); 506 } else { 507 Start = ExpandLoopStart(ML, Start, InsertPt); 508 RemoveDeadBranch(Start); 509 End = ExpandLoopEnd(ML, Dec, End); 510 RemoveDeadBranch(End); 511 } 512 } 513 514 bool ARMLowOverheadLoops::RevertNonLoops() { 515 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); 516 bool Changed = false; 517 518 for (auto &MBB : *MF) { 519 SmallVector<MachineInstr*, 4> Starts; 520 SmallVector<MachineInstr*, 4> Decs; 521 SmallVector<MachineInstr*, 4> Ends; 522 523 for (auto &I : MBB) { 524 if (IsLoopStart(I)) 525 Starts.push_back(&I); 526 else if (I.getOpcode() == ARM::t2LoopDec) 527 Decs.push_back(&I); 528 else if (I.getOpcode() == ARM::t2LoopEnd) 529 Ends.push_back(&I); 530 } 531 532 if (Starts.empty() && Decs.empty() && Ends.empty()) 533 continue; 534 535 Changed = true; 536 537 for (auto *Start : Starts) { 538 if (Start->getOpcode() == ARM::t2WhileLoopStart) 539 RevertWhile(Start); 540 else 541 Start->eraseFromParent(); 542 } 543 for (auto *Dec : Decs) 544 RevertLoopDec(Dec); 545 546 for (auto *End : Ends) 547 RevertLoopEnd(End); 548 } 549 return Changed; 550 } 551 552 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 553 return new ARMLowOverheadLoops(); 554 } 555