//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to Tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include <cassert>

using namespace llvm;

#define DEBUG_TYPE "arm-mve-vpt-opts"

// Debugging knob: when false, t2LoopEnd/t2LoopDec are never merged into
// t2LoopEndDec and t2WhileLoopSetup/Start pairs are always reverted.
static cl::opt<bool>
MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
    cl::desc("Enable merging Loop End and Dec instructions."),
    cl::init(true));

namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  // Both are (re)initialised per function in runOnMachineFunction.
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  // The loop transforms below need loop info and dominators; both analyses
  // remain valid because only instructions inside existing blocks change.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  // Loop-level transforms for low overhead / tail predicated loops.
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  // Block-level VPT/VCCR transforms.
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)

// Walk through chains of COPYs of virtual registers to the underlying
// defining instruction. Returns MI itself when it is not such a COPY.
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  while (MI && MI->getOpcode() == TargetOpcode::COPY &&
         MI->getOperand(1).getReg().isVirtual())
    MI = MRI->getVRegDef(MI->getOperand(1).getReg());
  return MI;
}

// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop. Only handles 'do' loops
// at the moment, returning a t2DoLoopStart in LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  // A usable loop needs both a header and a single latch block.
  MachineBasicBlock *Header = ML->getHeader();
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!Header || !Latch) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // Find the loop end from the terminators.
  // t2LoopEnd's branch MBB is operand 1; t2LoopEndDec also defines the
  // decremented counter, so its branch MBB is operand 2.
  LoopEnd = nullptr;
  for (auto &T : Latch->terminators()) {
    if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
    if (T.getOpcode() == ARM::t2LoopEndDec &&
        T.getOperand(2).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to loop like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
    // Already merged: the End performs its own decrement.
    LoopDec = LoopEnd;
  else {
    LoopDec =
        LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

  // The Dec's input should be a 2-incoming-value PHI in the header, one of
  // whose incoming blocks is the latch (PHI operands come in value/MBB pairs).
  LoopPhi =
      LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
  if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
      LoopPhi->getNumOperands() != 5 ||
      (LoopPhi->getOperand(2).getMBB() != Latch &&
       LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  // The PHI's incoming value from outside the latch is the initial count; its
  // def (through copies) should be one of the loop start pseudos.
  Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
                          ? LoopPhi->getOperand(3).getReg()
                          : LoopPhi->getOperand(1).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
  if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}

// Revert a t2WhileLoopSetup to a flag-setting SUBS, and (below) any paired
// t2WhileLoopStart terminator to a conditional branch.
static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");

  // Subs
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR,
RegState::Define); 196 197 // Attempt to find a t2WhileLoopStart and revert to a t2Bcc. 198 for (MachineInstr &I : MBB->terminators()) { 199 if (I.getOpcode() == ARM::t2WhileLoopStart) { 200 MachineInstrBuilder MIB = 201 BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc)); 202 MIB.add(MI->getOperand(1)); // branch target 203 MIB.addImm(ARMCC::EQ); 204 MIB.addReg(ARM::CPSR); 205 I.eraseFromParent(); 206 break; 207 } 208 } 209 210 MI->eraseFromParent(); 211 } 212 213 // The Hardware Loop insertion and ISel Lowering produce the pseudos for the 214 // start of a while loop: 215 // %a:gprlr = t2WhileLoopSetup %Cnt 216 // t2WhileLoopStart %a, %BB 217 // We want to convert those to a single instruction which, like t2LoopEndDec and 218 // t2DoLoopStartTP is both a terminator and produces a value: 219 // %a:grplr: t2WhileLoopStartLR %Cnt, %BB 220 // 221 // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and 222 // t2WhileLoopStart are not valid past regalloc. 223 bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) { 224 LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop " 225 << ML->getHeader()->getName() << "\n"); 226 227 MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; 228 if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) 229 return false; 230 231 if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup) 232 return false; 233 234 Register LR = LoopStart->getOperand(0).getReg(); 235 auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) { 236 return MI.getOpcode() == ARM::t2WhileLoopStart; 237 }); 238 if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) { 239 RevertWhileLoopSetup(LoopStart, TII); 240 RevertLoopDec(LoopStart, TII); 241 RevertLoopEnd(LoopStart, TII); 242 return true; 243 } 244 245 MachineInstrBuilder MI = 246 BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(), 247 TII->get(ARM::t2WhileLoopStartLR), LR) 248 .add(LoopStart->getOperand(1)) 249 .add(WLSIt->getOperand(1)); 250 (void)MI; 
  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  // The merged instruction replaces both pseudos.
  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// Return true if this instruction is invalid in a low overhead loop, usually
// because it clobbers LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) {
  return MI.isCall() || isLoopStart(MI);
}

// Starting from PreHeader, search for invalid instructions back until the
// LoopStart block is reached. If invalid instructions are found, the loop start
// is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
// return the new DLS LoopStart if updated.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
  // Backwards CFG walk over the predecessors of the preheader, stopping at
  // the block that contains the loop start.
  SmallVector<MachineBasicBlock *> Worklist;
  SmallPtrSet<MachineBasicBlock *, 4> Visited;
  Worklist.push_back(PreHeader);
  Visited.insert(LoopStart->getParent());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    if (Visited.count(MBB))
      continue;

    for (MachineInstr &MI : *MBB) {
      if (!IsInvalidTPInstruction(MI))
        continue;

      LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);

      // Create a t2DoLoopStart at the end of the preheader.
      MachineInstrBuilder MIB =
          BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
                  LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
      MIB.add(LoopStart->getOperand(0));
      MIB.add(LoopStart->getOperand(1));

      // Make sure to remove the kill flags, to prevent them from being invalid.
      LoopStart->getOperand(1).setIsKill(false);

      // Revert the t2WhileLoopStartLR to a CMP and Br.
      RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
      // Return the newly created DLS so the caller tracks the live start.
      return MIB;
    }

    Visited.insert(MBB);
    for (auto *Pred : MBB->predecessors())
      Worklist.push_back(Pred);
  }
  // No problematic instruction found; the original start stays in place.
  return LoopStart;
}

// This function converts loops with t2LoopEnd and t2LoopDec instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that they
// will be valid to allocate without any spilling.
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further. While loops also need to
  // check the preheaders, but can be reverted to a DLS loop if needed.
  auto *PreHeader = ML->getLoopPreheader();
  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (IsInvalidTPInstruction(MI)) {
        // An LR-clobbering instruction inside the loop: revert everything.
        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
        if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
          RevertDoLoopStart(LoopStart, TII);
        else
          RevertWhileLoopStartLR(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  SmallVector<MachineInstr *, 4> Copies;
  // Returns true if all (transitive, through virtual-reg COPYs) users of
  // BaseReg are in ExpectedUsers; COPYs encountered are queued for removal.
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (count(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI)) {
    // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
    if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
      RevertWhileLoopStartLR(LoopStart, TII);
      RevertLoopDec(LoopDec, TII);
      RevertLoopEnd(LoopEnd, TII);
      return true;
    }
    return false;
  }

  // All three values flow through LR-capable registers for the merged
  // t2LoopEndDec to be allocatable without spilling.
  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

  // Rewire the PHI so its latch incoming value is the decrement and the other
  // incoming value is the start, regardless of operand order.
  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end as a single instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (auto *MI : Copies)
    MI->eraseFromParent();
  return true;
}

// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding the
// VCTP operand much simpler.
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  // Only loops that have already been merged into a t2LoopEndDec (LoopDec ==
  // LoopEnd) with a DLS/WLS-LR start are candidates.
  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
    return false;

  SmallVector<MachineInstr *, 4> VCTPs;
  for (MachineBasicBlock *BB : ML->blocks())
    for (MachineInstr &MI : *BB)
      if (isVCTP(&MI))
        VCTPs.push_back(&MI);

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    // All VCTPs must share the same opcode and define the same predicate reg.
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "  VCTP's are not identical\n");
      return false;
    }
  }

  // Check for the register being used can be setup before the loop. We expect
  // this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  // Take the PHI's incoming value from outside the latch: the element count
  // before the loop begins.
  CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(3).getReg()
                 : Phi->getOperand(1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  // Bail out if moving the start down to the terminator would stop it
  // dominating one of its uses.
  for (MachineInstr &Use :
       MRI->use_instructions(LoopStart->getOperand(0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
        !DT->dominates(ML->getHeader(), Use.getParent())) {
      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
      return false;
    }

  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
                        ? ARM::t2DoLoopStartTP
                        : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder MI =
      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
          .add(LoopStart->getOperand(0))
          .add(LoopStart->getOperand(1))
          .addReg(CountReg);
  if (NewOpc == ARM::t2WhileLoopStartTP)
    // Carry over the branch target from the WhileLoopStartLR.
    MI.add(LoopStart->getOperand(2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
                    << *MI.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  return true;
}

// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs,
// and VCMPr instructions (since the r is always on the right).
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  default:
    return true;
  case ARM::MVE_VCMPf32:
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  }
}

// Returns the CondCode of a VCMP Instruction.
544 static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) { 545 assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP"); 546 return ARMCC::CondCodes(Instr.getOperand(3).getImm()); 547 } 548 549 // Returns true if Cond is equivalent to a VPNOT instruction on the result of 550 // Prev. Cond and Prev must be VCMPs. 551 static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) { 552 assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode())); 553 554 // Opcodes must match. 555 if (Cond.getOpcode() != Prev.getOpcode()) 556 return false; 557 558 MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2); 559 MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2); 560 561 // If the VCMP has the opposite condition with the same operands, we can 562 // replace it with a VPNOT 563 ARMCC::CondCodes ExpectedCode = GetCondCode(Cond); 564 ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode); 565 if (ExpectedCode == GetCondCode(Prev)) 566 if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2)) 567 return true; 568 // Check again with operands swapped if possible 569 if (!CanHaveSwappedOperands(Cond.getOpcode())) 570 return false; 571 ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode); 572 return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) && 573 CondOP2.isIdenticalTo(PrevOP1); 574 } 575 576 // Returns true if Instr writes to VCCR. 
577 static bool IsWritingToVCCR(MachineInstr &Instr) { 578 if (Instr.getNumOperands() == 0) 579 return false; 580 MachineOperand &Dst = Instr.getOperand(0); 581 if (!Dst.isReg()) 582 return false; 583 Register DstReg = Dst.getReg(); 584 if (!DstReg.isVirtual()) 585 return false; 586 MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo(); 587 const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg); 588 return RegClass && (RegClass->getID() == ARM::VCCRRegClassID); 589 } 590 591 // Transforms 592 // <Instr that uses %A ('User' Operand)> 593 // Into 594 // %K = VPNOT %Target 595 // <Instr that uses %K ('User' Operand)> 596 // And returns the newly inserted VPNOT. 597 // This optimization is done in the hopes of preventing spills/reloads of VPR by 598 // reducing the number of VCCR values with overlapping lifetimes. 599 MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT( 600 MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User, 601 Register Target) { 602 Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target)); 603 604 MachineInstrBuilder MIBuilder = 605 BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT)) 606 .addDef(NewResult) 607 .addReg(Target); 608 addUnpredicatedMveVpredNOp(MIBuilder); 609 610 // Make the user use NewResult instead, and clear its kill flag. 611 User.setReg(NewResult); 612 User.setIsKill(false); 613 614 LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): "; 615 MIBuilder.getInstr()->dump()); 616 617 return *MIBuilder.getInstr(); 618 } 619 620 // Moves a VPNOT before its first user if an instruction that uses Reg is found 621 // in-between the VPNOT and its user. 622 // Returns true if there is at least one user of the VPNOT in the block. 
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of the
  // VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills the VPNOTOperand's result, save it.
      VPNOTOperandKiller = MO;
    }

    // An intervening use of Reg means the VPNOT must sink below this point.
    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    // Found the first user of the VPNOT's result.
    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the kill
    // flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %A)
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)      ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A     ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
      // found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore if it
        // has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          // The freshly inserted VPNOT recomputes VCCRValue, so the roles of
          // the two tracked values flip from here on.
          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}

// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or when
  // a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO = Instr.findRegisterUseOperand(
              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs
    if (!IsVCMP(Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    // Not a negation of the previous VCMP; remember it and keep scanning.
    if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
            .add(Instr.getOperand(0))
            .addReg(PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills PrevVCMP's result,
    // remove the kill flag.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(&Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constants moves
  // into VPR that are the negative of one another. These are expected to be
  // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
  // mask is kept, and VPNOT's of it are added or reused as we scan through
  // the function.
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    // The predicate register follows the predicate-kind operand.
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the Immediate used by the copy.
916 auto getImm = [&](Register GPR) -> unsigned { 917 MachineInstr *Def = MRI->getVRegDef(GPR); 918 if (Def && (Def->getOpcode() == ARM::t2MOVi || 919 Def->getOpcode() == ARM::t2MOVi16)) 920 return Def->getOperand(1).getImm(); 921 return -1U; 922 }; 923 unsigned Imm = getImm(GPR); 924 if (Imm == -1U) { 925 LastVPTReg = 0; 926 continue; 927 } 928 929 unsigned NotImm = ~Imm & 0xffff; 930 if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) { 931 Instr.getOperand(PIdx + 1).setReg(LastVPTReg); 932 if (MRI->use_empty(VPR)) { 933 DeadInstructions.insert(Copy); 934 if (MRI->hasOneUse(GPR)) 935 DeadInstructions.insert(MRI->getVRegDef(GPR)); 936 } 937 LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr); 938 } else if (LastVPTReg != 0 && LastVPTImm == NotImm) { 939 // We have found the not of a previous constant. Create a VPNot of the 940 // earlier predicate reg and use it instead of the copy. 941 Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass); 942 auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(), 943 TII->get(ARM::MVE_VPNOT), NewVPR) 944 .addReg(LastVPTReg); 945 addUnpredicatedMveVpredNOp(VPNot); 946 947 // Use the new register and check if the def is now dead. 948 Instr.getOperand(PIdx + 1).setReg(NewVPR); 949 if (MRI->use_empty(VPR)) { 950 DeadInstructions.insert(Copy); 951 if (MRI->hasOneUse(GPR)) 952 DeadInstructions.insert(MRI->getVRegDef(GPR)); 953 } 954 LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at " 955 << Instr); 956 VPR = NewVPR; 957 } 958 959 LastVPTImm = Imm; 960 LastVPTReg = VPR; 961 } 962 963 for (MachineInstr *DI : DeadInstructions) 964 DI->eraseFromParent(); 965 966 return !DeadInstructions.empty(); 967 } 968 969 // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a 970 // somewhat blunt approximation to allow tail predicated with vpsel 971 // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly 972 // different semantics under tail predication. 
Until that is modelled we just 973 // convert to a VMOVT (via a predicated VORR) instead. 974 bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { 975 bool HasVCTP = false; 976 SmallVector<MachineInstr *, 4> DeadInstructions; 977 978 for (MachineInstr &MI : MBB.instrs()) { 979 if (isVCTP(&MI)) { 980 HasVCTP = true; 981 continue; 982 } 983 984 if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) 985 continue; 986 987 MachineInstrBuilder MIBuilder = 988 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR)) 989 .add(MI.getOperand(0)) 990 .add(MI.getOperand(1)) 991 .add(MI.getOperand(1)) 992 .addImm(ARMVCC::Then) 993 .add(MI.getOperand(4)) 994 .add(MI.getOperand(2)); 995 // Silence unused variable warning in release builds. 996 (void)MIBuilder; 997 LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump(); 998 dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump()); 999 DeadInstructions.push_back(&MI); 1000 } 1001 1002 for (MachineInstr *DeadInstruction : DeadInstructions) 1003 DeadInstruction->eraseFromParent(); 1004 1005 return !DeadInstructions.empty(); 1006 } 1007 1008 // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as 1009 // the instruction may be removable as a noop. 
1010 bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) { 1011 bool Changed = false; 1012 for (MachineInstr &MI : MBB.instrs()) { 1013 if (MI.getOpcode() != ARM::t2DoLoopStart) 1014 continue; 1015 Register R = MI.getOperand(1).getReg(); 1016 MachineFunction *MF = MI.getParent()->getParent(); 1017 MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0); 1018 Changed = true; 1019 } 1020 return Changed; 1021 } 1022 1023 bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { 1024 const ARMSubtarget &STI = 1025 static_cast<const ARMSubtarget &>(Fn.getSubtarget()); 1026 1027 if (!STI.isThumb2() || !STI.hasLOB()) 1028 return false; 1029 1030 TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); 1031 MRI = &Fn.getRegInfo(); 1032 MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>(); 1033 MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); 1034 1035 LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" 1036 << "********** Function: " << Fn.getName() << '\n'); 1037 1038 bool Modified = false; 1039 for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) { 1040 Modified |= LowerWhileLoopStart(ML); 1041 Modified |= MergeLoopEnd(ML); 1042 Modified |= ConvertTailPredLoop(ML, DT); 1043 } 1044 1045 for (MachineBasicBlock &MBB : Fn) { 1046 Modified |= HintDoLoopStartReg(MBB); 1047 Modified |= ReplaceConstByVPNOTs(MBB, DT); 1048 Modified |= ReplaceVCMPsByVPNOTs(MBB); 1049 Modified |= ReduceOldVCCRValueUses(MBB); 1050 Modified |= ConvertVPSEL(MBB); 1051 } 1052 1053 LLVM_DEBUG(dbgs() << "**************************************\n"); 1054 return Modified; 1055 } 1056 1057 /// createMVETPAndVPTOptimisationsPass 1058 FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() { 1059 return new MVETPAndVPTOptimisations(); 1060 } 1061