//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs below peephole optimizations on MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. In this case, we could try to split the constant operand of mov
// instruction into two immediates which can be directly encoded into
// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
// multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If AArch64's 32-bit form of instruction defines the source operand of
//    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
//    operand are set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can directly copy the values between the FPRs,
//    eliminating the use of the Integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
//    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
//    64-bits. For example,
//
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %2:fpr64 = MOVID 0
//   %4:fpr128 = IMPLICIT_DEF
//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
//   %6:fpr128 = IMPLICIT_DEF
//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
//   ==>
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %6:fpr128 = IMPLICIT_DEF
//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
//
// 8. Remove redundant CSELs that select between identical registers, by
//    replacing them with unconditional moves.
//
// 9. Replace UBFMXri with UBFMWri if the instruction is equivalent to a 32 bit
//    LSR or LSL alias of UBFM.
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  // Callback that decides whether an immediate can be split for this opcode
  // and, if so, returns the pair of opcodes to use for the two new
  // instructions (they may differ, e.g. for flag-setting variants).
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  // Callback that materializes the two replacement instructions.
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use the splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///  %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///  %tmp = <Instr>ri %src (encode half IMM) [...]
  ///  %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitCSEL(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitUBFMXri(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

/// Try to split \p Imm into two logical immediates whose AND reproduces the
/// original value. Returns false if \p Imm is already a single logical
/// immediate or can be built by a single MOV.
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // The bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it in to two bitmask immediate like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // If we do AND with these two bitmask immediate, we can see original one.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of mov instruction into two
  // bitmask immediates. It makes only two AND instructions instead of multiple
  // mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check this ORR comes from below zero-extend pattern.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
  // real AArch64 instruction and if it is not, do not process the opcode
  // conservatively.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass &&
          RC != &AArch64::ZPRRegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc;
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      // FMOVSWr takes a plain FPR32 operand; extract the ssub value into a
      // fresh FPR32 vreg first.
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    } else {
      CpySrc = SrcMI->getOperand(1).getReg();
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  }
  else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitCSEL(MachineInstr &MI) {
  // Replace CSEL with MOV when both inputs are the same register.
  if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
    return false;

  auto ZeroReg =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::XZR : AArch64::WZR;
  auto OrOpcode =
      MI.getOpcode() == AArch64::CSELXr ? AArch64::ORRXrs : AArch64::ORRWrs;

  // ORR Rd, ZR, Rm, lsl #0 is the canonical register move.
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(OrOpcode))
      .addReg(MI.getOperand(0).getReg(), RegState::Define)
      .addReg(ZeroReg)
      .addReg(MI.getOperand(1).getReg())
      .addImm(0);

  MI.eraseFromParent();
  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check this INSERT_SUBREG comes from below zero-extend pattern.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
  // real AArch64 instruction and if it is not, do not process the opcode
  // conservatively.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

/// Try to split \p Imm into (Imm0 << 12) + Imm1 where both halves are legal
/// 12-bit add/sub immediates.
template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned int.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate can not be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1;
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of mov instruction into two
  // legal add/sub immediates. It makes only two ADD/SUB instructions instead of
  // multiple `mov` + `and/sub` instructions.

  // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      // A negative immediate is handled by flipping to the opposite operation
      // (add <-> sub) with the negated value.
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        // First instruction applies the high half with LSL #12, the second
        // the low 12 bits with no shift.
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB but with additional requirement
  // that the condition code usages are only for Equal and Not Equal

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last since it is expensive for scanning
        // proceeding instructions
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        // Splitting is only valid when C and V flags are not observed, since
        // the two-instruction sequence does not reproduce them.
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether current MBB is in loop and the AND is loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether current MI's operand is MOV with immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32 bit form of instruction, the upper 32 bits of the destination
  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
  // of Imm to zero. This is essential if the Immediate value was a negative
  // number since it was sign extended when we assign to the 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. Opcodes might differ for
  // flag setting operations that should only set flags on second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get old registers destinations and new register destinations
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
  // reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instruction
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI. Only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Record the MIs need to be removed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from COPY of a source FPR128
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set a FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  // Only trust real target instructions, not generic/pseudo opcodes.
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
  // We are expecting below case.
  //
  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //  %6:fpr128 = IMPLICIT_DEF
  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check there is `mov 0` MI for high 64-bits.
  // We are expecting below cases.
  //
  //  %2:fpr64 = MOVID 0
  //  %4:fpr128 = IMPLICIT_DEF
  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //  %5:fpr128 = MOVIv2d_ns 0
  //  %6:fpr64 = COPY %5.dsub:fpr128
  //  %8:fpr128 = IMPLICIT_DEF
  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove MIs for high 64-bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove MIs for high 64-bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
  // Check if the instruction is equivalent to a 32 bit LSR or LSL alias of
  // UBFM, and replace the UBFMXri instruction with its 32 bit variant, UBFMWri.
  int64_t Immr = MI.getOperand(2).getImm();
  int64_t Imms = MI.getOperand(3).getImm();

  bool IsLSR = Imms == 31 && Immr <= Imms;
  bool IsLSL = Immr == Imms + 33;
  if (!IsLSR && !IsLSL)
    return false;

  if (IsLSL) {
    // Adjust the 64-bit immr to the equivalent 32-bit encoding for UBFMWri.
    Immr -= 32;
  }

  const TargetRegisterClass *DstRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF());
  const TargetRegisterClass *DstRC32 =
      TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
  assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                    "sub_32 subregister class");

  const TargetRegisterClass *SrcRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF());
  const TargetRegisterClass *SrcRC32 =
      TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
  assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
                    "subregister class");

  Register DstReg64 = MI.getOperand(0).getReg();
  Register DstReg32 = MRI->createVirtualRegister(DstRC32);
  Register SrcReg64 = MI.getOperand(1).getReg();
  Register SrcReg32 = MRI->createVirtualRegister(SrcRC32);

  // COPY the low 32 bits, do the shift in 32-bit form (which zeroes the upper
  // half of the destination), then widen back with SUBREG_TO_REG.
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::COPY),
          SrcReg32)
      .addReg(SrcReg64, 0, AArch64::sub_32);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::UBFMWri),
          DstReg32)
      .addReg(SrcReg32)
      .addImm(Immr)
      .addImm(Imms);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AArch64::SUBREG_TO_REG), DstReg64)
      .addImm(0)
      .addReg(DstReg32)
      .addImm(AArch64::sub_32);
  MI.eraseFromParent();
  return true;
}

// Across a basic-block we might have in i32 extract from a value that only
// operates on upper bits (for example a sxtw). We can replace the COPY with a
// new version skipping the sxtw.
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  // Look through a chain of single-use full copies to the real producer.
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI)
    return false;

  // Look for SXTW(X) and return Reg.
  auto getSXTWSrcReg = [](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SBFMXri ||
        SrcMI->getOperand(2).getImm() != 0 ||
        SrcMI->getOperand(3).getImm() != 31)
      return AArch64::NoRegister;
    return SrcMI->getOperand(1).getReg();
  };
  // Look for SUBREG_TO_REG(ORRWrr(WZR, COPY(X.sub_32)))
  auto getUXTWSrcReg = [&](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SUBREG_TO_REG ||
        SrcMI->getOperand(3).getImm() != AArch64::sub_32 ||
        !MRI->hasOneNonDBGUse(SrcMI->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Orr = MRI->getUniqueVRegDef(SrcMI->getOperand(2).getReg());
    if (!Orr || Orr->getOpcode() != AArch64::ORRWrr ||
        Orr->getOperand(1).getReg() != AArch64::WZR ||
        !MRI->hasOneNonDBGUse(Orr->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Cpy = MRI->getUniqueVRegDef(Orr->getOperand(2).getReg());
    if (!Cpy || Cpy->getOpcode() != AArch64::COPY ||
        Cpy->getOperand(1).getSubReg() != AArch64::sub_32)
      return AArch64::NoRegister;
    DeadInstrs.insert(Orr);
    return Cpy->getOperand(1).getReg();
  };

  Register SrcReg = getSXTWSrcReg(SrcMI);
  if (!SrcReg)
    SrcReg = getUXTWSrcReg(SrcMI);
  if (!SrcReg)
    return false;

  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << "        to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::CSELWr:
      case AArch64::CSELXr:
        Changed |= visitCSEL(MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::UBFMXri:
        Changed |= visitUBFMXri(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}