//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. In this case, we can try to split the constant operand of the mov
// instruction into two immediates which can be directly encoded into
// *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead
// of multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove a redundant ORRWrs generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
// If AArch64's 32-bit form of instruction defines the source operand of
// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
// operand are set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
// In cases where a source FPR is copied to a GPR in order to be copied
// to a destination FPR, we can directly copy the values between the FPRs,
// eliminating the use of the integer unit. When we match a pattern of
// INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
// source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
// instructions.
//
// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
//    64-bits.
//    For example,
//
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %2:fpr64 = MOVID 0
//      %4:fpr128 = IMPLICIT_DEF
//      %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//      %6:fpr128 = IMPLICIT_DEF
//      %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//      %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//      %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//      %6:fpr128 = IMPLICIT_DEF
//      %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///   %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///   %tmp = <Instr>ri %src (encode half IMM) [...]
  ///   %dst = <Instr>ri %tmp (encode half IMM) [...]
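  ///
  /// An illustrative instance (values assumed, not taken from a specific
  /// test): for an ADDWrr whose mov operand is 0x123456 (which needs
  /// MOVZ + MOVK to materialize), splitAddSubImm below yields Imm0 = 0x123
  /// and Imm1 = 0x456, so the pair becomes:
  ///   %tmp = ADDWri %src, 0x123, 12   ; %src + (0x123 << 12)
  ///   %dst = ADDWri %tmp, 0x456, 0    ; ... + 0x456 = %src + 0x123456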
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates recovers the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of the lowest
  // set bit to the position of the highest set bit.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the positions of the
  // lowest and highest set bits.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.
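  //
  // As a concrete (illustrative) instance, reusing the constant from the
  // splitBitmaskImm comment above: Imm = 0x00200400 (bits 21 and 10) is not a
  // valid bitmask immediate and would need MOVZ + MOVK, so
  //
  //   %1:gpr32 = MOVi32imm 0x200400
  //   %2:gpr32 = ANDWrr %0, %1
  // becomes
  //   %t:gpr32 = ANDWri %0, (encoding of 0x003ffc00)
  //   %2:gpr32 = ANDWri %t, (encoding of 0xffe007ff)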

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  //   def : Pat<(i64 (zext GPR32:$src)),
  //             (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the opcode is a
  // real AArch64 instruction; if it is not, conservatively do not process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of the
  // zero-extend, we do not need the zero-extend. Check that the opcode is a
  // real AArch64 instruction; if it is not, conservatively do not process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replaced by:\n" << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate can not be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.
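  //
  // An illustrative negative case (values assumed): for an ADDWrr whose mov
  // operand is -0x123456, splitAddSubImm(Imm) fails but splitAddSubImm(-Imm)
  // yields Imm0 = 0x123 and Imm1 = 0x456, so the NegOpc pair is used:
  //
  //   %tmp:gpr32 = SUBWri %src, 0x123, 12   ; %src - (0x123 << 12)
  //   %dst:gpr32 = SUBWri %tmp, 0x456, 0    ; ... - 0x456 = %src + (-0x123456)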

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code is only used for Equal and Not Equal.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last, since scanning the preceding
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is inside a loop and whether MI is loop
  // invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign-extended when we assigned it to the
  // 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  //   NewTmpReg = Opcode.first  SrcReg    Imm0
  //   NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine the register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new ones.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // If DstReg is not virtual (likely WZR or XZR), we want to reuse that same
  // destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until deleting MI, and only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are now dead.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //   %intermediate1:gpr64 = COPY %src:fpr128
  //   %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //   %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //   %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //                 src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it is an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replaced by:\n" << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly zeroes the high
  // 64 bits. We are expecting the case below.
  //
  //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //   %6:fpr128 = IMPLICIT_DEF
  //   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We are expecting the cases below.
  //
  //   %2:fpr64 = MOVID 0
  //   %4:fpr128 = IMPLICIT_DEF
  //   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //   %5:fpr128 = MOVIv2d_ns 0
  //   %6:fpr64 = COPY %5.dsub:fpr128
  //   %8:fpr128 = IMPLICIT_DEF
  //   %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //   %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for
  // GPRs.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example a sxtw). We can replace the COPY
// with a new version skipping the sxtw.
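//
// An illustrative example (register names assumed):
//
//   %1:gpr64 = SBFMXri %0, 0, 31        ; sxtw
//   %2:gpr32 = COPY %1.sub_32
// becomes
//   %2:gpr32 = COPY %0.sub_32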
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
    return false;

  Register SrcReg = SrcMI->getOperand(1).getReg();
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << "       to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}