//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
// The mov pseudo instruction may be expanded to multiple mov instructions
// later. In that case, we try to split the constant operand of the mov
// instruction into two immediates which can be directly encoded into
// *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead of
// multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove redundant ORRWrs instructions generated by zero-extension.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If a 32-bit form of an AArch64 instruction defines the source operand of
//    the ORRWrs, we can remove the ORRWrs because the upper 32 bits of the
//    source operand are already set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can directly copy the values between the FPRs,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If an MI implicitly sets the high 64 bits of its destination register to
//    zero, remove the `mov 0` for the high 64 bits. For example,
//
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %2:fpr64 = MOVID 0
//   %4:fpr128 = IMPLICIT_DEF
//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//   %6:fpr128 = IMPLICIT_DEF
//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//   ==>
//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//   %6:fpr128 = IMPLICIT_DEF
//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///  %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///  %tmp = <Instr>ri %src (encode half IMM) [...]
  ///  %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates reproduces the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask that is filled with ones from the position of the lowest
  // set bit to the position of the highest set bit.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask that keeps the original bits and is additionally filled
  // with ones outside the range from the lowest set bit to the highest set
  // bit.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

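// A worked trace of splitBitmaskImm (illustrative), using the 32-bit constant
// from the comment above:
//
//   UImm         = 0x00200400 (bits 21 and 10 set; not a logical immediate)
//   LowestBitSet = 10, HighestBitSet = 21
//   NewImm1      = (2 << 21) - (1 << 10) = 0x003ffc00 (ones in bits 21..10)
//   NewImm2      = UImm | ~NewImm1       = 0xffe007ff
//   NewImm1 & NewImm2 = 0x00200400 = UImm
//
// Both masks are (rotated) runs of consecutive ones, so each encodes as a
// logical immediate, giving e.g.
//   and w0, w0, #0x3ffc00
//   and w0, w0, #0xffe007ff
// instead of materializing 0x00200400 with mov+movk and using ANDWrr.
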
template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the opcode is
  // a real AArch64 instruction; if it is not, conservatively do not process
  // it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

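// Illustrative example of the visitORR rewrite above (hypothetical MIR;
// LDRWui stands in for any real 32-bit-form instruction, which always zeroes
// the upper 32 bits of its destination):
//
//   %2:gpr32 = LDRWui %stack.0, 0
//   %3:gpr32 = ORRWrs $wzr, %2, 0
//   %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//   ==>
//   %2:gpr32 = LDRWui %stack.0, 0
//   %4:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32
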
bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the opcode is
  // a real AArch64 instruction; if it is not, conservatively do not process
  // it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replaced by:\n" << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

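// A worked trace of splitAddSubImm (illustrative): for Imm = 0x123456 the
// low 24 bits decompose as (0x123 << 12) + 0x456, both halves non-zero, and
// expandMOVImm needs two instructions (movz + movk) for this constant, so the
// split succeeds with Imm0 = 0x123 and Imm1 = 0x456. An ADDWrr against
// MOVi32imm 0x123456 then becomes, e.g.
//   add w8, w0, #0x123, lsl #12
//   add w0, w8, #0x456
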
template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the transformation below.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code uses are only for Equal and Not Equal.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the condition code uses last, since scanning the subsequent
        // instructions is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

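// Illustrative example of the visitADDSSUBS rewrite above (hypothetical MIR):
// for SUBSWrr %x, (MOVi32imm 0x123456) whose NZCV uses are only EQ/NE, the
// chosen pair is {SUBWri, SUBSWri}, so only the second instruction sets flags:
//
//   %tmp:gpr32 = SUBWri %x, 0x123, 12    (no flags set)
//   %dst:gpr32 = SUBSWri %tmp, 0x456, 0  (flags set on the final value)
//
// EQ/NE remain correct because Z depends only on the final result, while C
// and V uses are rejected by the lambda above since they could differ between
// the split form and a single subtraction.
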
// Checks if the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and whether MI is loop
  // invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it would
  // cause more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set
  // the upper 32 bits of Imm to zero. This is essential if the immediate
  // value was negative, since it was sign extended when assigned to the
  // 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine the register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination and source registers, and create the new
  // destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // If DstReg is not virtual (likely WZR or XZR), reuse that same destination
  // register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until deleting MI, and only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it is an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replaced by:\n" << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 implicitly zero the top bits of the
// containing register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

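// For example (illustrative): %1:fpr64 = FCVTNv4i16 %0:fpr128 qualifies,
// since a real AArch64 instruction writing a D register zeroes bits [127:64]
// of the containing Q register. Generic opcodes such as COPY are rejected
// because they provide no such guarantee.
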
bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We are expecting the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We are expecting the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
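  // The INSvi64lane only inserts a zero (from the high-64-bit chain) into
  // lane 1, and the low-64-bit def has already zeroed bits [127:64], so MI's
  // result is identical to its first vector operand. Replace all uses of the
  // old def with that operand.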
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfo>();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}