1 //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs below peephole optimizations on MIR level. 10 // 11 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri 12 // MOVi64imm + ANDXrr ==> ANDXri + ANDXri 13 // 14 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi 15 // MOVi64imm + ADDXrr ==> ANDXri + ANDXri 16 // 17 // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi 18 // MOVi64imm + SUBXrr ==> SUBXri + SUBXri 19 // 20 // The mov pseudo instruction could be expanded to multiple mov instructions 21 // later. In this case, we could try to split the constant operand of mov 22 // instruction into two immediates which can be directly encoded into 23 // *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of 24 // multiple `mov` + `and/add/sub` instructions. 25 // 26 // 4. Remove redundant ORRWrs which is generated by zero-extend. 27 // 28 // %3:gpr32 = ORRWrs $wzr, %2, 0 29 // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 30 // 31 // If AArch64's 32-bit form of instruction defines the source operand of 32 // ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source 33 // operand are set to zero. 34 // 35 //===----------------------------------------------------------------------===// 36 37 #include "AArch64ExpandImm.h" 38 #include "AArch64InstrInfo.h" 39 #include "MCTargetDesc/AArch64AddressingModes.h" 40 #include "llvm/ADT/Optional.h" 41 #include "llvm/ADT/SetVector.h" 42 #include "llvm/CodeGen/MachineDominators.h" 43 #include "llvm/CodeGen/MachineLoopInfo.h" 44 45 using namespace llvm; 46 47 #define DEBUG_TYPE "aarch64-mi-peephole-opt" 48 49 namespace { 50 51 struct AArch64MIPeepholeOpt : public MachineFunctionPass { 52 static char ID; 53 54 AArch64MIPeepholeOpt() : MachineFunctionPass(ID) { 55 initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry()); 56 } 57 58 const AArch64InstrInfo *TII; 59 const AArch64RegisterInfo *TRI; 60 MachineLoopInfo *MLI; 61 MachineRegisterInfo *MRI; 62 63 template <typename T> 64 using SplitAndOpcFunc = 65 std::function<Optional<unsigned>(T, unsigned, T &, T &)>; 66 using BuildMIFunc = 67 std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register, 68 Register, Register)>; 69 70 /// For instructions where an immediate operand could be split into two 71 /// separate immediate instructions, use the splitTwoPartImm two handle the 72 /// optimization. 73 /// 74 /// To implement, the following function types must be passed to 75 /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if 76 /// splitting the immediate is valid and returns the associated new opcode. A 77 /// BuildMIFunc must be implemented to build the two immediate instructions. 78 /// 79 /// Example Pattern (where IMM would require 2+ MOV instructions): 80 /// %dst = <Instr>rr %src IMM [...] 81 /// becomes: 82 /// %tmp = <Instr>ri %src (encode half IMM) [...] 83 /// %dst = <Instr>ri %tmp (encode half IMM) [...] 84 template <typename T> 85 bool splitTwoPartImm(MachineInstr &MI, 86 SmallSetVector<MachineInstr *, 8> &ToBeRemoved, 87 SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr); 88 89 bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, 90 MachineInstr *&SubregToRegMI); 91 92 template <typename T> 93 bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, 94 SmallSetVector<MachineInstr *, 8> &ToBeRemoved); 95 template <typename T> 96 bool visitAND(unsigned Opc, MachineInstr &MI, 97 SmallSetVector<MachineInstr *, 8> &ToBeRemoved); 98 bool visitORR(MachineInstr &MI, 99 SmallSetVector<MachineInstr *, 8> &ToBeRemoved); 100 bool runOnMachineFunction(MachineFunction &MF) override; 101 102 StringRef getPassName() const override { 103 return "AArch64 MI Peephole Optimization pass"; 104 } 105 106 void getAnalysisUsage(AnalysisUsage &AU) const override { 107 AU.setPreservesCFG(); 108 AU.addRequired<MachineLoopInfo>(); 109 MachineFunctionPass::getAnalysisUsage(AU); 110 } 111 }; 112 113 char AArch64MIPeepholeOpt::ID = 0; 114 115 } // end anonymous namespace 116 117 INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", 118 "AArch64 MI Peephole Optimization", false, false) 119 120 template <typename T> 121 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { 122 T UImm = static_cast<T>(Imm); 123 if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) 124 return false; 125 126 // If this immediate can be handled by one instruction, do not split it. 127 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 128 AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); 129 if (Insn.size() == 1) 130 return false; 131 132 // The bitmask immediate consists of consecutive ones. Let's say there is 133 // constant 0b00000000001000000000010000000000 which does not consist of 134 // consecutive ones. We can split it in to two bitmask immediate like 135 // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111. 136 // If we do AND with these two bitmask immediate, we can see original one. 137 unsigned LowestBitSet = countTrailingZeros(UImm); 138 unsigned HighestBitSet = Log2_64(UImm); 139 140 // Create a mask which is filled with one from the position of lowest bit set 141 // to the position of highest bit set. 142 T NewImm1 = (static_cast<T>(2) << HighestBitSet) - 143 (static_cast<T>(1) << LowestBitSet); 144 // Create a mask which is filled with one outside the position of lowest bit 145 // set and the position of highest bit set. 146 T NewImm2 = UImm | ~NewImm1; 147 148 // If the split value is not valid bitmask immediate, do not split this 149 // constant. 150 if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) 151 return false; 152 153 Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); 154 Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); 155 return true; 156 } 157 158 template <typename T> 159 bool AArch64MIPeepholeOpt::visitAND( 160 unsigned Opc, MachineInstr &MI, 161 SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { 162 // Try below transformation. 163 // 164 // MOVi32imm + ANDWrr ==> ANDWri + ANDWri 165 // MOVi64imm + ANDXrr ==> ANDXri + ANDXri 166 // 167 // The mov pseudo instruction could be expanded to multiple mov instructions 168 // later. Let's try to split the constant operand of mov instruction into two 169 // bitmask immediates. It makes only two AND instructions intead of multiple 170 // mov + and instructions. 171 172 return splitTwoPartImm<T>( 173 MI, ToBeRemoved, 174 [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> { 175 if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) 176 return Opc; 177 return None; 178 }, 179 [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, 180 unsigned Imm1, Register SrcReg, Register NewTmpReg, 181 Register NewDstReg) { 182 DebugLoc DL = MI.getDebugLoc(); 183 MachineBasicBlock *MBB = MI.getParent(); 184 BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) 185 .addReg(SrcReg) 186 .addImm(Imm0); 187 BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) 188 .addReg(NewTmpReg) 189 .addImm(Imm1); 190 }); 191 } 192 193 bool AArch64MIPeepholeOpt::visitORR( 194 MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { 195 // Check this ORR comes from below zero-extend pattern. 196 // 197 // def : Pat<(i64 (zext GPR32:$src)), 198 // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; 199 if (MI.getOperand(3).getImm() != 0) 200 return false; 201 202 if (MI.getOperand(1).getReg() != AArch64::WZR) 203 return false; 204 205 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); 206 if (!SrcMI) 207 return false; 208 209 // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC 210 // 211 // When you use the 32-bit form of an instruction, the upper 32 bits of the 212 // source registers are ignored and the upper 32 bits of the destination 213 // register are set to zero. 214 // 215 // If AArch64's 32-bit form of instruction defines the source operand of 216 // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is 217 // real AArch64 instruction and if it is not, do not process the opcode 218 // conservatively. 219 if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) 220 return false; 221 222 Register DefReg = MI.getOperand(0).getReg(); 223 Register SrcReg = MI.getOperand(2).getReg(); 224 MRI->replaceRegWith(DefReg, SrcReg); 225 MRI->clearKillFlags(SrcReg); 226 // replaceRegWith changes MI's definition register. Keep it for SSA form until 227 // deleting MI. 228 MI.getOperand(0).setReg(DefReg); 229 ToBeRemoved.insert(&MI); 230 231 LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); 232 233 return true; 234 } 235 236 template <typename T> 237 static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { 238 // The immediate must be in the form of ((imm0 << 12) + imm1), in which both 239 // imm0 and imm1 are non-zero 12-bit unsigned int. 240 if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || 241 (Imm & ~static_cast<T>(0xffffff)) != 0) 242 return false; 243 244 // The immediate can not be composed via a single instruction. 245 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 246 AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); 247 if (Insn.size() == 1) 248 return false; 249 250 // Split Imm into (Imm0 << 12) + Imm1; 251 Imm0 = (Imm >> 12) & 0xfff; 252 Imm1 = Imm & 0xfff; 253 return true; 254 } 255 256 template <typename T> 257 bool AArch64MIPeepholeOpt::visitADDSUB( 258 unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, 259 SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { 260 // Try below transformation. 261 // 262 // MOVi32imm + ADDWrr ==> ADDWri + ADDWri 263 // MOVi64imm + ADDXrr ==> ADDXri + ADDXri 264 // 265 // MOVi32imm + SUBWrr ==> SUBWri + SUBWri 266 // MOVi64imm + SUBXrr ==> SUBXri + SUBXri 267 // 268 // The mov pseudo instruction could be expanded to multiple mov instructions 269 // later. Let's try to split the constant operand of mov instruction into two 270 // legal add/sub immediates. It makes only two ADD/SUB instructions intead of 271 // multiple `mov` + `and/sub` instructions. 272 273 return splitTwoPartImm<T>( 274 MI, ToBeRemoved, 275 [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, 276 T &Imm1) -> Optional<unsigned> { 277 if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) 278 return PosOpc; 279 if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) 280 return NegOpc; 281 return None; 282 }, 283 [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, 284 unsigned Imm1, Register SrcReg, Register NewTmpReg, 285 Register NewDstReg) { 286 DebugLoc DL = MI.getDebugLoc(); 287 MachineBasicBlock *MBB = MI.getParent(); 288 BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) 289 .addReg(SrcReg) 290 .addImm(Imm0) 291 .addImm(12); 292 BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) 293 .addReg(NewTmpReg) 294 .addImm(Imm1) 295 .addImm(0); 296 }); 297 } 298 299 // Checks if the corresponding MOV immediate instruction is applicable for 300 // this peephole optimization. 301 bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, 302 MachineInstr *&MovMI, 303 MachineInstr *&SubregToRegMI) { 304 // Check whether current MBB is in loop and the AND is loop invariant. 305 MachineBasicBlock *MBB = MI.getParent(); 306 MachineLoop *L = MLI->getLoopFor(MBB); 307 if (L && !L->isLoopInvariant(MI)) 308 return false; 309 310 // Check whether current MI's operand is MOV with immediate. 311 MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); 312 if (!MovMI) 313 return false; 314 315 // If it is SUBREG_TO_REG, check its operand. 316 SubregToRegMI = nullptr; 317 if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { 318 SubregToRegMI = MovMI; 319 MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); 320 if (!MovMI) 321 return false; 322 } 323 324 if (MovMI->getOpcode() != AArch64::MOVi32imm && 325 MovMI->getOpcode() != AArch64::MOVi64imm) 326 return false; 327 328 // If the MOV has multiple uses, do not split the immediate because it causes 329 // more instructions. 330 if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) 331 return false; 332 if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) 333 return false; 334 335 // It is OK to perform this peephole optimization. 336 return true; 337 } 338 339 template <typename T> 340 bool AArch64MIPeepholeOpt::splitTwoPartImm( 341 MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved, 342 SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { 343 unsigned RegSize = sizeof(T) * 8; 344 assert((RegSize == 32 || RegSize == 64) && 345 "Invalid RegSize for legal immediate peephole optimization"); 346 347 // Perform several essential checks against current MI. 348 MachineInstr *MovMI, *SubregToRegMI; 349 if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) 350 return false; 351 352 // Split the immediate to Imm0 and Imm1, and calculate the Opcode. 353 T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1; 354 // For the 32 bit form of instruction, the upper 32 bits of the destination 355 // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits 356 // of Imm to zero. This is essential if the Immediate value was a negative 357 // number since it was sign extended when we assign to the 64-bit Imm. 358 if (SubregToRegMI) 359 Imm &= 0xFFFFFFFF; 360 unsigned Opcode; 361 if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) 362 Opcode = R.getValue(); 363 else 364 return false; 365 366 // Create new ADD/SUB MIs. 367 MachineFunction *MF = MI.getMF(); 368 const TargetRegisterClass *RC = 369 TII->getRegClass(TII->get(Opcode), 0, TRI, *MF); 370 const TargetRegisterClass *ORC = 371 TII->getRegClass(TII->get(Opcode), 1, TRI, *MF); 372 Register DstReg = MI.getOperand(0).getReg(); 373 Register SrcReg = MI.getOperand(1).getReg(); 374 Register NewTmpReg = MRI->createVirtualRegister(RC); 375 Register NewDstReg = MRI->createVirtualRegister(RC); 376 377 MRI->constrainRegClass(SrcReg, RC); 378 MRI->constrainRegClass(NewTmpReg, ORC); 379 MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); 380 381 BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); 382 383 MRI->replaceRegWith(DstReg, NewDstReg); 384 // replaceRegWith changes MI's definition register. Keep it for SSA form until 385 // deleting MI. 386 MI.getOperand(0).setReg(DstReg); 387 388 // Record the MIs need to be removed. 389 ToBeRemoved.insert(&MI); 390 if (SubregToRegMI) 391 ToBeRemoved.insert(SubregToRegMI); 392 ToBeRemoved.insert(MovMI); 393 394 return true; 395 } 396 397 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { 398 if (skipFunction(MF.getFunction())) 399 return false; 400 401 TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); 402 TRI = static_cast<const AArch64RegisterInfo *>( 403 MF.getSubtarget().getRegisterInfo()); 404 MLI = &getAnalysis<MachineLoopInfo>(); 405 MRI = &MF.getRegInfo(); 406 407 assert(MRI->isSSA() && "Expected to be run on SSA form!"); 408 409 bool Changed = false; 410 SmallSetVector<MachineInstr *, 8> ToBeRemoved; 411 412 for (MachineBasicBlock &MBB : MF) { 413 for (MachineInstr &MI : MBB) { 414 switch (MI.getOpcode()) { 415 default: 416 break; 417 case AArch64::ANDWrr: 418 Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved); 419 break; 420 case AArch64::ANDXrr: 421 Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved); 422 break; 423 case AArch64::ORRWrs: 424 Changed = visitORR(MI, ToBeRemoved); 425 break; 426 case AArch64::ADDWrr: 427 Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI, 428 ToBeRemoved); 429 break; 430 case AArch64::SUBWrr: 431 Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI, 432 ToBeRemoved); 433 break; 434 case AArch64::ADDXrr: 435 Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI, 436 ToBeRemoved); 437 break; 438 case AArch64::SUBXrr: 439 Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI, 440 ToBeRemoved); 441 break; 442 } 443 } 444 } 445 446 for (MachineInstr *MI : ToBeRemoved) 447 MI->eraseFromParent(); 448 449 return Changed; 450 } 451 452 FunctionPass *llvm::createAArch64MIPeepholeOptPass() { 453 return new AArch64MIPeepholeOpt(); 454 } 455