//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In this case, we could try to split the constant operand of the
//    mov instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. This makes two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If the source operand of the ORRWrs is defined by a 32-bit form of an
//    AArch64 instruction, we can remove the ORRWrs because the upper 32 bits
//    of the source operand are already zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can directly copy the values between the FPRs,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If an MI implicitly sets the high 64 bits to zero, remove the `mov 0`
//    for the high 64 bits. For example,
//
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %2:fpr64 = MOVID 0
//    %4:fpr128 = IMPLICIT_DEF
//    %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//    %6:fpr128 = IMPLICIT_DEF
//    %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//    %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %6:fpr128 = IMPLICIT_DEF
//    %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///  %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///  %tmp = <Instr>ri %src (encode half IMM) [...]
  ///  %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // The bitmask immediate consists of consecutive ones. For example, the
  // constant 0b00000000001000000000010000000000 does not consist of
  // consecutive ones, but we can split it into the two bitmask immediates
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // If we AND with these two bitmask immediates, we get the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with ones from the position of the lowest
  // bit set to the position of the highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with ones outside the positions of the
  // lowest and highest bits set.
  T NewImm2 = UImm | ~NewImm1;
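  // Note that every set bit of UImm lies inside NewImm1, so
  // NewImm1 & NewImm2 == UImm: ANDing with the two masks in sequence
  // reproduces the original constant.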

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
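        // AND with the first bitmask immediate, then AND that result with the
        // second; the intersection of the two masks is the original immediate.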
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so build the FMOVSWr now so
    // that we know the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
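    // If the COPY reads the ssub sub-register of a larger FPR, copy that
    // sub-register into a dedicated FPR32 first so that the FMOVSWr below
    // reads a plain FPR32 register.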
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // Do not split if the immediate can be composed by a single mov instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
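  // For example, Imm = 0x123456 splits into Imm0 = 0x123 and Imm1 = 0x456.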
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
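        // The first instruction applies Imm0 with an LSL #12 shift (the
        // trailing immediate 12); the second applies the low 12 bits with no
        // shift (trailing 0).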
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code usages are only for Equal and Not Equal.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last, since scanning the surrounding
        // instructions for them is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
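        // Only Opcode.second is the flag-setting (S) variant, so NZCV comes
        // from the final addition/subtraction; its Z flag matches the full
        // result, which is why only EQ/NE users are allowed above.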
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks whether the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and the MI is loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of the instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value
  // was a negative number, since it was sign-extended when assigned to the
  // 64-bit Imm.
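  // For example, a MOVi32imm of -1 arrives here as 0xFFFFFFFFFFFFFFFF and is
  // masked back to 0xFFFFFFFF.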
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // If DstReg is not virtual (likely WZR or XZR), reuse that same destination
  // register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until MI is deleted, and only do this if we made a new destination
  // register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check whether this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}


// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
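  // Opcodes above GENERIC_OP_END are real target instructions rather than
  // pre-isel generic opcodes, so the implicit zeroing described above applies.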
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We expect the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for
  // GPRs.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Remove the FMOVDr; its input already has the high 64 bits zeroed.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}


// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example an sxtw). We can replace the COPY
// with a new version that skips the sxtw.
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

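  // An SBFMXri with immr == 0 and imms == 31 is a sxtw: its low 32 bits equal
  // the low 32 bits of its source, so the sub_32 COPY can read the sxtw's
  // source register directly.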
  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
    return false;

  Register SrcReg = SrcMI->getOperand(1).getReg();
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << " to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << " Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}