xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp (revision 9c77fb6aaa366cbabc80ee1b834bcfe4df135491)
1 //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs below peephole optimizations on MIR level.
10 //
11 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13 //
14 // 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
15 //    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
16 //
17 // 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
18 //    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19 //
20 //    The mov pseudo instruction could be expanded to multiple mov instructions
21 //    later. In this case, we could try to split the constant operand of mov
22 //    instruction into two immediates which can be directly encoded into
23 //    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24 //    multiple `mov` + `and/add/sub` instructions.
25 //
26 // 4. Remove redundant ORRWrs which is generated by zero-extend.
27 //
28 //    %3:gpr32 = ORRWrs $wzr, %2, 0
29 //    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30 //
31 //    If AArch64's 32-bit form of instruction defines the source operand of
32 //    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33 //    operand are set to zero.
34 //
35 // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
36 //     ==> %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
37 //
38 // 6. %intermediate:gpr32 = COPY %src:fpr128
39 //    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
40 //     ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
41 //
42 //    In cases where a source FPR is copied to a GPR in order to be copied
43 //    to a destination FPR, we can directly copy the values between the FPRs,
44 //    eliminating the use of the Integer unit. When we match a pattern of
45 //    INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
46 //    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
47 //    instructions.
48 //
49 // 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
50 //    64-bits. For example,
51 //
52 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
53 //   %2:fpr64 = MOVID 0
54 //   %4:fpr128 = IMPLICIT_DEF
55 //   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), %2:fpr64, %subreg.dsub
56 //   %6:fpr128 = IMPLICIT_DEF
57 //   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
58 //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, %3:fpr128, 0
59 //   ==>
60 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
61 //   %6:fpr128 = IMPLICIT_DEF
62 //   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), %1:fpr64, %subreg.dsub
63 //
64 // 8. Remove redundant CSELs that select between identical registers, by
65 //    replacing them with unconditional moves.
66 //
67 // 9. Replace UBFMXri with UBFMWri if the instruction is equivalent to a 32 bit
68 //    LSR or LSL alias of UBFM.
69 //
70 //===----------------------------------------------------------------------===//
71 
72 #include "AArch64ExpandImm.h"
73 #include "AArch64InstrInfo.h"
74 #include "MCTargetDesc/AArch64AddressingModes.h"
75 #include "llvm/CodeGen/MachineDominators.h"
76 #include "llvm/CodeGen/MachineLoopInfo.h"
77 
78 using namespace llvm;
79 
80 #define DEBUG_TYPE "aarch64-mi-peephole-opt"
81 
namespace {

/// Post-ISel MIR peephole pass for AArch64. Applies the local rewrites
/// described in the file header comment: splitting MOV-materialized
/// immediates into pairs of AND/ADD/SUB immediate instructions, removing
/// redundant zero-extension and high-64-bit zeroing patterns, folding
/// FPR->GPR->FPR insert chains, lowering trivial CSELs, and narrowing
/// UBFMXri to UBFMWri where equivalent.
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {}

  // Target objects and analyses cached for the function currently being
  // processed; set up in runOnMachineFunction.
  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  // Opcodes for the two instructions a split immediate expands into. The
  // members may differ, e.g. when only the second instruction should set
  // flags.
  using OpcodePair = std::pair<unsigned, unsigned>;
  // Callback deciding whether (Imm, RegSize) can be split; on success it
  // fills the two output immediates and returns the opcode pair to use.
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  // Callback that emits the two replacement instructions given the chosen
  // opcodes, the two immediates, and the source/temporary/destination
  // registers.
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use the splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  // Shared legality checks for the immediate-splitting rewrites; finds the
  // feeding MOV-immediate (possibly through a SUBREG_TO_REG).
  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  // One visit* handler per pattern listed in the file header.
  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitCSEL(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitUBFMXri(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only instructions are rewritten; the CFG is never modified.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace
154 
// Register the pass with the legacy pass manager under the
// "aarch64-mi-peephole-opt" command-line name.
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
157 
158 template <typename T>
159 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
160   T UImm = static_cast<T>(Imm);
161   if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
162     return false;
163 
164   // If this immediate can be handled by one instruction, do not split it.
165   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
166   AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
167   if (Insn.size() == 1)
168     return false;
169 
170   // The bitmask immediate consists of consecutive ones.  Let's say there is
171   // constant 0b00000000001000000000010000000000 which does not consist of
172   // consecutive ones. We can split it in to two bitmask immediate like
173   // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
174   // If we do AND with these two bitmask immediate, we can see original one.
175   unsigned LowestBitSet = llvm::countr_zero(UImm);
176   unsigned HighestBitSet = Log2_64(UImm);
177 
178   // Create a mask which is filled with one from the position of lowest bit set
179   // to the position of highest bit set.
180   T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
181               (static_cast<T>(1) << LowestBitSet);
182   // Create a mask which is filled with one outside the position of lowest bit
183   // set and the position of highest bit set.
184   T NewImm2 = UImm | ~NewImm1;
185 
186   // If the split value is not valid bitmask immediate, do not split this
187   // constant.
188   if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
189     return false;
190 
191   Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
192   Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
193   return true;
194 }
195 
196 template <typename T>
197 bool AArch64MIPeepholeOpt::visitAND(
198     unsigned Opc, MachineInstr &MI) {
199   // Try below transformation.
200   //
201   // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
202   // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
203   //
204   // The mov pseudo instruction could be expanded to multiple mov instructions
205   // later. Let's try to split the constant operand of mov instruction into two
206   // bitmask immediates. It makes only two AND instructions instead of multiple
207   // mov + and instructions.
208 
209   return splitTwoPartImm<T>(
210       MI,
211       [Opc](T Imm, unsigned RegSize, T &Imm0,
212             T &Imm1) -> std::optional<OpcodePair> {
213         if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
214           return std::make_pair(Opc, Opc);
215         return std::nullopt;
216       },
217       [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
218                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
219                    Register NewDstReg) {
220         DebugLoc DL = MI.getDebugLoc();
221         MachineBasicBlock *MBB = MI.getParent();
222         BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
223             .addReg(SrcReg)
224             .addImm(Imm0);
225         BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
226             .addReg(NewTmpReg)
227             .addImm(Imm1);
228       });
229 }
230 
bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check this ORR comes from below zero-extend pattern.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
  // real AArch64 instruction and if it is not, do not process the opcode
  // conservatively.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    // Accept either a direct FPR32 source, or the ssub sub-register of a
    // wider FPR64/FPR128/ZPR source; anything else is left alone.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass &&
          RC != &AArch64::ZPRRegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    // For a sub-register source, first copy it into a fresh FPR32 vreg so the
    // FMOVSWr below takes a full-register operand.
    Register CpySrc;
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    } else {
      CpySrc = SrcMI->getOperand(1).getReg();
    }
    // Replace the generic COPY with an explicit FMOVSWr into the same def
    // register, then delete the COPY.
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  }
  else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  // At this point the source is known to zero its upper 32 bits, so the ORR
  // is a no-op: forward the source register to all users and delete the ORR.
  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}
294 
295 bool AArch64MIPeepholeOpt::visitCSEL(MachineInstr &MI) {
296   // Replace CSEL with MOV when both inputs are the same register.
297   if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
298     return false;
299 
300   auto ZeroReg =
301       MI.getOpcode() == AArch64::CSELXr ? AArch64::XZR : AArch64::WZR;
302   auto OrOpcode =
303       MI.getOpcode() == AArch64::CSELXr ? AArch64::ORRXrs : AArch64::ORRWrs;
304 
305   BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(OrOpcode))
306       .addReg(MI.getOperand(0).getReg(), RegState::Define)
307       .addReg(ZeroReg)
308       .addReg(MI.getOperand(1).getReg())
309       .addImm(0);
310 
311   MI.eraseFromParent();
312   return true;
313 }
314 
315 bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
316   // Check this INSERT_SUBREG comes from below zero-extend pattern.
317   //
318   // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
319   // To   %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
320   //
321   // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
322   // COPY would destroy the upper part of the register anyway
323   if (!MI.isRegTiedToDefOperand(1))
324     return false;
325 
326   Register DstReg = MI.getOperand(0).getReg();
327   const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
328   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
329   if (!SrcMI)
330     return false;
331 
332   // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
333   //
334   // When you use the 32-bit form of an instruction, the upper 32 bits of the
335   // source registers are ignored and the upper 32 bits of the destination
336   // register are set to zero.
337   //
338   // If AArch64's 32-bit form of instruction defines the source operand of
339   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
340   // real AArch64 instruction and if it is not, do not process the opcode
341   // conservatively.
342   if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
343       !AArch64::GPR64allRegClass.hasSubClassEq(RC))
344     return false;
345 
346   // Build a SUBREG_TO_REG instruction
347   MachineInstr *SubregMI =
348       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
349               TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
350           .addImm(0)
351           .add(MI.getOperand(2))
352           .add(MI.getOperand(3));
353   LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *SubregMI << "\n");
354   (void)SubregMI;
355   MI.eraseFromParent();
356 
357   return true;
358 }
359 
360 template <typename T>
361 static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
362   // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
363   // imm0 and imm1 are non-zero 12-bit unsigned int.
364   if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
365       (Imm & ~static_cast<T>(0xffffff)) != 0)
366     return false;
367 
368   // The immediate can not be composed via a single instruction.
369   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
370   AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
371   if (Insn.size() == 1)
372     return false;
373 
374   // Split Imm into (Imm0 << 12) + Imm1;
375   Imm0 = (Imm >> 12) & 0xfff;
376   Imm1 = Imm & 0xfff;
377   return true;
378 }
379 
380 template <typename T>
381 bool AArch64MIPeepholeOpt::visitADDSUB(
382     unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
383   // Try below transformation.
384   //
385   // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
386   // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
387   //
388   // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
389   // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
390   //
391   // The mov pseudo instruction could be expanded to multiple mov instructions
392   // later. Let's try to split the constant operand of mov instruction into two
393   // legal add/sub immediates. It makes only two ADD/SUB instructions instead of
394   // multiple `mov` + `and/sub` instructions.
395 
396   // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
397   // folded. Make sure that we don't generate invalid instructions that use XZR
398   // in those cases.
399   if (MI.getOperand(1).getReg() == AArch64::XZR ||
400       MI.getOperand(1).getReg() == AArch64::WZR)
401     return false;
402 
403   return splitTwoPartImm<T>(
404       MI,
405       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
406                        T &Imm1) -> std::optional<OpcodePair> {
407         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
408           return std::make_pair(PosOpc, PosOpc);
409         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
410           return std::make_pair(NegOpc, NegOpc);
411         return std::nullopt;
412       },
413       [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
414                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
415                    Register NewDstReg) {
416         DebugLoc DL = MI.getDebugLoc();
417         MachineBasicBlock *MBB = MI.getParent();
418         BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
419             .addReg(SrcReg)
420             .addImm(Imm0)
421             .addImm(12);
422         BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
423             .addReg(NewTmpReg)
424             .addImm(Imm1)
425             .addImm(0);
426       });
427 }
428 
template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB but with additional requirement
  // that the condition code usages are only for Equal and Not Equal,
  // i.e. consumers of NZCV may read only the N and Z flags (enforced via
  // examineCFlagsUse below).

  // Same guard as visitADDSUB: do not split when the register operand is the
  // zero register.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last since it is expensive for scanning
        // proceeding instructions
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        // Reject if flag usage could not be analyzed or if C or V is read.
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        // Emit the shifted-by-12 half first; the second (possibly
        // flag-setting) opcode defines the final value.
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}
474 
475 // Checks if the corresponding MOV immediate instruction is applicable for
476 // this peephole optimization.
477 bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
478                                             MachineInstr *&MovMI,
479                                             MachineInstr *&SubregToRegMI) {
480   // Check whether current MBB is in loop and the AND is loop invariant.
481   MachineBasicBlock *MBB = MI.getParent();
482   MachineLoop *L = MLI->getLoopFor(MBB);
483   if (L && !L->isLoopInvariant(MI))
484     return false;
485 
486   // Check whether current MI's operand is MOV with immediate.
487   MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
488   if (!MovMI)
489     return false;
490 
491   // If it is SUBREG_TO_REG, check its operand.
492   SubregToRegMI = nullptr;
493   if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
494     SubregToRegMI = MovMI;
495     MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
496     if (!MovMI)
497       return false;
498   }
499 
500   if (MovMI->getOpcode() != AArch64::MOVi32imm &&
501       MovMI->getOpcode() != AArch64::MOVi64imm)
502     return false;
503 
504   // If the MOV has multiple uses, do not split the immediate because it causes
505   // more instructions.
506   if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
507     return false;
508   if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
509     return false;
510 
511   // It is OK to perform this peephole optimization.
512   return true;
513 }
514 
template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  // T selects the register width: uint32_t for W-form, uint64_t for X-form.
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32 bit form of instruction, the upper 32 bits of the destination
  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
  // of Imm to zero. This is essential if the Immediate value was a negative
  // number since it was sign extended when we assign to the 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. Opcodes might differ for
  // flag setting operations that should only set flags on second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get old registers destinations and new register destinations
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
  // reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instruction
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI. Only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Record the MIs need to be removed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}
596 
597 bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
598   // Check if this INSvi[X]gpr comes from COPY of a source FPR128
599   //
600   // From
601   //  %intermediate1:gpr64 = COPY %src:fpr128
602   //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
603   //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
604   // To
605   //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
606   //  src_index
607   // where src_index = 0, X = [8|16|32|64]
608 
609   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
610 
611   // For a chain of COPY instructions, find the initial source register
612   // and check if it's an FPR128
613   while (true) {
614     if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
615       return false;
616 
617     if (!SrcMI->getOperand(1).getReg().isVirtual())
618       return false;
619 
620     if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
621         &AArch64::FPR128RegClass) {
622       break;
623     }
624     SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
625   }
626 
627   Register DstReg = MI.getOperand(0).getReg();
628   Register SrcReg = SrcMI->getOperand(1).getReg();
629   MachineInstr *INSvilaneMI =
630       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
631           .add(MI.getOperand(1))
632           .add(MI.getOperand(2))
633           .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
634           .addImm(0);
635 
636   LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *INSvilaneMI << "\n");
637   (void)INSvilaneMI;
638   MI.eraseFromParent();
639   return true;
640 }
641 
642 // All instructions that set a FPR64 will implicitly zero the top bits of the
643 // register.
644 static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
645                                         MachineRegisterInfo *MRI) {
646   if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
647     return false;
648   const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
649   if (RC != &AArch64::FPR64RegClass)
650     return false;
651   return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
652 }
653 
654 bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
655   // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
656   // We are expecting below case.
657   //
658   //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
659   //  %6:fpr128 = IMPLICIT_DEF
660   //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
661   //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
662   MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
663   if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
664     return false;
665   Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
666   if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
667     return false;
668 
669   // Check there is `mov 0` MI for high 64-bits.
670   // We are expecting below cases.
671   //
672   //  %2:fpr64 = MOVID 0
673   //  %4:fpr128 = IMPLICIT_DEF
674   //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
675   //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
676   // or
677   //  %5:fpr128 = MOVIv2d_ns 0
678   //  %6:fpr64 = COPY %5.dsub:fpr128
679   //  %8:fpr128 = IMPLICIT_DEF
680   //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
681   //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
682   MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
683   if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
684     return false;
685   High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
686   if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
687     High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
688   if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
689                     High64MI->getOpcode() != AArch64::MOVIv2d_ns))
690     return false;
691   if (High64MI->getOperand(1).getImm() != 0)
692     return false;
693 
694   // Let's remove MIs for high 64-bits.
695   Register OldDef = MI.getOperand(0).getReg();
696   Register NewDef = MI.getOperand(1).getReg();
697   MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
698   MRI->replaceRegWith(OldDef, NewDef);
699   MI.eraseFromParent();
700 
701   return true;
702 }
703 
704 bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
705   // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
706   MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
707   if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
708     return false;
709 
710   // Let's remove MIs for high 64-bits.
711   Register OldDef = MI.getOperand(0).getReg();
712   Register NewDef = MI.getOperand(1).getReg();
713   LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
714   MRI->clearKillFlags(OldDef);
715   MRI->clearKillFlags(NewDef);
716   MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
717   MRI->replaceRegWith(OldDef, NewDef);
718   MI.eraseFromParent();
719 
720   return true;
721 }
722 
bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
  // Check if the instruction is equivalent to a 32 bit LSR or LSL alias of
  // UBFM, and replace the UBFMXri instruction with its 32 bit variant, UBFMWri.
  int64_t Immr = MI.getOperand(2).getImm();
  int64_t Imms = MI.getOperand(3).getImm();

  // Imms == 31 with Immr <= 31 extracts bits [Immr, 31], which reads only the
  // low 32 bits of the source — matching the 32-bit LSR #Immr alias.
  bool IsLSR = Imms == 31 && Immr <= Imms;
  // Immr == Imms + 33 makes the 64-bit UBFM equivalent to the 32-bit LSL
  // alias (Immr == Imms + 1 in the W-form encoding) once Immr is rebased by
  // 32 below; the result stays entirely within the low 32 bits.
  bool IsLSL = Immr == Imms + 33;
  if (!IsLSR && !IsLSL)
    return false;

  if (IsLSL) {
    // Rebase Immr from the 64-bit encoding space into the 32-bit one.
    Immr -= 32;
  }

  // Derive the 32-bit register classes from the instruction's 64-bit ones via
  // the sub_32 subregister, so the new vregs are correctly constrained.
  const TargetRegisterClass *DstRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI, *MI.getMF());
  const TargetRegisterClass *DstRC32 =
      TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
  assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                    "sub_32 subregister class");

  const TargetRegisterClass *SrcRC64 =
      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI, *MI.getMF());
  const TargetRegisterClass *SrcRC32 =
      TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
  assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
                    "subregister class");

  Register DstReg64 = MI.getOperand(0).getReg();
  Register DstReg32 = MRI->createVirtualRegister(DstRC32);
  Register SrcReg64 = MI.getOperand(1).getReg();
  Register SrcReg32 = MRI->createVirtualRegister(SrcRC32);

  // Extract the low 32 bits of the source, perform the 32-bit UBFM, then
  // widen back to 64 bits with SUBREG_TO_REG (the W-form instruction zeroes
  // the upper 32 bits of its destination, so the zero assertion holds).
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::COPY),
          SrcReg32)
      .addReg(SrcReg64, 0, AArch64::sub_32);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::UBFMWri),
          DstReg32)
      .addReg(SrcReg32)
      .addImm(Immr)
      .addImm(Imms);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AArch64::SUBREG_TO_REG), DstReg64)
      .addImm(0)
      .addReg(DstReg32)
      .addImm(AArch64::sub_32);
  MI.eraseFromParent();
  return true;
}
773 
// Across a basic-block we might have an i32 extract from a value that only
// operates on upper bits (for example a sxtw). We can replace the COPY with a
// new version skipping the sxtw.
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  // Only handle a COPY that reads the low 32 bits (sub_32) of a register
  // which has no other non-debug use.
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  // Walk back through a chain of single-use full copies to the defining
  // instruction, collecting every instruction on the chain so it can be
  // erased once the rewrite succeeds.
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  SmallPtrSet<MachineInstr *, 4> DeadInstrs;
  DeadInstrs.insert(SrcMI);
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
    DeadInstrs.insert(SrcMI);
  }

  if (!SrcMI)
    return false;

  // Look for SXTW(X) and return Reg.
  // sxtw is the SBFMXri alias with immr=0, imms=31; its low 32 bits equal
  // the low 32 bits of its input, so the extract can read the input instead.
  auto getSXTWSrcReg = [](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SBFMXri ||
        SrcMI->getOperand(2).getImm() != 0 ||
        SrcMI->getOperand(3).getImm() != 31)
      return AArch64::NoRegister;
    return SrcMI->getOperand(1).getReg();
  };
  // Look for SUBREG_TO_REG(ORRWrr(WZR, COPY(X.sub_32)))
  // i.e. a zero-extend of the low 32 bits of X; again the low 32 bits match
  // X's, so the extract can be redirected to X. The ORR is also marked dead
  // since only this pattern used it.
  auto getUXTWSrcReg = [&](MachineInstr *SrcMI) -> Register {
    if (SrcMI->getOpcode() != AArch64::SUBREG_TO_REG ||
        SrcMI->getOperand(3).getImm() != AArch64::sub_32 ||
        !MRI->hasOneNonDBGUse(SrcMI->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Orr = MRI->getUniqueVRegDef(SrcMI->getOperand(2).getReg());
    if (!Orr || Orr->getOpcode() != AArch64::ORRWrr ||
        Orr->getOperand(1).getReg() != AArch64::WZR ||
        !MRI->hasOneNonDBGUse(Orr->getOperand(2).getReg()))
      return AArch64::NoRegister;
    MachineInstr *Cpy = MRI->getUniqueVRegDef(Orr->getOperand(2).getReg());
    if (!Cpy || Cpy->getOpcode() != AArch64::COPY ||
        Cpy->getOperand(1).getSubReg() != AArch64::sub_32)
      return AArch64::NoRegister;
    DeadInstrs.insert(Orr);
    return Cpy->getOperand(1).getReg();
  };

  // Try the sign-extend pattern first, then the zero-extend pattern.
  Register SrcReg = getSXTWSrcReg(SrcMI);
  if (!SrcReg)
    SrcReg = getUXTWSrcReg(SrcMI);
  if (!SrcReg)
    return false;

  // Redirect the COPY to read SrcReg directly and delete the bypassed
  // instructions (each was verified above to have this chain as its only
  // non-debug use).
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
  MI.getOperand(1).setReg(SrcReg);
  LLVM_DEBUG(dbgs() << "        to: " << MI);
  for (auto *DeadMI : DeadInstrs) {
    LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
    DeadMI->eraseFromParent();
  }
  return true;
}
838 
839 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
840   if (skipFunction(MF.getFunction()))
841     return false;
842 
843   TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
844   TRI = static_cast<const AArch64RegisterInfo *>(
845       MF.getSubtarget().getRegisterInfo());
846   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
847   MRI = &MF.getRegInfo();
848 
849   assert(MRI->isSSA() && "Expected to be run on SSA form!");
850 
851   bool Changed = false;
852 
853   for (MachineBasicBlock &MBB : MF) {
854     for (MachineInstr &MI : make_early_inc_range(MBB)) {
855       switch (MI.getOpcode()) {
856       default:
857         break;
858       case AArch64::INSERT_SUBREG:
859         Changed |= visitINSERT(MI);
860         break;
861       case AArch64::ANDWrr:
862         Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
863         break;
864       case AArch64::ANDXrr:
865         Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
866         break;
867       case AArch64::ORRWrs:
868         Changed |= visitORR(MI);
869         break;
870       case AArch64::ADDWrr:
871         Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
872         break;
873       case AArch64::SUBWrr:
874         Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
875         break;
876       case AArch64::ADDXrr:
877         Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
878         break;
879       case AArch64::SUBXrr:
880         Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
881         break;
882       case AArch64::ADDSWrr:
883         Changed |=
884             visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
885                                     {AArch64::SUBWri, AArch64::SUBSWri}, MI);
886         break;
887       case AArch64::SUBSWrr:
888         Changed |=
889             visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
890                                     {AArch64::ADDWri, AArch64::ADDSWri}, MI);
891         break;
892       case AArch64::ADDSXrr:
893         Changed |=
894             visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
895                                     {AArch64::SUBXri, AArch64::SUBSXri}, MI);
896         break;
897       case AArch64::SUBSXrr:
898         Changed |=
899             visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
900                                     {AArch64::ADDXri, AArch64::ADDSXri}, MI);
901         break;
902       case AArch64::CSELWr:
903       case AArch64::CSELXr:
904         Changed |= visitCSEL(MI);
905         break;
906       case AArch64::INSvi64gpr:
907         Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
908         break;
909       case AArch64::INSvi32gpr:
910         Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
911         break;
912       case AArch64::INSvi16gpr:
913         Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
914         break;
915       case AArch64::INSvi8gpr:
916         Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
917         break;
918       case AArch64::INSvi64lane:
919         Changed |= visitINSvi64lane(MI);
920         break;
921       case AArch64::FMOVDr:
922         Changed |= visitFMOVDr(MI);
923         break;
924       case AArch64::UBFMXri:
925         Changed |= visitUBFMXri(MI);
926         break;
927       case AArch64::COPY:
928         Changed |= visitCopy(MI);
929         break;
930       }
931     }
932   }
933 
934   return Changed;
935 }
936 
937 FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
938   return new AArch64MIPeepholeOpt();
939 }
940