xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
1 //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs below peephole optimizations on MIR level.
10 //
11 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13 //
14 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
15 //    MOVi64imm + ADDXrr ==> ANDXri + ANDXri
16 //
17 // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
18 //    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19 //
20 //    The mov pseudo instruction could be expanded to multiple mov instructions
21 //    later. In this case, we could try to split the constant  operand of mov
22 //    instruction into two immediates which can be directly encoded into
23 //    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24 //    multiple `mov` + `and/add/sub` instructions.
25 //
26 // 4. Remove redundant ORRWrs which is generated by zero-extend.
27 //
28 //    %3:gpr32 = ORRWrs $wzr, %2, 0
29 //    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30 //
31 //    If AArch64's 32-bit form of instruction defines the source operand of
32 //    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33 //    operand are set to zero.
34 //
35 // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
36 //     ==> %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
37 //
38 // 6. %intermediate:gpr32 = COPY %src:fpr128
39 //    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
40 //     ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
41 //
42 //    In cases where a source FPR is copied to a GPR in order to be copied
43 //    to a destination FPR, we can directly copy the values between the FPRs,
44 //    eliminating the use of the Integer unit. When we match a pattern of
45 //    INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
46 //    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
47 //    instructions.
48 //
49 // 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
50 //    64-bits. For example,
51 //
52 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
53 //   %2:fpr64 = MOVID 0
54 //   %4:fpr128 = IMPLICIT_DEF
55 //   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
56 //   %6:fpr128 = IMPLICIT_DEF
57 //   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
58 //   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
59 //   ==>
60 //   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
61 //   %6:fpr128 = IMPLICIT_DEF
62 //   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
63 //
64 //===----------------------------------------------------------------------===//
65 
66 #include "AArch64ExpandImm.h"
67 #include "AArch64InstrInfo.h"
68 #include "MCTargetDesc/AArch64AddressingModes.h"
69 #include "llvm/CodeGen/MachineDominators.h"
70 #include "llvm/CodeGen/MachineLoopInfo.h"
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "aarch64-mi-peephole-opt"
75 
76 namespace {
77 
78 struct AArch64MIPeepholeOpt : public MachineFunctionPass {
79   static char ID;
80 
81   AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
82     initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
83   }
84 
85   const AArch64InstrInfo *TII;
86   const AArch64RegisterInfo *TRI;
87   MachineLoopInfo *MLI;
88   MachineRegisterInfo *MRI;
89 
90   using OpcodePair = std::pair<unsigned, unsigned>;
91   template <typename T>
92   using SplitAndOpcFunc =
93       std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
94   using BuildMIFunc =
95       std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
96                          Register, Register, Register)>;
97 
98   /// For instructions where an immediate operand could be split into two
99   /// separate immediate instructions, use the splitTwoPartImm two handle the
100   /// optimization.
101   ///
102   /// To implement, the following function types must be passed to
103   /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
104   /// splitting the immediate is valid and returns the associated new opcode. A
105   /// BuildMIFunc must be implemented to build the two immediate instructions.
106   ///
107   /// Example Pattern (where IMM would require 2+ MOV instructions):
108   ///     %dst = <Instr>rr %src IMM [...]
109   /// becomes:
110   ///     %tmp = <Instr>ri %src (encode half IMM) [...]
111   ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
112   template <typename T>
113   bool splitTwoPartImm(MachineInstr &MI,
114                        SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
115 
116   bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
117                         MachineInstr *&SubregToRegMI);
118 
119   template <typename T>
120   bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
121   template <typename T>
122   bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
123 
124   template <typename T>
125   bool visitAND(unsigned Opc, MachineInstr &MI);
126   bool visitORR(MachineInstr &MI);
127   bool visitINSERT(MachineInstr &MI);
128   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
129   bool visitINSvi64lane(MachineInstr &MI);
130   bool visitFMOVDr(MachineInstr &MI);
131   bool visitCopy(MachineInstr &MI);
132   bool runOnMachineFunction(MachineFunction &MF) override;
133 
134   StringRef getPassName() const override {
135     return "AArch64 MI Peephole Optimization pass";
136   }
137 
138   void getAnalysisUsage(AnalysisUsage &AU) const override {
139     AU.setPreservesCFG();
140     AU.addRequired<MachineLoopInfoWrapperPass>();
141     MachineFunctionPass::getAnalysisUsage(AU);
142   }
143 };
144 
145 char AArch64MIPeepholeOpt::ID = 0;
146 
147 } // end anonymous namespace
148 
149 INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
150                 "AArch64 MI Peephole Optimization", false, false)
151 
152 template <typename T>
153 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
154   T UImm = static_cast<T>(Imm);
155   if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
156     return false;
157 
158   // If this immediate can be handled by one instruction, do not split it.
159   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
160   AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
161   if (Insn.size() == 1)
162     return false;
163 
164   // The bitmask immediate consists of consecutive ones.  Let's say there is
165   // constant 0b00000000001000000000010000000000 which does not consist of
166   // consecutive ones. We can split it in to two bitmask immediate like
167   // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
168   // If we do AND with these two bitmask immediate, we can see original one.
169   unsigned LowestBitSet = llvm::countr_zero(UImm);
170   unsigned HighestBitSet = Log2_64(UImm);
171 
172   // Create a mask which is filled with one from the position of lowest bit set
173   // to the position of highest bit set.
174   T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
175               (static_cast<T>(1) << LowestBitSet);
176   // Create a mask which is filled with one outside the position of lowest bit
177   // set and the position of highest bit set.
178   T NewImm2 = UImm | ~NewImm1;
179 
180   // If the split value is not valid bitmask immediate, do not split this
181   // constant.
182   if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
183     return false;
184 
185   Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
186   Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
187   return true;
188 }
189 
190 template <typename T>
191 bool AArch64MIPeepholeOpt::visitAND(
192     unsigned Opc, MachineInstr &MI) {
193   // Try below transformation.
194   //
195   // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
196   // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
197   //
198   // The mov pseudo instruction could be expanded to multiple mov instructions
199   // later. Let's try to split the constant operand of mov instruction into two
200   // bitmask immediates. It makes only two AND instructions intead of multiple
201   // mov + and instructions.
202 
203   return splitTwoPartImm<T>(
204       MI,
205       [Opc](T Imm, unsigned RegSize, T &Imm0,
206             T &Imm1) -> std::optional<OpcodePair> {
207         if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
208           return std::make_pair(Opc, Opc);
209         return std::nullopt;
210       },
211       [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
212                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
213                    Register NewDstReg) {
214         DebugLoc DL = MI.getDebugLoc();
215         MachineBasicBlock *MBB = MI.getParent();
216         BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
217             .addReg(SrcReg)
218             .addImm(Imm0);
219         BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
220             .addReg(NewTmpReg)
221             .addImm(Imm1);
222       });
223 }
224 
225 bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
226   // Check this ORR comes from below zero-extend pattern.
227   //
228   // def : Pat<(i64 (zext GPR32:$src)),
229   //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
230   if (MI.getOperand(3).getImm() != 0)
231     return false;
232 
233   if (MI.getOperand(1).getReg() != AArch64::WZR)
234     return false;
235 
236   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
237   if (!SrcMI)
238     return false;
239 
240   // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
241   //
242   // When you use the 32-bit form of an instruction, the upper 32 bits of the
243   // source registers are ignored and the upper 32 bits of the destination
244   // register are set to zero.
245   //
246   // If AArch64's 32-bit form of instruction defines the source operand of
247   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
248   // real AArch64 instruction and if it is not, do not process the opcode
249   // conservatively.
250   if (SrcMI->getOpcode() == TargetOpcode::COPY &&
251       SrcMI->getOperand(1).getReg().isVirtual()) {
252     const TargetRegisterClass *RC =
253         MRI->getRegClass(SrcMI->getOperand(1).getReg());
254 
255     // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
256     // that the upper bits are zero.
257     if (RC != &AArch64::FPR32RegClass &&
258         ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
259          SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
260       return false;
261     Register CpySrc = SrcMI->getOperand(1).getReg();
262     if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
263       CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
264       BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
265               TII->get(TargetOpcode::COPY), CpySrc)
266           .add(SrcMI->getOperand(1));
267     }
268     BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
269             TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
270         .addReg(CpySrc);
271     SrcMI->eraseFromParent();
272   }
273   else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
274     return false;
275 
276   Register DefReg = MI.getOperand(0).getReg();
277   Register SrcReg = MI.getOperand(2).getReg();
278   MRI->replaceRegWith(DefReg, SrcReg);
279   MRI->clearKillFlags(SrcReg);
280   LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
281   MI.eraseFromParent();
282 
283   return true;
284 }
285 
286 bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
287   // Check this INSERT_SUBREG comes from below zero-extend pattern.
288   //
289   // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
290   // To   %reg:subidx =  SUBREG_TO_REG 0, %subreg, subidx
291   //
292   // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
293   // COPY would destroy the upper part of the register anyway
294   if (!MI.isRegTiedToDefOperand(1))
295     return false;
296 
297   Register DstReg = MI.getOperand(0).getReg();
298   const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
299   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
300   if (!SrcMI)
301     return false;
302 
303   // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
304   //
305   // When you use the 32-bit form of an instruction, the upper 32 bits of the
306   // source registers are ignored and the upper 32 bits of the destination
307   // register are set to zero.
308   //
309   // If AArch64's 32-bit form of instruction defines the source operand of
310   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
311   // real AArch64 instruction and if it is not, do not process the opcode
312   // conservatively.
313   if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
314       !AArch64::GPR64allRegClass.hasSubClassEq(RC))
315     return false;
316 
317   // Build a SUBREG_TO_REG instruction
318   MachineInstr *SubregMI =
319       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
320               TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
321           .addImm(0)
322           .add(MI.getOperand(2))
323           .add(MI.getOperand(3));
324   LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *SubregMI << "\n");
325   (void)SubregMI;
326   MI.eraseFromParent();
327 
328   return true;
329 }
330 
331 template <typename T>
332 static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
333   // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
334   // imm0 and imm1 are non-zero 12-bit unsigned int.
335   if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
336       (Imm & ~static_cast<T>(0xffffff)) != 0)
337     return false;
338 
339   // The immediate can not be composed via a single instruction.
340   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
341   AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
342   if (Insn.size() == 1)
343     return false;
344 
345   // Split Imm into (Imm0 << 12) + Imm1;
346   Imm0 = (Imm >> 12) & 0xfff;
347   Imm1 = Imm & 0xfff;
348   return true;
349 }
350 
351 template <typename T>
352 bool AArch64MIPeepholeOpt::visitADDSUB(
353     unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
354   // Try below transformation.
355   //
356   // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
357   // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
358   //
359   // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
360   // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
361   //
362   // The mov pseudo instruction could be expanded to multiple mov instructions
363   // later. Let's try to split the constant operand of mov instruction into two
364   // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
365   // multiple `mov` + `and/sub` instructions.
366 
367   // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
368   // folded. Make sure that we don't generate invalid instructions that use XZR
369   // in those cases.
370   if (MI.getOperand(1).getReg() == AArch64::XZR ||
371       MI.getOperand(1).getReg() == AArch64::WZR)
372     return false;
373 
374   return splitTwoPartImm<T>(
375       MI,
376       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
377                        T &Imm1) -> std::optional<OpcodePair> {
378         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
379           return std::make_pair(PosOpc, PosOpc);
380         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
381           return std::make_pair(NegOpc, NegOpc);
382         return std::nullopt;
383       },
384       [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
385                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
386                    Register NewDstReg) {
387         DebugLoc DL = MI.getDebugLoc();
388         MachineBasicBlock *MBB = MI.getParent();
389         BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
390             .addReg(SrcReg)
391             .addImm(Imm0)
392             .addImm(12);
393         BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
394             .addReg(NewTmpReg)
395             .addImm(Imm1)
396             .addImm(0);
397       });
398 }
399 
400 template <typename T>
401 bool AArch64MIPeepholeOpt::visitADDSSUBS(
402     OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
403   // Try the same transformation as ADDSUB but with additional requirement
404   // that the condition code usages are only for Equal and Not Equal
405 
406   if (MI.getOperand(1).getReg() == AArch64::XZR ||
407       MI.getOperand(1).getReg() == AArch64::WZR)
408     return false;
409 
410   return splitTwoPartImm<T>(
411       MI,
412       [PosOpcs, NegOpcs, &MI, &TRI = TRI,
413        &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
414                    T &Imm1) -> std::optional<OpcodePair> {
415         OpcodePair OP;
416         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
417           OP = PosOpcs;
418         else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
419           OP = NegOpcs;
420         else
421           return std::nullopt;
422         // Check conditional uses last since it is expensive for scanning
423         // proceeding instructions
424         MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
425         std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
426         if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
427           return std::nullopt;
428         return OP;
429       },
430       [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
431                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
432                    Register NewDstReg) {
433         DebugLoc DL = MI.getDebugLoc();
434         MachineBasicBlock *MBB = MI.getParent();
435         BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
436             .addReg(SrcReg)
437             .addImm(Imm0)
438             .addImm(12);
439         BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
440             .addReg(NewTmpReg)
441             .addImm(Imm1)
442             .addImm(0);
443       });
444 }
445 
446 // Checks if the corresponding MOV immediate instruction is applicable for
447 // this peephole optimization.
448 bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
449                                             MachineInstr *&MovMI,
450                                             MachineInstr *&SubregToRegMI) {
451   // Check whether current MBB is in loop and the AND is loop invariant.
452   MachineBasicBlock *MBB = MI.getParent();
453   MachineLoop *L = MLI->getLoopFor(MBB);
454   if (L && !L->isLoopInvariant(MI))
455     return false;
456 
457   // Check whether current MI's operand is MOV with immediate.
458   MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
459   if (!MovMI)
460     return false;
461 
462   // If it is SUBREG_TO_REG, check its operand.
463   SubregToRegMI = nullptr;
464   if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
465     SubregToRegMI = MovMI;
466     MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
467     if (!MovMI)
468       return false;
469   }
470 
471   if (MovMI->getOpcode() != AArch64::MOVi32imm &&
472       MovMI->getOpcode() != AArch64::MOVi64imm)
473     return false;
474 
475   // If the MOV has multiple uses, do not split the immediate because it causes
476   // more instructions.
477   if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
478     return false;
479   if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
480     return false;
481 
482   // It is OK to perform this peephole optimization.
483   return true;
484 }
485 
486 template <typename T>
487 bool AArch64MIPeepholeOpt::splitTwoPartImm(
488     MachineInstr &MI,
489     SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
490   unsigned RegSize = sizeof(T) * 8;
491   assert((RegSize == 32 || RegSize == 64) &&
492          "Invalid RegSize for legal immediate peephole optimization");
493 
494   // Perform several essential checks against current MI.
495   MachineInstr *MovMI, *SubregToRegMI;
496   if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
497     return false;
498 
499   // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
500   T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
501   // For the 32 bit form of instruction, the upper 32 bits of the destination
502   // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
503   // of Imm to zero. This is essential if the Immediate value was a negative
504   // number since it was sign extended when we assign to the 64-bit Imm.
505   if (SubregToRegMI)
506     Imm &= 0xFFFFFFFF;
507   OpcodePair Opcode;
508   if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
509     Opcode = *R;
510   else
511     return false;
512 
513   // Create new MIs using the first and second opcodes. Opcodes might differ for
514   // flag setting operations that should only set flags on second instruction.
515   // NewTmpReg = Opcode.first SrcReg Imm0
516   // NewDstReg = Opcode.second NewTmpReg Imm1
517 
518   // Determine register classes for destinations and register operands
519   MachineFunction *MF = MI.getMF();
520   const TargetRegisterClass *FirstInstrDstRC =
521       TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
522   const TargetRegisterClass *FirstInstrOperandRC =
523       TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
524   const TargetRegisterClass *SecondInstrDstRC =
525       (Opcode.first == Opcode.second)
526           ? FirstInstrDstRC
527           : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
528   const TargetRegisterClass *SecondInstrOperandRC =
529       (Opcode.first == Opcode.second)
530           ? FirstInstrOperandRC
531           : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
532 
533   // Get old registers destinations and new register destinations
534   Register DstReg = MI.getOperand(0).getReg();
535   Register SrcReg = MI.getOperand(1).getReg();
536   Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
537   // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
538   // reuse that same destination register.
539   Register NewDstReg = DstReg.isVirtual()
540                            ? MRI->createVirtualRegister(SecondInstrDstRC)
541                            : DstReg;
542 
543   // Constrain registers based on their new uses
544   MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
545   MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
546   if (DstReg != NewDstReg)
547     MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
548 
549   // Call the delegating operation to build the instruction
550   BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
551 
552   // replaceRegWith changes MI's definition register. Keep it for SSA form until
553   // deleting MI. Only if we made a new destination register.
554   if (DstReg != NewDstReg) {
555     MRI->replaceRegWith(DstReg, NewDstReg);
556     MI.getOperand(0).setReg(DstReg);
557   }
558 
559   // Record the MIs need to be removed.
560   MI.eraseFromParent();
561   if (SubregToRegMI)
562     SubregToRegMI->eraseFromParent();
563   MovMI->eraseFromParent();
564 
565   return true;
566 }
567 
568 bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
569   // Check if this INSvi[X]gpr comes from COPY of a source FPR128
570   //
571   // From
572   //  %intermediate1:gpr64 = COPY %src:fpr128
573   //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
574   //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
575   // To
576   //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
577   //  src_index
578   // where src_index = 0, X = [8|16|32|64]
579 
580   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
581 
582   // For a chain of COPY instructions, find the initial source register
583   // and check if it's an FPR128
584   while (true) {
585     if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
586       return false;
587 
588     if (!SrcMI->getOperand(1).getReg().isVirtual())
589       return false;
590 
591     if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
592         &AArch64::FPR128RegClass) {
593       break;
594     }
595     SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
596   }
597 
598   Register DstReg = MI.getOperand(0).getReg();
599   Register SrcReg = SrcMI->getOperand(1).getReg();
600   MachineInstr *INSvilaneMI =
601       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
602           .add(MI.getOperand(1))
603           .add(MI.getOperand(2))
604           .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
605           .addImm(0);
606 
607   LLVM_DEBUG(dbgs() << MI << "  replace by:\n: " << *INSvilaneMI << "\n");
608   (void)INSvilaneMI;
609   MI.eraseFromParent();
610   return true;
611 }
612 
613 // All instructions that set a FPR64 will implicitly zero the top bits of the
614 // register.
615 static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
616                                         MachineRegisterInfo *MRI) {
617   if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
618     return false;
619   const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
620   if (RC != &AArch64::FPR64RegClass)
621     return false;
622   return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
623 }
624 
625 bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
626   // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
627   // We are expecting below case.
628   //
629   //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
630   //  %6:fpr128 = IMPLICIT_DEF
631   //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
632   //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
633   MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
634   if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
635     return false;
636   Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
637   if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
638     return false;
639 
640   // Check there is `mov 0` MI for high 64-bits.
641   // We are expecting below cases.
642   //
643   //  %2:fpr64 = MOVID 0
644   //  %4:fpr128 = IMPLICIT_DEF
645   //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
646   //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
647   // or
648   //  %5:fpr128 = MOVIv2d_ns 0
649   //  %6:fpr64 = COPY %5.dsub:fpr128
650   //  %8:fpr128 = IMPLICIT_DEF
651   //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
652   //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
653   MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
654   if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
655     return false;
656   High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
657   if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
658     High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
659   if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
660                     High64MI->getOpcode() != AArch64::MOVIv2d_ns))
661     return false;
662   if (High64MI->getOperand(1).getImm() != 0)
663     return false;
664 
665   // Let's remove MIs for high 64-bits.
666   Register OldDef = MI.getOperand(0).getReg();
667   Register NewDef = MI.getOperand(1).getReg();
668   MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
669   MRI->replaceRegWith(OldDef, NewDef);
670   MI.eraseFromParent();
671 
672   return true;
673 }
674 
675 bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
676   // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
677   MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
678   if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
679     return false;
680 
681   // Let's remove MIs for high 64-bits.
682   Register OldDef = MI.getOperand(0).getReg();
683   Register NewDef = MI.getOperand(1).getReg();
684   LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
685   MRI->clearKillFlags(OldDef);
686   MRI->clearKillFlags(NewDef);
687   MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
688   MRI->replaceRegWith(OldDef, NewDef);
689   MI.eraseFromParent();
690 
691   return true;
692 }
693 
694 // Across a basic-block we might have in i32 extract from a value that only
695 // operates on upper bits (for example a sxtw). We can replace the COPY with a
696 // new version skipping the sxtw.
697 bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
698   Register InputReg = MI.getOperand(1).getReg();
699   if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
700       !MRI->hasOneNonDBGUse(InputReg))
701     return false;
702 
703   MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
704   SmallPtrSet<MachineInstr *, 4> DeadInstrs;
705   DeadInstrs.insert(SrcMI);
706   while (SrcMI && SrcMI->isFullCopy() &&
707          MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {
708     SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
709     DeadInstrs.insert(SrcMI);
710   }
711 
712   if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
713       SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
714     return false;
715 
716   Register SrcReg = SrcMI->getOperand(1).getReg();
717   MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
718   LLVM_DEBUG(dbgs() << "Optimizing: " << MI);
719   MI.getOperand(1).setReg(SrcReg);
720   LLVM_DEBUG(dbgs() << "        to: " << MI);
721   for (auto *DeadMI : DeadInstrs) {
722     LLVM_DEBUG(dbgs() << "  Removing: " << *DeadMI);
723     DeadMI->eraseFromParent();
724   }
725   return true;
726 }
727 
728 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
729   if (skipFunction(MF.getFunction()))
730     return false;
731 
732   TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
733   TRI = static_cast<const AArch64RegisterInfo *>(
734       MF.getSubtarget().getRegisterInfo());
735   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
736   MRI = &MF.getRegInfo();
737 
738   assert(MRI->isSSA() && "Expected to be run on SSA form!");
739 
740   bool Changed = false;
741 
742   for (MachineBasicBlock &MBB : MF) {
743     for (MachineInstr &MI : make_early_inc_range(MBB)) {
744       switch (MI.getOpcode()) {
745       default:
746         break;
747       case AArch64::INSERT_SUBREG:
748         Changed |= visitINSERT(MI);
749         break;
750       case AArch64::ANDWrr:
751         Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
752         break;
753       case AArch64::ANDXrr:
754         Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
755         break;
756       case AArch64::ORRWrs:
757         Changed |= visitORR(MI);
758         break;
759       case AArch64::ADDWrr:
760         Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
761         break;
762       case AArch64::SUBWrr:
763         Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
764         break;
765       case AArch64::ADDXrr:
766         Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
767         break;
768       case AArch64::SUBXrr:
769         Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
770         break;
771       case AArch64::ADDSWrr:
772         Changed |=
773             visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
774                                     {AArch64::SUBWri, AArch64::SUBSWri}, MI);
775         break;
776       case AArch64::SUBSWrr:
777         Changed |=
778             visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
779                                     {AArch64::ADDWri, AArch64::ADDSWri}, MI);
780         break;
781       case AArch64::ADDSXrr:
782         Changed |=
783             visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
784                                     {AArch64::SUBXri, AArch64::SUBSXri}, MI);
785         break;
786       case AArch64::SUBSXrr:
787         Changed |=
788             visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
789                                     {AArch64::ADDXri, AArch64::ADDSXri}, MI);
790         break;
791       case AArch64::INSvi64gpr:
792         Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
793         break;
794       case AArch64::INSvi32gpr:
795         Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
796         break;
797       case AArch64::INSvi16gpr:
798         Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
799         break;
800       case AArch64::INSvi8gpr:
801         Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
802         break;
803       case AArch64::INSvi64lane:
804         Changed |= visitINSvi64lane(MI);
805         break;
806       case AArch64::FMOVDr:
807         Changed |= visitFMOVDr(MI);
808         break;
809       case AArch64::COPY:
810         Changed |= visitCopy(MI);
811         break;
812       }
813     }
814   }
815 
816   return Changed;
817 }
818 
819 FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
820   return new AArch64MIPeepholeOpt();
821 }
822