xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp (revision f126d349810fdb512c0b01e101342d430b947488)
1 //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs below peephole optimizations on MIR level.
10 //
11 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13 //
14 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
15 //    MOVi64imm + ADDXrr ==> ANDXri + ANDXri
16 //
17 // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
18 //    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19 //
20 //    The mov pseudo instruction could be expanded to multiple mov instructions
21 //    later. In this case, we could try to split the constant  operand of mov
22 //    instruction into two immediates which can be directly encoded into
23 //    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24 //    multiple `mov` + `and/add/sub` instructions.
25 //
26 // 4. Remove redundant ORRWrs which is generated by zero-extend.
27 //
28 //    %3:gpr32 = ORRWrs $wzr, %2, 0
29 //    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30 //
31 //    If AArch64's 32-bit form of instruction defines the source operand of
32 //    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33 //    operand are set to zero.
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #include "AArch64ExpandImm.h"
38 #include "AArch64InstrInfo.h"
39 #include "MCTargetDesc/AArch64AddressingModes.h"
40 #include "llvm/ADT/Optional.h"
41 #include "llvm/ADT/SetVector.h"
42 #include "llvm/CodeGen/MachineDominators.h"
43 #include "llvm/CodeGen/MachineLoopInfo.h"
44 
45 using namespace llvm;
46 
47 #define DEBUG_TYPE "aarch64-mi-peephole-opt"
48 
49 namespace {
50 
51 struct AArch64MIPeepholeOpt : public MachineFunctionPass {
52   static char ID;
53 
54   AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
55     initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
56   }
57 
58   const AArch64InstrInfo *TII;
59   const AArch64RegisterInfo *TRI;
60   MachineLoopInfo *MLI;
61   MachineRegisterInfo *MRI;
62 
63   template <typename T>
64   using SplitAndOpcFunc =
65       std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
66   using BuildMIFunc =
67       std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
68                          Register, Register)>;
69 
70   /// For instructions where an immediate operand could be split into two
71   /// separate immediate instructions, use the splitTwoPartImm two handle the
72   /// optimization.
73   ///
74   /// To implement, the following function types must be passed to
75   /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
76   /// splitting the immediate is valid and returns the associated new opcode. A
77   /// BuildMIFunc must be implemented to build the two immediate instructions.
78   ///
79   /// Example Pattern (where IMM would require 2+ MOV instructions):
80   ///     %dst = <Instr>rr %src IMM [...]
81   /// becomes:
82   ///     %tmp = <Instr>ri %src (encode half IMM) [...]
83   ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
84   template <typename T>
85   bool splitTwoPartImm(MachineInstr &MI,
86                        SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
87                        SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
88 
89   bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
90                         MachineInstr *&SubregToRegMI);
91 
92   template <typename T>
93   bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
94                    SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
95   template <typename T>
96   bool visitAND(unsigned Opc, MachineInstr &MI,
97                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
98   bool visitORR(MachineInstr &MI,
99                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
100   bool runOnMachineFunction(MachineFunction &MF) override;
101 
102   StringRef getPassName() const override {
103     return "AArch64 MI Peephole Optimization pass";
104   }
105 
106   void getAnalysisUsage(AnalysisUsage &AU) const override {
107     AU.setPreservesCFG();
108     AU.addRequired<MachineLoopInfo>();
109     MachineFunctionPass::getAnalysisUsage(AU);
110   }
111 };
112 
113 char AArch64MIPeepholeOpt::ID = 0;
114 
115 } // end anonymous namespace
116 
117 INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
118                 "AArch64 MI Peephole Optimization", false, false)
119 
120 template <typename T>
121 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
122   T UImm = static_cast<T>(Imm);
123   if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
124     return false;
125 
126   // If this immediate can be handled by one instruction, do not split it.
127   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
128   AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
129   if (Insn.size() == 1)
130     return false;
131 
132   // The bitmask immediate consists of consecutive ones.  Let's say there is
133   // constant 0b00000000001000000000010000000000 which does not consist of
134   // consecutive ones. We can split it in to two bitmask immediate like
135   // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
136   // If we do AND with these two bitmask immediate, we can see original one.
137   unsigned LowestBitSet = countTrailingZeros(UImm);
138   unsigned HighestBitSet = Log2_64(UImm);
139 
140   // Create a mask which is filled with one from the position of lowest bit set
141   // to the position of highest bit set.
142   T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
143               (static_cast<T>(1) << LowestBitSet);
144   // Create a mask which is filled with one outside the position of lowest bit
145   // set and the position of highest bit set.
146   T NewImm2 = UImm | ~NewImm1;
147 
148   // If the split value is not valid bitmask immediate, do not split this
149   // constant.
150   if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
151     return false;
152 
153   Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
154   Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
155   return true;
156 }
157 
158 template <typename T>
159 bool AArch64MIPeepholeOpt::visitAND(
160     unsigned Opc, MachineInstr &MI,
161     SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
162   // Try below transformation.
163   //
164   // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
165   // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
166   //
167   // The mov pseudo instruction could be expanded to multiple mov instructions
168   // later. Let's try to split the constant operand of mov instruction into two
169   // bitmask immediates. It makes only two AND instructions intead of multiple
170   // mov + and instructions.
171 
172   return splitTwoPartImm<T>(
173       MI, ToBeRemoved,
174       [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
175         if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
176           return Opc;
177         return None;
178       },
179       [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
180                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
181                    Register NewDstReg) {
182         DebugLoc DL = MI.getDebugLoc();
183         MachineBasicBlock *MBB = MI.getParent();
184         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
185             .addReg(SrcReg)
186             .addImm(Imm0);
187         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
188             .addReg(NewTmpReg)
189             .addImm(Imm1);
190       });
191 }
192 
193 bool AArch64MIPeepholeOpt::visitORR(
194     MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
195   // Check this ORR comes from below zero-extend pattern.
196   //
197   // def : Pat<(i64 (zext GPR32:$src)),
198   //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
199   if (MI.getOperand(3).getImm() != 0)
200     return false;
201 
202   if (MI.getOperand(1).getReg() != AArch64::WZR)
203     return false;
204 
205   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
206   if (!SrcMI)
207     return false;
208 
209   // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
210   //
211   // When you use the 32-bit form of an instruction, the upper 32 bits of the
212   // source registers are ignored and the upper 32 bits of the destination
213   // register are set to zero.
214   //
215   // If AArch64's 32-bit form of instruction defines the source operand of
216   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
217   // real AArch64 instruction and if it is not, do not process the opcode
218   // conservatively.
219   if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
220     return false;
221 
222   Register DefReg = MI.getOperand(0).getReg();
223   Register SrcReg = MI.getOperand(2).getReg();
224   MRI->replaceRegWith(DefReg, SrcReg);
225   MRI->clearKillFlags(SrcReg);
226   // replaceRegWith changes MI's definition register. Keep it for SSA form until
227   // deleting MI.
228   MI.getOperand(0).setReg(DefReg);
229   ToBeRemoved.insert(&MI);
230 
231   LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
232 
233   return true;
234 }
235 
236 template <typename T>
237 static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
238   // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
239   // imm0 and imm1 are non-zero 12-bit unsigned int.
240   if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
241       (Imm & ~static_cast<T>(0xffffff)) != 0)
242     return false;
243 
244   // The immediate can not be composed via a single instruction.
245   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
246   AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
247   if (Insn.size() == 1)
248     return false;
249 
250   // Split Imm into (Imm0 << 12) + Imm1;
251   Imm0 = (Imm >> 12) & 0xfff;
252   Imm1 = Imm & 0xfff;
253   return true;
254 }
255 
256 template <typename T>
257 bool AArch64MIPeepholeOpt::visitADDSUB(
258     unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
259     SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
260   // Try below transformation.
261   //
262   // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
263   // MOVi64imm + ADDXrr ==> ADDXri + ADDXri
264   //
265   // MOVi32imm + SUBWrr ==> SUBWri + SUBWri
266   // MOVi64imm + SUBXrr ==> SUBXri + SUBXri
267   //
268   // The mov pseudo instruction could be expanded to multiple mov instructions
269   // later. Let's try to split the constant operand of mov instruction into two
270   // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
271   // multiple `mov` + `and/sub` instructions.
272 
273   return splitTwoPartImm<T>(
274       MI, ToBeRemoved,
275       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
276                        T &Imm1) -> Optional<unsigned> {
277         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
278           return PosOpc;
279         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
280           return NegOpc;
281         return None;
282       },
283       [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
284                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
285                    Register NewDstReg) {
286         DebugLoc DL = MI.getDebugLoc();
287         MachineBasicBlock *MBB = MI.getParent();
288         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
289             .addReg(SrcReg)
290             .addImm(Imm0)
291             .addImm(12);
292         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
293             .addReg(NewTmpReg)
294             .addImm(Imm1)
295             .addImm(0);
296       });
297 }
298 
299 // Checks if the corresponding MOV immediate instruction is applicable for
300 // this peephole optimization.
301 bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
302                                             MachineInstr *&MovMI,
303                                             MachineInstr *&SubregToRegMI) {
304   // Check whether current MBB is in loop and the AND is loop invariant.
305   MachineBasicBlock *MBB = MI.getParent();
306   MachineLoop *L = MLI->getLoopFor(MBB);
307   if (L && !L->isLoopInvariant(MI))
308     return false;
309 
310   // Check whether current MI's operand is MOV with immediate.
311   MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
312   if (!MovMI)
313     return false;
314 
315   // If it is SUBREG_TO_REG, check its operand.
316   SubregToRegMI = nullptr;
317   if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
318     SubregToRegMI = MovMI;
319     MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
320     if (!MovMI)
321       return false;
322   }
323 
324   if (MovMI->getOpcode() != AArch64::MOVi32imm &&
325       MovMI->getOpcode() != AArch64::MOVi64imm)
326     return false;
327 
328   // If the MOV has multiple uses, do not split the immediate because it causes
329   // more instructions.
330   if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
331     return false;
332   if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
333     return false;
334 
335   // It is OK to perform this peephole optimization.
336   return true;
337 }
338 
339 template <typename T>
340 bool AArch64MIPeepholeOpt::splitTwoPartImm(
341     MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
342     SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
343   unsigned RegSize = sizeof(T) * 8;
344   assert((RegSize == 32 || RegSize == 64) &&
345          "Invalid RegSize for legal immediate peephole optimization");
346 
347   // Perform several essential checks against current MI.
348   MachineInstr *MovMI, *SubregToRegMI;
349   if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
350     return false;
351 
352   // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
353   T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
354   // For the 32 bit form of instruction, the upper 32 bits of the destination
355   // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
356   // of Imm to zero. This is essential if the Immediate value was a negative
357   // number since it was sign extended when we assign to the 64-bit Imm.
358   if (SubregToRegMI)
359     Imm &= 0xFFFFFFFF;
360   unsigned Opcode;
361   if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
362     Opcode = R.getValue();
363   else
364     return false;
365 
366   // Create new ADD/SUB MIs.
367   MachineFunction *MF = MI.getMF();
368   const TargetRegisterClass *RC =
369       TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
370   const TargetRegisterClass *ORC =
371       TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
372   Register DstReg = MI.getOperand(0).getReg();
373   Register SrcReg = MI.getOperand(1).getReg();
374   Register NewTmpReg = MRI->createVirtualRegister(RC);
375   Register NewDstReg = MRI->createVirtualRegister(RC);
376 
377   MRI->constrainRegClass(SrcReg, RC);
378   MRI->constrainRegClass(NewTmpReg, ORC);
379   MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
380 
381   BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
382 
383   MRI->replaceRegWith(DstReg, NewDstReg);
384   // replaceRegWith changes MI's definition register. Keep it for SSA form until
385   // deleting MI.
386   MI.getOperand(0).setReg(DstReg);
387 
388   // Record the MIs need to be removed.
389   ToBeRemoved.insert(&MI);
390   if (SubregToRegMI)
391     ToBeRemoved.insert(SubregToRegMI);
392   ToBeRemoved.insert(MovMI);
393 
394   return true;
395 }
396 
397 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
398   if (skipFunction(MF.getFunction()))
399     return false;
400 
401   TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
402   TRI = static_cast<const AArch64RegisterInfo *>(
403       MF.getSubtarget().getRegisterInfo());
404   MLI = &getAnalysis<MachineLoopInfo>();
405   MRI = &MF.getRegInfo();
406 
407   assert(MRI->isSSA() && "Expected to be run on SSA form!");
408 
409   bool Changed = false;
410   SmallSetVector<MachineInstr *, 8> ToBeRemoved;
411 
412   for (MachineBasicBlock &MBB : MF) {
413     for (MachineInstr &MI : MBB) {
414       switch (MI.getOpcode()) {
415       default:
416         break;
417       case AArch64::ANDWrr:
418         Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
419         break;
420       case AArch64::ANDXrr:
421         Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
422         break;
423       case AArch64::ORRWrs:
424         Changed = visitORR(MI, ToBeRemoved);
425         break;
426       case AArch64::ADDWrr:
427         Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
428                                         ToBeRemoved);
429         break;
430       case AArch64::SUBWrr:
431         Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
432                                         ToBeRemoved);
433         break;
434       case AArch64::ADDXrr:
435         Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
436                                         ToBeRemoved);
437         break;
438       case AArch64::SUBXrr:
439         Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
440                                         ToBeRemoved);
441         break;
442       }
443     }
444   }
445 
446   for (MachineInstr *MI : ToBeRemoved)
447     MI->eraseFromParent();
448 
449   return Changed;
450 }
451 
452 FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
453   return new AArch64MIPeepholeOpt();
454 }
455