//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;

namespace {
class AArch64PostSelectOptimize : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostSelectOptimize() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AArch64 Post Select Optimizer";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool optimizeNZCVDefs(MachineBasicBlock &MBB);
  bool doPeepholeOpts(MachineBasicBlock &MBB);
  /// Look for cross regclass copies that can be trivially eliminated.
  bool foldSimpleCrossClassCopies(MachineInstr &MI);
  bool foldCopyDup(MachineInstr &MI);
};
} // end anonymous namespace

void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

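/// Map a flag-setting opcode to its non-flag-setting equivalent, e.g.
/// SUBSWrr -> SUBWrr, or return 0 if there is none.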
unsigned getNonFlagSettingVariant(unsigned Opc) {
  switch (Opc) {
  default:
    return 0;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSXrs:
    return AArch64::SUBXrs;
  case AArch64::SUBSWrs:
    return AArch64::SUBWrs;
  case AArch64::SUBSXri:
    return AArch64::SUBXri;
  case AArch64::SUBSWri:
    return AArch64::SUBWri;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSXrs:
    return AArch64::ADDXrs;
  case AArch64::ADDSWrs:
    return AArch64::ADDWrs;
  case AArch64::ADDSXri:
    return AArch64::ADDXri;
  case AArch64::ADDSWri:
    return AArch64::ADDWri;
  case AArch64::SBCSXr:
    return AArch64::SBCXr;
  case AArch64::SBCSWr:
    return AArch64::SBCWr;
  case AArch64::ADCSXr:
    return AArch64::ADCXr;
  case AArch64::ADCSWr:
    return AArch64::ADCWr;
  }
}

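// Run the per-instruction peephole folds below (cross-regclass COPY
// elimination and COPY-of-DUP folding) across the whole block.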
bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (auto &MI : make_early_inc_range(MBB)) {
    bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
    if (!CurrentIterChanged)
      CurrentIterChanged |= foldCopyDup(MI);
    Changed |= CurrentIterChanged;
  }
  return Changed;
}

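// Fold a COPY between two different register classes when one class is a
// subclass of the other: uses of the destination are rewritten to use the
// source register directly (constraining the source to the smaller class
// where necessary) and the COPY is deleted.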
bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();

  if (!MI.isCopy())
    return false;

  if (MI.getOperand(1).getSubReg())
    return false; // Don't deal with subreg copies

  Register Src = MI.getOperand(1).getReg();
  Register Dst = MI.getOperand(0).getReg();

  if (Src.isPhysical() || Dst.isPhysical())
    return false;

  const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

  if (SrcRC == DstRC)
    return false;

  if (SrcRC->hasSubClass(DstRC)) {
    // This is the case where the source class is a superclass of the dest, so
    // if the copy is the only user of the source, we can just constrain the
    // source reg to the dest class.

    if (!MRI.hasOneNonDBGUse(Src))
      return false; // Only constrain single uses of the source.

    // Constrain to dst reg class as long as it's not a weird class that only
    // has a few registers.
    if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
      return false;
  } else if (DstRC->hasSubClass(SrcRC)) {
    // This is the inverse case, where the destination class is a superclass of
    // the source. Here, if the copy is the only user, we can just constrain
    // the user of the copy to use the smaller class of the source.
  } else {
    return false;
  }

  MRI.replaceRegWith(Dst, Src);
  MI.eraseFromParent();
  return true;
}

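// Fold a GPR COPY of the scalar result of a DUP lane extract into a single
// UMOV, moving the lane straight into the GPR.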
bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
  if (!MI.isCopy())
    return false;

  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();
  auto *TII = MF->getSubtarget().getInstrInfo();

  // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
  // Here Dst is y and Src is the result of DUP.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (!Dst.isVirtual() || !Src.isVirtual())
    return false;

  auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
                         const TargetRegisterClass *FPRRegClass, unsigned DUP,
                         unsigned UMOV) {
    if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
        MRI.getRegClassOrNull(Src) != FPRRegClass)
      return false;

    // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
    // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
    // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
    // not worthwhile in that case.
    for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
      if (!Use.isCopy())
        continue;

      Register UseOp0 = Use.getOperand(0).getReg();
      Register UseOp1 = Use.getOperand(1).getReg();
      if (UseOp0.isPhysical() || UseOp1.isPhysical())
        return false;

      if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
          MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
        return false;
    }

    MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
    if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
      return false;

    Register DupSrc = SrcMI->getOperand(1).getReg();
    int64_t DupImm = SrcMI->getOperand(2).getImm();

    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
        .addReg(DupSrc)
        .addImm(DupImm);
    SrcMI->eraseFromParent();
    MI.eraseFromParent();
    return true;
  };

  return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
                     AArch64::DUPi32, AArch64::UMOVvi32) ||
         TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
                     AArch64::DUPi64, AArch64::UMOVvi64);
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  // If we find a dead NZCV implicit-def, we
  // - try to convert the operation to a non-flag-setting equivalent
  // - or mark the def as dead to aid later peephole optimizations.

  // Use cases:
  // 1)
  // Consider the following code:
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
  // when we have a single IR fcmp being used by two selects. During selection,
  // to ensure that there can be no clobbering of nzcv between the fcmp and the
  // csel, we have to generate an fcmp immediately before each csel is
  // selected.
  // However, often we can essentially CSE these together later in MachineCSE.
  // This doesn't work though if there are unrelated flag-setting instructions
  // in between the two FCMPs. In this case, the SUBS defines NZCV
  // but it doesn't have any users, being overwritten by the second FCMP.
  //
  // 2)
  // The instruction selector always emits the flag-setting variant of ADC/SBC
  // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
  // instructions is never used, we can switch to the non-flag-setting variant.

  bool Changed = false;
  auto &MF = *MBB.getParent();
  auto &Subtarget = MF.getSubtarget();
  const auto &TII = Subtarget.getInstrInfo();
  auto TRI = Subtarget.getRegisterInfo();
  auto RBI = Subtarget.getRegBankInfo();
  auto &MRI = MF.getRegInfo();

  LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
  LRU.addLiveOuts(MBB);

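  // Walk the block bottom-up, tracking liveness so that at each instruction we
  // know whether any later instruction reads NZCV.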
  for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
    bool NZCVDead = LRU.available(AArch64::NZCV);
    if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
      // The instruction defines NZCV, but NZCV is dead.
      unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
      int DeadNZCVIdx =
          II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
      if (DeadNZCVIdx != -1) {
        if (NewOpc) {
          // If there is an equivalent non-flag-setting op, we convert.
          LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
                               "op: "
                            << II);
          II.setDesc(TII->get(NewOpc));
          II.removeOperand(DeadNZCVIdx);
          // Changing the opcode can result in differing regclass requirements,
          // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
          // Constrain the regclasses, possibly introducing a copy.
          constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
                                   II.getOperand(0), 0);
          Changed |= true;
        } else {
          // Otherwise, we just set the nzcv imp-def operand to be dead, so the
          // peephole optimizations can optimize them further.
          II.getOperand(DeadNZCVIdx).setIsDead();
        }
      }
    }
    LRU.stepBackward(II);
  }
  return Changed;
}

bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  assert(MF.getProperties().hasSelected() && "Expected a selected MF");

  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= optimizeNZCVDefs(BB);
    Changed |= doPeepholeOpts(BB);
  }
  return Changed;
}

char AArch64PostSelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions",
                      false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostSelectOptimize() {
  return new AArch64PostSelectOptimize();
}
} // end namespace llvm