xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp (revision 258a0d760aa8b42899a000e30f610f900a402556)
1 //=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does post-instruction-selection optimizations in the GlobalISel
10 // pipeline, before the rest of codegen runs.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64.h"
15 #include "AArch64TargetMachine.h"
16 #include "MCTargetDesc/AArch64MCTargetDesc.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/Utils.h"
19 #include "llvm/CodeGen/MachineBasicBlock.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/CodeGen/MachineInstr.h"
22 #include "llvm/CodeGen/MachineOperand.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/Support/Debug.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define DEBUG_TYPE "aarch64-post-select-optimize"
28 
29 using namespace llvm;
30 
31 namespace {
32 class AArch64PostSelectOptimize : public MachineFunctionPass {
33 public:
34   static char ID;
35 
36   AArch64PostSelectOptimize();
37 
38   StringRef getPassName() const override {
39     return "AArch64 Post Select Optimizer";
40   }
41 
42   bool runOnMachineFunction(MachineFunction &MF) override;
43 
44   void getAnalysisUsage(AnalysisUsage &AU) const override;
45 
46 private:
47   bool optimizeNZCVDefs(MachineBasicBlock &MBB);
48   bool doPeepholeOpts(MachineBasicBlock &MBB);
49   /// Look for cross regclass copies that can be trivially eliminated.
50   bool foldSimpleCrossClassCopies(MachineInstr &MI);
51 };
52 } // end anonymous namespace
53 
54 void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
55   AU.addRequired<TargetPassConfig>();
56   AU.setPreservesCFG();
57   getSelectionDAGFallbackAnalysisUsage(AU);
58   MachineFunctionPass::getAnalysisUsage(AU);
59 }
60 
61 AArch64PostSelectOptimize::AArch64PostSelectOptimize()
62     : MachineFunctionPass(ID) {
63   initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
64 }
65 
66 unsigned getNonFlagSettingVariant(unsigned Opc) {
67   switch (Opc) {
68   default:
69     return 0;
70   case AArch64::SUBSXrr:
71     return AArch64::SUBXrr;
72   case AArch64::SUBSWrr:
73     return AArch64::SUBWrr;
74   case AArch64::SUBSXrs:
75     return AArch64::SUBXrs;
76   case AArch64::SUBSXri:
77     return AArch64::SUBXri;
78   case AArch64::SUBSWri:
79     return AArch64::SUBWri;
80   }
81 }
82 
83 bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
84   bool Changed = false;
85   for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
86     Changed |= foldSimpleCrossClassCopies(MI);
87   }
88   return Changed;
89 }
90 
91 bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
92   auto *MF = MI.getMF();
93   auto &MRI = MF->getRegInfo();
94 
95   if (!MI.isCopy())
96     return false;
97 
98   if (MI.getOperand(1).getSubReg())
99     return false; // Don't deal with subreg copies
100 
101   Register Src = MI.getOperand(1).getReg();
102   Register Dst = MI.getOperand(0).getReg();
103 
104   if (Src.isPhysical() || Dst.isPhysical())
105     return false;
106 
107   const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
108   const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
109 
110   if (SrcRC == DstRC)
111     return false;
112 
113 
114   if (SrcRC->hasSubClass(DstRC)) {
115     // This is the case where the source class is a superclass of the dest, so
116     // if the copy is the only user of the source, we can just constrain the
117     // source reg to the dest class.
118 
119     if (!MRI.hasOneNonDBGUse(Src))
120       return false; // Only constrain single uses of the source.
121 
122     // Constrain to dst reg class as long as it's not a weird class that only
123     // has a few registers.
124     if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
125       return false;
126   } else if (DstRC->hasSubClass(SrcRC)) {
127     // This is the inverse case, where the destination class is a superclass of
128     // the source. Here, if the copy is the only user, we can just constrain
129     // the user of the copy to use the smaller class of the source.
130   } else {
131     return false;
132   }
133 
134   MRI.replaceRegWith(Dst, Src);
135   MI.eraseFromParent();
136   return true;
137 }
138 
139 bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
140   // Consider the following code:
141   //  FCMPSrr %0, %1, implicit-def $nzcv
142   //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
143   //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
144   //  FCMPSrr %0, %1, implicit-def $nzcv
145   //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
146   // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
147   // when we have a single IR fcmp being used by two selects. During selection,
148   // to ensure that there can be no clobbering of nzcv between the fcmp and the
149   // csel, we have to generate an fcmp immediately before each csel is
150   // selected.
151   // However, often we can essentially CSE these together later in MachineCSE.
152   // This doesn't work though if there are unrelated flag-setting instructions
153   // in between the two FCMPs. In this case, the SUBS defines NZCV
154   // but it doesn't have any users, being overwritten by the second FCMP.
155   //
156   // Our solution here is to try to convert flag setting operations between
157   // a interval of identical FCMPs, so that CSE will be able to eliminate one.
158   bool Changed = false;
159   auto &MF = *MBB.getParent();
160   auto &Subtarget = MF.getSubtarget();
161   const auto &TII = Subtarget.getInstrInfo();
162   auto TRI = Subtarget.getRegisterInfo();
163   auto RBI = Subtarget.getRegBankInfo();
164   auto &MRI = MF.getRegInfo();
165 
166   // The first step is to find the first and last FCMPs. If we have found
167   // at least two, then set the limit of the bottom-up walk to the first FCMP
168   // found since we're only interested in dealing with instructions between
169   // them.
170   MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr;
171   for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) {
172     if (MI.getOpcode() == AArch64::FCMPSrr ||
173         MI.getOpcode() == AArch64::FCMPDrr) {
174       if (!FirstCmp)
175         FirstCmp = &MI;
176       else
177         LastCmp = &MI;
178     }
179   }
180 
181   // In addition to converting flag-setting ops in fcmp ranges into non-flag
182   // setting ops, across the whole basic block we also detect when nzcv
183   // implicit-defs are dead, and mark them as dead. Peephole optimizations need
184   // this information later.
185 
186   LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
187   LRU.addLiveOuts(MBB);
188   bool NZCVDead = LRU.available(AArch64::NZCV);
189   bool InsideCmpRange = false;
190   for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
191     LRU.stepBackward(II);
192 
193     if (LastCmp) { // There's a range present in this block.
194       // If we're inside an fcmp range, look for begin instruction.
195       if (InsideCmpRange && &II == FirstCmp)
196         InsideCmpRange = false;
197       else if (&II == LastCmp)
198         InsideCmpRange = true;
199     }
200 
201     // Did this instruction define NZCV?
202     bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV);
203     if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) {
204       // If we have a def and NZCV is dead, then we may convert this op.
205       unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
206       int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV);
207       if (DeadNZCVIdx != -1) {
208         // If we're inside an fcmp range, then convert flag setting ops.
209         if (InsideCmpRange && NewOpc) {
210           LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
211                                "op in fcmp range: "
212                             << II);
213           II.setDesc(TII->get(NewOpc));
214           II.removeOperand(DeadNZCVIdx);
215           // Changing the opcode can result in differing regclass requirements,
216           // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
217           // Constrain the regclasses, possibly introducing a copy.
218           constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
219                                    II.getOperand(0), 0);
220           Changed |= true;
221         } else {
222           // Otherwise, we just set the nzcv imp-def operand to be dead, so the
223           // peephole optimizations can optimize them further.
224           II.getOperand(DeadNZCVIdx).setIsDead();
225         }
226       }
227     }
228 
229     NZCVDead = NZCVDeadAtCurrInstr;
230   }
231   return Changed;
232 }
233 
234 bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
235   if (MF.getProperties().hasProperty(
236           MachineFunctionProperties::Property::FailedISel))
237     return false;
238   assert(MF.getProperties().hasProperty(
239              MachineFunctionProperties::Property::Selected) &&
240          "Expected a selected MF");
241 
242   bool Changed = false;
243   for (auto &BB : MF) {
244     Changed |= optimizeNZCVDefs(BB);
245     Changed |= doPeepholeOpts(BB);
246   }
247   return Changed;
248 }
249 
// Definition of the static pass identifier declared in the class; the
// constructor passes it to the MachineFunctionPass base.
char AArch64PostSelectOptimize::ID = 0;
// Pass registration, keyed by DEBUG_TYPE. No dependency entries are listed
// between the BEGIN and END macros. The constructor calls the
// initializeAArch64PostSelectOptimizePass function.
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions",
                      false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false,
                    false)
257 
258 namespace llvm {
259 FunctionPass *createAArch64PostSelectOptimize() {
260   return new AArch64PostSelectOptimize();
261 }
262 } // end namespace llvm
263