//===- AArch64PostSelectOptimize.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//
13
#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;
30
31 namespace {
32 class AArch64PostSelectOptimize : public MachineFunctionPass {
33 public:
34 static char ID;
35
36 AArch64PostSelectOptimize();
37
getPassName() const38 StringRef getPassName() const override {
39 return "AArch64 Post Select Optimizer";
40 }
41
42 bool runOnMachineFunction(MachineFunction &MF) override;
43
44 void getAnalysisUsage(AnalysisUsage &AU) const override;
45
46 private:
47 bool optimizeNZCVDefs(MachineBasicBlock &MBB);
48 bool doPeepholeOpts(MachineBasicBlock &MBB);
49 /// Look for cross regclass copies that can be trivially eliminated.
50 bool foldSimpleCrossClassCopies(MachineInstr &MI);
51 bool foldCopyDup(MachineInstr &MI);
52 };
53 } // end anonymous namespace
54
getAnalysisUsage(AnalysisUsage & AU) const55 void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
56 AU.addRequired<TargetPassConfig>();
57 AU.setPreservesCFG();
58 getSelectionDAGFallbackAnalysisUsage(AU);
59 MachineFunctionPass::getAnalysisUsage(AU);
60 }
61
AArch64PostSelectOptimize()62 AArch64PostSelectOptimize::AArch64PostSelectOptimize()
63 : MachineFunctionPass(ID) {
64 initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
65 }
66
getNonFlagSettingVariant(unsigned Opc)67 unsigned getNonFlagSettingVariant(unsigned Opc) {
68 switch (Opc) {
69 default:
70 return 0;
71 case AArch64::SUBSXrr:
72 return AArch64::SUBXrr;
73 case AArch64::SUBSWrr:
74 return AArch64::SUBWrr;
75 case AArch64::SUBSXrs:
76 return AArch64::SUBXrs;
77 case AArch64::SUBSWrs:
78 return AArch64::SUBWrs;
79 case AArch64::SUBSXri:
80 return AArch64::SUBXri;
81 case AArch64::SUBSWri:
82 return AArch64::SUBWri;
83 case AArch64::ADDSXrr:
84 return AArch64::ADDXrr;
85 case AArch64::ADDSWrr:
86 return AArch64::ADDWrr;
87 case AArch64::ADDSXrs:
88 return AArch64::ADDXrs;
89 case AArch64::ADDSWrs:
90 return AArch64::ADDWrs;
91 case AArch64::ADDSXri:
92 return AArch64::ADDXri;
93 case AArch64::ADDSWri:
94 return AArch64::ADDWri;
95 case AArch64::SBCSXr:
96 return AArch64::SBCXr;
97 case AArch64::SBCSWr:
98 return AArch64::SBCWr;
99 case AArch64::ADCSXr:
100 return AArch64::ADCXr;
101 case AArch64::ADCSWr:
102 return AArch64::ADCWr;
103 }
104 }
105
doPeepholeOpts(MachineBasicBlock & MBB)106 bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
107 bool Changed = false;
108 for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
109 bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
110 if (!CurrentIterChanged)
111 CurrentIterChanged |= foldCopyDup(MI);
112 Changed |= CurrentIterChanged;
113 }
114 return Changed;
115 }
116
foldSimpleCrossClassCopies(MachineInstr & MI)117 bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
118 auto *MF = MI.getMF();
119 auto &MRI = MF->getRegInfo();
120
121 if (!MI.isCopy())
122 return false;
123
124 if (MI.getOperand(1).getSubReg())
125 return false; // Don't deal with subreg copies
126
127 Register Src = MI.getOperand(1).getReg();
128 Register Dst = MI.getOperand(0).getReg();
129
130 if (Src.isPhysical() || Dst.isPhysical())
131 return false;
132
133 const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
134 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
135
136 if (SrcRC == DstRC)
137 return false;
138
139
140 if (SrcRC->hasSubClass(DstRC)) {
141 // This is the case where the source class is a superclass of the dest, so
142 // if the copy is the only user of the source, we can just constrain the
143 // source reg to the dest class.
144
145 if (!MRI.hasOneNonDBGUse(Src))
146 return false; // Only constrain single uses of the source.
147
148 // Constrain to dst reg class as long as it's not a weird class that only
149 // has a few registers.
150 if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
151 return false;
152 } else if (DstRC->hasSubClass(SrcRC)) {
153 // This is the inverse case, where the destination class is a superclass of
154 // the source. Here, if the copy is the only user, we can just constrain
155 // the user of the copy to use the smaller class of the source.
156 } else {
157 return false;
158 }
159
160 MRI.replaceRegWith(Dst, Src);
161 MI.eraseFromParent();
162 return true;
163 }
164
foldCopyDup(MachineInstr & MI)165 bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
166 if (!MI.isCopy())
167 return false;
168
169 auto *MF = MI.getMF();
170 auto &MRI = MF->getRegInfo();
171 auto *TII = MF->getSubtarget().getInstrInfo();
172
173 // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
174 // Here Dst is y and Src is the result of DUP.
175 Register Dst = MI.getOperand(0).getReg();
176 Register Src = MI.getOperand(1).getReg();
177
178 if (!Dst.isVirtual() || !Src.isVirtual())
179 return false;
180
181 auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
182 const TargetRegisterClass *FPRRegClass, unsigned DUP,
183 unsigned UMOV) {
184 if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
185 MRI.getRegClassOrNull(Src) != FPRRegClass)
186 return false;
187
188 // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
189 // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
190 // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
191 // not worthwhile in that case.
192 for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
193 if (!Use.isCopy())
194 continue;
195
196 Register UseOp0 = Use.getOperand(0).getReg();
197 Register UseOp1 = Use.getOperand(1).getReg();
198 if (UseOp0.isPhysical() || UseOp1.isPhysical())
199 return false;
200
201 if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
202 MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
203 return false;
204 }
205
206 MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
207 if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
208 return false;
209
210 Register DupSrc = SrcMI->getOperand(1).getReg();
211 int64_t DupImm = SrcMI->getOperand(2).getImm();
212
213 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
214 .addReg(DupSrc)
215 .addImm(DupImm);
216 SrcMI->eraseFromParent();
217 MI.eraseFromParent();
218 return true;
219 };
220
221 return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
222 AArch64::DUPi32, AArch64::UMOVvi32) ||
223 TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
224 AArch64::DUPi64, AArch64::UMOVvi64);
225 }
226
optimizeNZCVDefs(MachineBasicBlock & MBB)227 bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
228 // If we find a dead NZCV implicit-def, we
229 // - try to convert the operation to a non-flag-setting equivalent
230 // - or mark the def as dead to aid later peephole optimizations.
231
232 // Use cases:
233 // 1)
234 // Consider the following code:
235 // FCMPSrr %0, %1, implicit-def $nzcv
236 // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
237 // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
238 // FCMPSrr %0, %1, implicit-def $nzcv
239 // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
240 // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
241 // when we have a single IR fcmp being used by two selects. During selection,
242 // to ensure that there can be no clobbering of nzcv between the fcmp and the
243 // csel, we have to generate an fcmp immediately before each csel is
244 // selected.
245 // However, often we can essentially CSE these together later in MachineCSE.
246 // This doesn't work though if there are unrelated flag-setting instructions
247 // in between the two FCMPs. In this case, the SUBS defines NZCV
248 // but it doesn't have any users, being overwritten by the second FCMP.
249 //
250 // 2)
251 // The instruction selector always emits the flag-setting variant of ADC/SBC
252 // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
253 // instructions is never used, we can switch to the non-flag-setting variant.
254
255 bool Changed = false;
256 auto &MF = *MBB.getParent();
257 auto &Subtarget = MF.getSubtarget();
258 const auto &TII = Subtarget.getInstrInfo();
259 auto TRI = Subtarget.getRegisterInfo();
260 auto RBI = Subtarget.getRegBankInfo();
261 auto &MRI = MF.getRegInfo();
262
263 LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
264 LRU.addLiveOuts(MBB);
265
266 for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
267 bool NZCVDead = LRU.available(AArch64::NZCV);
268 if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
269 // The instruction defines NZCV, but NZCV is dead.
270 unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
271 int DeadNZCVIdx =
272 II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
273 if (DeadNZCVIdx != -1) {
274 if (NewOpc) {
275 // If there is an equivalent non-flag-setting op, we convert.
276 LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
277 "op: "
278 << II);
279 II.setDesc(TII->get(NewOpc));
280 II.removeOperand(DeadNZCVIdx);
281 // Changing the opcode can result in differing regclass requirements,
282 // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
283 // Constrain the regclasses, possibly introducing a copy.
284 constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
285 II.getOperand(0), 0);
286 Changed |= true;
287 } else {
288 // Otherwise, we just set the nzcv imp-def operand to be dead, so the
289 // peephole optimizations can optimize them further.
290 II.getOperand(DeadNZCVIdx).setIsDead();
291 }
292 }
293 }
294 LRU.stepBackward(II);
295 }
296 return Changed;
297 }
298
runOnMachineFunction(MachineFunction & MF)299 bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
300 if (MF.getProperties().hasProperty(
301 MachineFunctionProperties::Property::FailedISel))
302 return false;
303 assert(MF.getProperties().hasProperty(
304 MachineFunctionProperties::Property::Selected) &&
305 "Expected a selected MF");
306
307 bool Changed = false;
308 for (auto &BB : MF) {
309 Changed |= optimizeNZCVDefs(BB);
310 Changed |= doPeepholeOpts(BB);
311 }
312 return Changed;
313 }
314
315 char AArch64PostSelectOptimize::ID = 0;
316 INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
317 "Optimize AArch64 selected instructions",
318 false, false)
319 INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
320 "Optimize AArch64 selected instructions", false,
321 false)
322
323 namespace llvm {
createAArch64PostSelectOptimize()324 FunctionPass *createAArch64PostSelectOptimize() {
325 return new AArch64PostSelectOptimize();
326 }
327 } // end namespace llvm
328