1 //===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass does a few optimisations related to Tail predicated loops
10 /// and MVE VPT blocks before register allocation is performed. For VPT blocks
11 /// the goal is to maximize the sizes of the blocks that will be created by the
12 /// MVE VPT Block Insertion pass (which runs after register allocation). For
13 /// tail predicated loops we transform the loop into something that will
14 /// hopefully make the backend ARMLowOverheadLoops pass's job easier.
15 ///
16 //===----------------------------------------------------------------------===//
17
18 #include "ARM.h"
19 #include "ARMSubtarget.h"
20 #include "MCTargetDesc/ARMBaseInfo.h"
21 #include "MVETailPredUtils.h"
22 #include "Thumb2InstrInfo.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineFunctionPass.h"
28 #include "llvm/CodeGen/MachineInstr.h"
29 #include "llvm/CodeGen/MachineLoopInfo.h"
30 #include "llvm/InitializePasses.h"
31 #include "llvm/Support/Debug.h"
32 #include <cassert>
33
34 using namespace llvm;
35
36 #define DEBUG_TYPE "arm-mve-vpt-opts"
37
38 static cl::opt<bool>
39 MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
40 cl::desc("Enable merging Loop End and Dec instructions."),
41 cl::init(true));
42
43 static cl::opt<bool>
44 SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
45 cl::desc("Enable setting lr as a predicate in tail predication regions."),
46 cl::init(true));
47
48 namespace {
49 class MVETPAndVPTOptimisations : public MachineFunctionPass {
50 public:
51 static char ID;
52 const Thumb2InstrInfo *TII;
53 MachineRegisterInfo *MRI;
54
MVETPAndVPTOptimisations()55 MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}
56
57 bool runOnMachineFunction(MachineFunction &Fn) override;
58
getAnalysisUsage(AnalysisUsage & AU) const59 void getAnalysisUsage(AnalysisUsage &AU) const override {
60 AU.addRequired<MachineLoopInfoWrapperPass>();
61 AU.addPreserved<MachineLoopInfoWrapperPass>();
62 AU.addRequired<MachineDominatorTreeWrapperPass>();
63 AU.addPreserved<MachineDominatorTreeWrapperPass>();
64 MachineFunctionPass::getAnalysisUsage(AU);
65 }
66
getPassName() const67 StringRef getPassName() const override {
68 return "ARM MVE TailPred and VPT Optimisation Pass";
69 }
70
71 private:
72 bool LowerWhileLoopStart(MachineLoop *ML);
73 bool MergeLoopEnd(MachineLoop *ML);
74 bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
75 MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
76 MachineInstr &Instr,
77 MachineOperand &User,
78 Register Target);
79 bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
80 bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
81 bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
82 bool ConvertVPSEL(MachineBasicBlock &MBB);
83 bool HintDoLoopStartReg(MachineBasicBlock &MBB);
84 MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
85 MachineInstr *LoopStart);
86 };
87
88 char MVETPAndVPTOptimisations::ID = 0;
89
90 } // end anonymous namespace
91
92 INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
93 "ARM MVE TailPred and VPT Optimisations pass", false,
94 false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)95 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
96 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
97 INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
98 "ARM MVE TailPred and VPT Optimisations pass", false, false)
99
100 static MachineInstr *LookThroughCOPY(MachineInstr *MI,
101 MachineRegisterInfo *MRI) {
102 while (MI && MI->getOpcode() == TargetOpcode::COPY &&
103 MI->getOperand(1).getReg().isVirtual())
104 MI = MRI->getVRegDef(MI->getOperand(1).getReg());
105 return MI;
106 }
107
108 // Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
109 // corresponding PHI that make up a low overhead loop. Only handles 'do' loops
110 // at the moment, returning a t2DoLoopStart in LoopStart.
findLoopComponents(MachineLoop * ML,MachineRegisterInfo * MRI,MachineInstr * & LoopStart,MachineInstr * & LoopPhi,MachineInstr * & LoopDec,MachineInstr * & LoopEnd)111 static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
112 MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
113 MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
114 MachineBasicBlock *Header = ML->getHeader();
115 MachineBasicBlock *Latch = ML->getLoopLatch();
116 if (!Header || !Latch) {
117 LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n");
118 return false;
119 }
120
121 // Find the loop end from the terminators.
122 LoopEnd = nullptr;
123 for (auto &T : Latch->terminators()) {
124 if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
125 LoopEnd = &T;
126 break;
127 }
128 if (T.getOpcode() == ARM::t2LoopEndDec &&
129 T.getOperand(2).getMBB() == Header) {
130 LoopEnd = &T;
131 break;
132 }
133 }
134 if (!LoopEnd) {
135 LLVM_DEBUG(dbgs() << " no LoopEnd\n");
136 return false;
137 }
138 LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd);
139
140 // Find the dec from the use of the end. There may be copies between
141 // instructions. We expect the loop to loop like:
142 // $vs = t2DoLoopStart ...
143 // loop:
144 // $vp = phi [ $vs ], [ $vd ]
145 // ...
146 // $vd = t2LoopDec $vp
147 // ...
148 // t2LoopEnd $vd, loop
149 if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
150 LoopDec = LoopEnd;
151 else {
152 LoopDec =
153 LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
154 if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
155 LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n");
156 return false;
157 }
158 }
159 LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec);
160
161 LoopPhi =
162 LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
163 if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
164 LoopPhi->getNumOperands() != 5 ||
165 (LoopPhi->getOperand(2).getMBB() != Latch &&
166 LoopPhi->getOperand(4).getMBB() != Latch)) {
167 LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n");
168 return false;
169 }
170 LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi);
171
172 Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
173 ? LoopPhi->getOperand(3).getReg()
174 : LoopPhi->getOperand(1).getReg();
175 LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
176 if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
177 LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
178 LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
179 LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n");
180 return false;
181 }
182 LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart);
183
184 return true;
185 }
186
RevertWhileLoopSetup(MachineInstr * MI,const TargetInstrInfo * TII)187 static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
188 MachineBasicBlock *MBB = MI->getParent();
189 assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
190 "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");
191
192 // Subs
193 MachineInstrBuilder MIB =
194 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
195 MIB.add(MI->getOperand(0));
196 MIB.add(MI->getOperand(1));
197 MIB.addImm(0);
198 MIB.addImm(ARMCC::AL);
199 MIB.addReg(ARM::NoRegister);
200 MIB.addReg(ARM::CPSR, RegState::Define);
201
202 // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
203 for (MachineInstr &I : MBB->terminators()) {
204 if (I.getOpcode() == ARM::t2WhileLoopStart) {
205 MachineInstrBuilder MIB =
206 BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
207 MIB.add(MI->getOperand(1)); // branch target
208 MIB.addImm(ARMCC::EQ);
209 MIB.addReg(ARM::CPSR);
210 I.eraseFromParent();
211 break;
212 }
213 }
214
215 MI->eraseFromParent();
216 }
217
218 // The Hardware Loop insertion and ISel Lowering produce the pseudos for the
219 // start of a while loop:
220 // %a:gprlr = t2WhileLoopSetup %Cnt
221 // t2WhileLoopStart %a, %BB
222 // We want to convert those to a single instruction which, like t2LoopEndDec and
223 // t2DoLoopStartTP is both a terminator and produces a value:
224 // %a:grplr: t2WhileLoopStartLR %Cnt, %BB
225 //
226 // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
227 // t2WhileLoopStart are not valid past regalloc.
LowerWhileLoopStart(MachineLoop * ML)228 bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
229 LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
230 << ML->getHeader()->getName() << "\n");
231
232 MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
233 if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
234 return false;
235
236 if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
237 return false;
238
239 Register LR = LoopStart->getOperand(0).getReg();
240 auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
241 return MI.getOpcode() == ARM::t2WhileLoopStart;
242 });
243 if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
244 RevertWhileLoopSetup(LoopStart, TII);
245 RevertLoopDec(LoopStart, TII);
246 RevertLoopEnd(LoopStart, TII);
247 return true;
248 }
249
250 MachineInstrBuilder MI =
251 BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
252 TII->get(ARM::t2WhileLoopStartLR), LR)
253 .add(LoopStart->getOperand(1))
254 .add(WLSIt->getOperand(1));
255 (void)MI;
256 LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());
257
258 WLSIt->eraseFromParent();
259 LoopStart->eraseFromParent();
260 return true;
261 }
262
263 // Return true if this instruction is invalid in a low overhead loop, usually
264 // because it clobbers LR.
IsInvalidTPInstruction(MachineInstr & MI)265 static bool IsInvalidTPInstruction(MachineInstr &MI) {
266 return MI.isCall() || isLoopStart(MI);
267 }
268
269 // Starting from PreHeader, search for invalid instructions back until the
270 // LoopStart block is reached. If invalid instructions are found, the loop start
271 // is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
272 // return the new DLS LoopStart if updated.
CheckForLRUseInPredecessors(MachineBasicBlock * PreHeader,MachineInstr * LoopStart)273 MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
274 MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
275 SmallVector<MachineBasicBlock *> Worklist;
276 SmallPtrSet<MachineBasicBlock *, 4> Visited;
277 Worklist.push_back(PreHeader);
278 Visited.insert(LoopStart->getParent());
279
280 while (!Worklist.empty()) {
281 MachineBasicBlock *MBB = Worklist.pop_back_val();
282 if (Visited.count(MBB))
283 continue;
284
285 for (MachineInstr &MI : *MBB) {
286 if (!IsInvalidTPInstruction(MI))
287 continue;
288
289 LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);
290
291 // Create a t2DoLoopStart at the end of the preheader.
292 MachineInstrBuilder MIB =
293 BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
294 LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
295 MIB.add(LoopStart->getOperand(0));
296 MIB.add(LoopStart->getOperand(1));
297
298 // Make sure to remove the kill flags, to prevent them from being invalid.
299 LoopStart->getOperand(1).setIsKill(false);
300
301 // Revert the t2WhileLoopStartLR to a CMP and Br.
302 RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
303 return MIB;
304 }
305
306 Visited.insert(MBB);
307 for (auto *Pred : MBB->predecessors())
308 Worklist.push_back(Pred);
309 }
310 return LoopStart;
311 }
312
313 // This function converts loops with t2LoopEnd and t2LoopEnd instructions into
314 // a single t2LoopEndDec instruction. To do that it needs to make sure that LR
315 // will be valid to be used for the low overhead loop, which means nothing else
316 // is using LR (especially calls) and there are no superfluous copies in the
317 // loop. The t2LoopEndDec is a branching terminator that produces a value (the
318 // decrement) around the loop edge, which means we need to be careful that they
319 // will be valid to allocate without any spilling.
MergeLoopEnd(MachineLoop * ML)320 bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
321 if (!MergeEndDec)
322 return false;
323
324 LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
325 << "\n");
326
327 MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
328 if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
329 return false;
330
331 // Check if there is an illegal instruction (a call) in the low overhead loop
332 // and if so revert it now before we get any further. While loops also need to
333 // check the preheaders, but can be reverted to a DLS loop if needed.
334 auto *PreHeader = ML->getLoopPreheader();
335 if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
336 LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);
337
338 for (MachineBasicBlock *MBB : ML->blocks()) {
339 for (MachineInstr &MI : *MBB) {
340 if (IsInvalidTPInstruction(MI)) {
341 LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
342 if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
343 RevertDoLoopStart(LoopStart, TII);
344 else
345 RevertWhileLoopStartLR(LoopStart, TII);
346 RevertLoopDec(LoopDec, TII);
347 RevertLoopEnd(LoopEnd, TII);
348 return true;
349 }
350 }
351 }
352
353 // Remove any copies from the loop, to ensure the phi that remains is both
354 // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
355 // that cannot spill, we need to be careful what remains in the loop.
356 Register PhiReg = LoopPhi->getOperand(0).getReg();
357 Register DecReg = LoopDec->getOperand(0).getReg();
358 Register StartReg = LoopStart->getOperand(0).getReg();
359 // Ensure the uses are expected, and collect any copies we want to remove.
360 SmallVector<MachineInstr *, 4> Copies;
361 auto CheckUsers = [&Copies](Register BaseReg,
362 ArrayRef<MachineInstr *> ExpectedUsers,
363 MachineRegisterInfo *MRI) {
364 SmallVector<Register, 4> Worklist;
365 Worklist.push_back(BaseReg);
366 while (!Worklist.empty()) {
367 Register Reg = Worklist.pop_back_val();
368 for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
369 if (llvm::is_contained(ExpectedUsers, &MI))
370 continue;
371 if (MI.getOpcode() != TargetOpcode::COPY ||
372 !MI.getOperand(0).getReg().isVirtual()) {
373 LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
374 return false;
375 }
376 Worklist.push_back(MI.getOperand(0).getReg());
377 Copies.push_back(&MI);
378 }
379 }
380 return true;
381 };
382 if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
383 !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
384 !CheckUsers(StartReg, {LoopPhi}, MRI)) {
385 // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
386 if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
387 RevertWhileLoopStartLR(LoopStart, TII);
388 RevertLoopDec(LoopDec, TII);
389 RevertLoopEnd(LoopEnd, TII);
390 return true;
391 }
392 return false;
393 }
394
395 MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
396 MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
397 MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);
398
399 if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
400 LoopPhi->getOperand(3).setReg(StartReg);
401 LoopPhi->getOperand(1).setReg(DecReg);
402 } else {
403 LoopPhi->getOperand(1).setReg(StartReg);
404 LoopPhi->getOperand(3).setReg(DecReg);
405 }
406
407 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
408 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
409 if (!TII->analyzeBranch(*LoopEnd->getParent(), TBB, FBB, Cond) && !FBB) {
410 // If the LoopEnd falls through, need to insert a t2B to the fall-through
411 // block so that the non-analyzable t2LoopEndDec doesn't fall through.
412 MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator();
413 BuildMI(LoopEnd->getParent(), DebugLoc(), TII->get(ARM::t2B))
414 .addMBB(&*MBBI)
415 .add(predOps(ARMCC::AL));
416 }
417
418 // Replace the loop dec and loop end as a single instruction.
419 MachineInstrBuilder MI =
420 BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
421 TII->get(ARM::t2LoopEndDec), DecReg)
422 .addReg(PhiReg)
423 .add(LoopEnd->getOperand(1));
424 (void)MI;
425 LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());
426
427 LoopDec->eraseFromParent();
428 LoopEnd->eraseFromParent();
429 for (auto *MI : Copies)
430 MI->eraseFromParent();
431 return true;
432 }
433
434 // Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
435 // instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
436 // instruction, making the backend ARMLowOverheadLoops passes job of finding the
437 // VCTP operand much simpler.
ConvertTailPredLoop(MachineLoop * ML,MachineDominatorTree * DT)438 bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
439 MachineDominatorTree *DT) {
440 LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
441 << ML->getHeader()->getName() << "\n");
442
443 // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
444 // in the loop.
445 MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
446 if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
447 return false;
448 if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
449 LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
450 return false;
451
452 SmallVector<MachineInstr *, 4> VCTPs;
453 SmallVector<MachineInstr *, 4> MVEInstrs;
454 for (MachineBasicBlock *BB : ML->blocks()) {
455 for (MachineInstr &MI : *BB)
456 if (isVCTP(&MI))
457 VCTPs.push_back(&MI);
458 else if (findFirstVPTPredOperandIdx(MI) != -1)
459 MVEInstrs.push_back(&MI);
460 }
461
462 if (VCTPs.empty()) {
463 LLVM_DEBUG(dbgs() << " no VCTPs\n");
464 return false;
465 }
466
467 // Check all VCTPs are the same.
468 MachineInstr *FirstVCTP = *VCTPs.begin();
469 for (MachineInstr *VCTP : VCTPs) {
470 LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP);
471 if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
472 VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
473 LLVM_DEBUG(dbgs() << " VCTP's are not identical\n");
474 return false;
475 }
476 }
477
478 // Check for the register being used can be setup before the loop. We expect
479 // this to be:
480 // $vx = ...
481 // loop:
482 // $vp = PHI [ $vx ], [ $vd ]
483 // ..
484 // $vpr = VCTP $vp
485 // ..
486 // $vd = t2SUBri $vp, #n
487 // ..
488 Register CountReg = FirstVCTP->getOperand(1).getReg();
489 if (!CountReg.isVirtual()) {
490 LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n");
491 return false;
492 }
493 MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
494 if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
495 Phi->getNumOperands() != 5 ||
496 (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
497 Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
498 LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n");
499 return false;
500 }
501 CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
502 ? Phi->getOperand(3).getReg()
503 : Phi->getOperand(1).getReg();
504
505 // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
506 // the preheader and add the new CountReg to it. We attempt to place it late
507 // in the preheader, but may need to move that earlier based on uses.
508 MachineBasicBlock *MBB = LoopStart->getParent();
509 MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
510 for (MachineInstr &Use :
511 MRI->use_instructions(LoopStart->getOperand(0).getReg()))
512 if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
513 !DT->dominates(ML->getHeader(), Use.getParent())) {
514 LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n");
515 return false;
516 }
517
518 unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
519 ? ARM::t2DoLoopStartTP
520 : ARM::t2WhileLoopStartTP;
521 MachineInstrBuilder MI =
522 BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
523 .add(LoopStart->getOperand(0))
524 .add(LoopStart->getOperand(1))
525 .addReg(CountReg);
526 if (NewOpc == ARM::t2WhileLoopStartTP)
527 MI.add(LoopStart->getOperand(2));
528 LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
529 << *MI.getInstr());
530 MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
531 LoopStart->eraseFromParent();
532
533 if (SetLRPredicate) {
534 // Each instruction in the loop needs to be using LR as the predicate from
535 // the Phi as the predicate.
536 Register LR = LoopPhi->getOperand(0).getReg();
537 for (MachineInstr *MI : MVEInstrs) {
538 int Idx = findFirstVPTPredOperandIdx(*MI);
539 MI->getOperand(Idx + 2).setReg(LR);
540 }
541 }
542
543 return true;
544 }
545
546 // Returns true if Opcode is any VCMP Opcode.
IsVCMP(unsigned Opcode)547 static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
548
549 // Returns true if a VCMP with this Opcode can have its operands swapped.
550 // There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs,
551 // and VCMPr instructions (since the r is always on the right).
CanHaveSwappedOperands(unsigned Opcode)552 static bool CanHaveSwappedOperands(unsigned Opcode) {
553 switch (Opcode) {
554 default:
555 return true;
556 case ARM::MVE_VCMPf32:
557 case ARM::MVE_VCMPf16:
558 case ARM::MVE_VCMPf32r:
559 case ARM::MVE_VCMPf16r:
560 case ARM::MVE_VCMPi8r:
561 case ARM::MVE_VCMPi16r:
562 case ARM::MVE_VCMPi32r:
563 case ARM::MVE_VCMPu8r:
564 case ARM::MVE_VCMPu16r:
565 case ARM::MVE_VCMPu32r:
566 case ARM::MVE_VCMPs8r:
567 case ARM::MVE_VCMPs16r:
568 case ARM::MVE_VCMPs32r:
569 return false;
570 }
571 }
572
573 // Returns the CondCode of a VCMP Instruction.
GetCondCode(MachineInstr & Instr)574 static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
575 assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
576 return ARMCC::CondCodes(Instr.getOperand(3).getImm());
577 }
578
579 // Returns true if Cond is equivalent to a VPNOT instruction on the result of
580 // Prev. Cond and Prev must be VCMPs.
IsVPNOTEquivalent(MachineInstr & Cond,MachineInstr & Prev)581 static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
582 assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
583
584 // Opcodes must match.
585 if (Cond.getOpcode() != Prev.getOpcode())
586 return false;
587
588 MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
589 MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
590
591 // If the VCMP has the opposite condition with the same operands, we can
592 // replace it with a VPNOT
593 ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
594 ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
595 if (ExpectedCode == GetCondCode(Prev))
596 if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
597 return true;
598 // Check again with operands swapped if possible
599 if (!CanHaveSwappedOperands(Cond.getOpcode()))
600 return false;
601 ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
602 return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
603 CondOP2.isIdenticalTo(PrevOP1);
604 }
605
606 // Returns true if Instr writes to VCCR.
IsWritingToVCCR(MachineInstr & Instr)607 static bool IsWritingToVCCR(MachineInstr &Instr) {
608 if (Instr.getNumOperands() == 0)
609 return false;
610 MachineOperand &Dst = Instr.getOperand(0);
611 if (!Dst.isReg())
612 return false;
613 Register DstReg = Dst.getReg();
614 if (!DstReg.isVirtual())
615 return false;
616 MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
617 const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
618 return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
619 }
620
621 // Transforms
622 // <Instr that uses %A ('User' Operand)>
623 // Into
624 // %K = VPNOT %Target
625 // <Instr that uses %K ('User' Operand)>
626 // And returns the newly inserted VPNOT.
627 // This optimization is done in the hopes of preventing spills/reloads of VPR by
628 // reducing the number of VCCR values with overlapping lifetimes.
ReplaceRegisterUseWithVPNOT(MachineBasicBlock & MBB,MachineInstr & Instr,MachineOperand & User,Register Target)629 MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
630 MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
631 Register Target) {
632 Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
633
634 MachineInstrBuilder MIBuilder =
635 BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
636 .addDef(NewResult)
637 .addReg(Target);
638 addUnpredicatedMveVpredNOp(MIBuilder);
639
640 // Make the user use NewResult instead, and clear its kill flag.
641 User.setReg(NewResult);
642 User.setIsKill(false);
643
644 LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
645 MIBuilder.getInstr()->dump());
646
647 return *MIBuilder.getInstr();
648 }
649
650 // Moves a VPNOT before its first user if an instruction that uses Reg is found
651 // in-between the VPNOT and its user.
652 // Returns true if there is at least one user of the VPNOT in the block.
MoveVPNOTBeforeFirstUser(MachineBasicBlock & MBB,MachineBasicBlock::iterator Iter,Register Reg)653 static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
654 MachineBasicBlock::iterator Iter,
655 Register Reg) {
656 assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
657 assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
658 "The VPNOT cannot be predicated");
659
660 MachineInstr &VPNOT = *Iter;
661 Register VPNOTResult = VPNOT.getOperand(0).getReg();
662 Register VPNOTOperand = VPNOT.getOperand(1).getReg();
663
664 // Whether the VPNOT will need to be moved, and whether we found a user of the
665 // VPNOT.
666 bool MustMove = false, HasUser = false;
667 MachineOperand *VPNOTOperandKiller = nullptr;
668 for (; Iter != MBB.end(); ++Iter) {
669 if (MachineOperand *MO =
670 Iter->findRegisterUseOperand(VPNOTOperand, /*TRI=*/nullptr,
671 /*isKill*/ true)) {
672 // If we find the operand that kills the VPNOTOperand's result, save it.
673 VPNOTOperandKiller = MO;
674 }
675
676 if (Iter->findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != -1) {
677 MustMove = true;
678 continue;
679 }
680
681 if (Iter->findRegisterUseOperandIdx(VPNOTResult, /*TRI=*/nullptr) == -1)
682 continue;
683
684 HasUser = true;
685 if (!MustMove)
686 break;
687
688 // Move the VPNOT right before Iter
689 LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
690 Iter->dump());
691 MBB.splice(Iter, &MBB, VPNOT.getIterator());
692 // If we move the instr, and its operand was killed earlier, remove the kill
693 // flag.
694 if (VPNOTOperandKiller)
695 VPNOTOperandKiller->setIsKill(false);
696
697 break;
698 }
699 return HasUser;
700 }
701
702 // This optimisation attempts to reduce the number of overlapping lifetimes of
703 // VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
704 // this replaces
705 // %A:vccr = (something)
706 // %B:vccr = VPNOT %A
707 // %Foo = (some op that uses %B)
708 // %Bar = (some op that uses %A)
709 // With
710 // %A:vccr = (something)
711 // %B:vccr = VPNOT %A
712 // %Foo = (some op that uses %B)
713 // %TMP2:vccr = VPNOT %B
714 // %Bar = (some op that uses %A)
ReduceOldVCCRValueUses(MachineBasicBlock & MBB)715 bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
716 MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
717 SmallVector<MachineInstr *, 4> DeadInstructions;
718 bool Modified = false;
719
720 while (Iter != End) {
721 Register VCCRValue, OppositeVCCRValue;
722 // The first loop looks for 2 unpredicated instructions:
723 // %A:vccr = (instr) ; A is stored in VCCRValue
724 // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue
725 for (; Iter != End; ++Iter) {
726 // We're only interested in unpredicated instructions that write to VCCR.
727 if (!IsWritingToVCCR(*Iter) ||
728 getVPTInstrPredicate(*Iter) != ARMVCC::None)
729 continue;
730 Register Dst = Iter->getOperand(0).getReg();
731
732 // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
733 // found what we were looking for.
734 if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
735 Iter->findRegisterUseOperandIdx(VCCRValue, /*TRI=*/nullptr) != -1) {
736 // Move the VPNOT closer to its first user if needed, and ignore if it
737 // has no users.
738 if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
739 continue;
740
741 OppositeVCCRValue = Dst;
742 ++Iter;
743 break;
744 }
745
746 // Else, just set VCCRValue.
747 VCCRValue = Dst;
748 }
749
750 // If the first inner loop didn't find anything, stop here.
751 if (Iter == End)
752 break;
753
754 assert(VCCRValue && OppositeVCCRValue &&
755 "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
756 "stopped before the end of the block!");
757 assert(VCCRValue != OppositeVCCRValue &&
758 "VCCRValue should not be equal to OppositeVCCRValue!");
759
760 // LastVPNOTResult always contains the same value as OppositeVCCRValue.
761 Register LastVPNOTResult = OppositeVCCRValue;
762
763 // This second loop tries to optimize the remaining instructions.
764 for (; Iter != End; ++Iter) {
765 bool IsInteresting = false;
766
767 if (MachineOperand *MO =
768 Iter->findRegisterUseOperand(VCCRValue, /*TRI=*/nullptr)) {
769 IsInteresting = true;
770
771 // - If the instruction is a VPNOT, it can be removed, and we can just
772 // replace its uses with LastVPNOTResult.
773 // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
774 if (Iter->getOpcode() == ARM::MVE_VPNOT) {
775 Register Result = Iter->getOperand(0).getReg();
776
777 MRI->replaceRegWith(Result, LastVPNOTResult);
778 DeadInstructions.push_back(&*Iter);
779 Modified = true;
780
781 LLVM_DEBUG(dbgs()
782 << "Replacing all uses of '" << printReg(Result)
783 << "' with '" << printReg(LastVPNOTResult) << "'\n");
784 } else {
785 MachineInstr &VPNOT =
786 ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
787 Modified = true;
788
789 LastVPNOTResult = VPNOT.getOperand(0).getReg();
790 std::swap(VCCRValue, OppositeVCCRValue);
791
792 LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
793 << "' with '" << printReg(LastVPNOTResult)
794 << "' in instr: " << *Iter);
795 }
796 } else {
797 // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
798 // instead as they contain the same value.
799 if (MachineOperand *MO = Iter->findRegisterUseOperand(
800 OppositeVCCRValue, /*TRI=*/nullptr)) {
801 IsInteresting = true;
802
803 // This is pointless if LastVPNOTResult == OppositeVCCRValue.
804 if (LastVPNOTResult != OppositeVCCRValue) {
805 LLVM_DEBUG(dbgs() << "Replacing usage of '"
806 << printReg(OppositeVCCRValue) << "' with '"
807 << printReg(LastVPNOTResult) << " for instr: ";
808 Iter->dump());
809 MO->setReg(LastVPNOTResult);
810 Modified = true;
811 }
812
813 MO->setIsKill(false);
814 }
815
816 // If this is an unpredicated VPNOT on
817 // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
818 if (Iter->getOpcode() == ARM::MVE_VPNOT &&
819 getVPTInstrPredicate(*Iter) == ARMVCC::None) {
820 Register VPNOTOperand = Iter->getOperand(1).getReg();
821 if (VPNOTOperand == LastVPNOTResult ||
822 VPNOTOperand == OppositeVCCRValue) {
823 IsInteresting = true;
824
825 std::swap(VCCRValue, OppositeVCCRValue);
826 LastVPNOTResult = Iter->getOperand(0).getReg();
827 }
828 }
829 }
830
831 // If this instruction was not interesting, and it writes to VCCR, stop.
832 if (!IsInteresting && IsWritingToVCCR(*Iter))
833 break;
834 }
835 }
836
837 for (MachineInstr *DeadInstruction : DeadInstructions)
838 DeadInstruction->eraseFromParent();
839
840 return Modified;
841 }
842
843 // This optimisation replaces VCMPs with VPNOTs when they are equivalent.
ReplaceVCMPsByVPNOTs(MachineBasicBlock & MBB)844 bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
845 SmallVector<MachineInstr *, 4> DeadInstructions;
846
847 // The last VCMP that we have seen and that couldn't be replaced.
848 // This is reset when an instruction that writes to VCCR/VPR is found, or when
849 // a VCMP is replaced with a VPNOT.
850 // We'll only replace VCMPs with VPNOTs when this is not null, and when the
851 // current VCMP is the opposite of PrevVCMP.
852 MachineInstr *PrevVCMP = nullptr;
853 // If we find an instruction that kills the result of PrevVCMP, we save the
854 // operand here to remove the kill flag in case we need to use PrevVCMP's
855 // result.
856 MachineOperand *PrevVCMPResultKiller = nullptr;
857
858 for (MachineInstr &Instr : MBB.instrs()) {
859 if (PrevVCMP) {
860 if (MachineOperand *MO =
861 Instr.findRegisterUseOperand(PrevVCMP->getOperand(0).getReg(),
862 /*TRI=*/nullptr, /*isKill*/ true)) {
863 // If we come accross the instr that kills PrevVCMP's result, record it
864 // so we can remove the kill flag later if we need to.
865 PrevVCMPResultKiller = MO;
866 }
867 }
868
869 // Ignore predicated instructions.
870 if (getVPTInstrPredicate(Instr) != ARMVCC::None)
871 continue;
872
873 // Only look at VCMPs
874 if (!IsVCMP(Instr.getOpcode())) {
875 // If the instruction writes to VCCR, forget the previous VCMP.
876 if (IsWritingToVCCR(Instr))
877 PrevVCMP = nullptr;
878 continue;
879 }
880
881 if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
882 PrevVCMP = &Instr;
883 continue;
884 }
885
886 // The register containing the result of the VCMP that we're going to
887 // replace.
888 Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();
889
890 // Build a VPNOT to replace the VCMP, reusing its operands.
891 MachineInstrBuilder MIBuilder =
892 BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
893 .add(Instr.getOperand(0))
894 .addReg(PrevVCMPResultReg);
895 addUnpredicatedMveVpredNOp(MIBuilder);
896 LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
897 MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: ";
898 Instr.dump());
899
900 // If we found an instruction that uses, and kills PrevVCMP's result,
901 // remove the kill flag.
902 if (PrevVCMPResultKiller)
903 PrevVCMPResultKiller->setIsKill(false);
904
905 // Finally, mark the old VCMP for removal and reset
906 // PrevVCMP/PrevVCMPResultKiller.
907 DeadInstructions.push_back(&Instr);
908 PrevVCMP = nullptr;
909 PrevVCMPResultKiller = nullptr;
910 }
911
912 for (MachineInstr *DeadInstruction : DeadInstructions)
913 DeadInstruction->eraseFromParent();
914
915 return !DeadInstructions.empty();
916 }
917
ReplaceConstByVPNOTs(MachineBasicBlock & MBB,MachineDominatorTree * DT)918 bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
919 MachineDominatorTree *DT) {
920 // Scan through the block, looking for instructions that use constants moves
921 // into VPR that are the negative of one another. These are expected to be
922 // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
923 // mask is kept it or and VPNOT's of it are added or reused as we scan through
924 // the function.
925 unsigned LastVPTImm = 0;
926 Register LastVPTReg = 0;
927 SmallSet<MachineInstr *, 4> DeadInstructions;
928
929 for (MachineInstr &Instr : MBB.instrs()) {
930 // Look for predicated MVE instructions.
931 int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
932 if (PIdx == -1)
933 continue;
934 Register VPR = Instr.getOperand(PIdx + 1).getReg();
935 if (!VPR.isVirtual())
936 continue;
937
938 // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
939 MachineInstr *Copy = MRI->getVRegDef(VPR);
940 if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
941 !Copy->getOperand(1).getReg().isVirtual() ||
942 MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
943 LastVPTReg = 0;
944 continue;
945 }
946 Register GPR = Copy->getOperand(1).getReg();
947
948 // Find the Immediate used by the copy.
949 auto getImm = [&](Register GPR) -> unsigned {
950 MachineInstr *Def = MRI->getVRegDef(GPR);
951 if (Def && (Def->getOpcode() == ARM::t2MOVi ||
952 Def->getOpcode() == ARM::t2MOVi16))
953 return Def->getOperand(1).getImm();
954 return -1U;
955 };
956 unsigned Imm = getImm(GPR);
957 if (Imm == -1U) {
958 LastVPTReg = 0;
959 continue;
960 }
961
962 unsigned NotImm = ~Imm & 0xffff;
963 if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
964 MRI->clearKillFlags(LastVPTReg);
965 Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
966 if (MRI->use_empty(VPR)) {
967 DeadInstructions.insert(Copy);
968 if (MRI->hasOneUse(GPR))
969 DeadInstructions.insert(MRI->getVRegDef(GPR));
970 }
971 LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
972 VPR = LastVPTReg;
973 } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
974 // We have found the not of a previous constant. Create a VPNot of the
975 // earlier predicate reg and use it instead of the copy.
976 Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
977 auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
978 TII->get(ARM::MVE_VPNOT), NewVPR)
979 .addReg(LastVPTReg);
980 addUnpredicatedMveVpredNOp(VPNot);
981
982 // Use the new register and check if the def is now dead.
983 Instr.getOperand(PIdx + 1).setReg(NewVPR);
984 if (MRI->use_empty(VPR)) {
985 DeadInstructions.insert(Copy);
986 if (MRI->hasOneUse(GPR))
987 DeadInstructions.insert(MRI->getVRegDef(GPR));
988 }
989 LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
990 << Instr);
991 VPR = NewVPR;
992 }
993
994 LastVPTImm = Imm;
995 LastVPTReg = VPR;
996 }
997
998 for (MachineInstr *DI : DeadInstructions)
999 DI->eraseFromParent();
1000
1001 return !DeadInstructions.empty();
1002 }
1003
1004 // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
1005 // somewhat blunt approximation to allow tail predicated with vpsel
1006 // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
1007 // different semantics under tail predication. Until that is modelled we just
1008 // convert to a VMOVT (via a predicated VORR) instead.
ConvertVPSEL(MachineBasicBlock & MBB)1009 bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
1010 bool HasVCTP = false;
1011 SmallVector<MachineInstr *, 4> DeadInstructions;
1012
1013 for (MachineInstr &MI : MBB.instrs()) {
1014 if (isVCTP(&MI)) {
1015 HasVCTP = true;
1016 continue;
1017 }
1018
1019 if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
1020 continue;
1021
1022 MachineInstrBuilder MIBuilder =
1023 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
1024 .add(MI.getOperand(0))
1025 .add(MI.getOperand(1))
1026 .add(MI.getOperand(1))
1027 .addImm(ARMVCC::Then)
1028 .add(MI.getOperand(4))
1029 .add(MI.getOperand(5))
1030 .add(MI.getOperand(2));
1031 // Silence unused variable warning in release builds.
1032 (void)MIBuilder;
1033 LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
1034 dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump());
1035 DeadInstructions.push_back(&MI);
1036 }
1037
1038 for (MachineInstr *DeadInstruction : DeadInstructions)
1039 DeadInstruction->eraseFromParent();
1040
1041 return !DeadInstructions.empty();
1042 }
1043
1044 // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as
1045 // the instruction may be removable as a noop.
HintDoLoopStartReg(MachineBasicBlock & MBB)1046 bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
1047 bool Changed = false;
1048 for (MachineInstr &MI : MBB.instrs()) {
1049 if (MI.getOpcode() != ARM::t2DoLoopStart)
1050 continue;
1051 Register R = MI.getOperand(1).getReg();
1052 MachineFunction *MF = MI.getParent()->getParent();
1053 MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
1054 Changed = true;
1055 }
1056 return Changed;
1057 }
1058
runOnMachineFunction(MachineFunction & Fn)1059 bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
1060 const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
1061
1062 if (!STI.isThumb2() || !STI.hasLOB())
1063 return false;
1064
1065 TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
1066 MRI = &Fn.getRegInfo();
1067 MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
1068 MachineDominatorTree *DT =
1069 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
1070
1071 LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
1072 << "********** Function: " << Fn.getName() << '\n');
1073
1074 bool Modified = false;
1075 for (MachineLoop *ML : MLI->getLoopsInPreorder()) {
1076 Modified |= LowerWhileLoopStart(ML);
1077 Modified |= MergeLoopEnd(ML);
1078 Modified |= ConvertTailPredLoop(ML, DT);
1079 }
1080
1081 for (MachineBasicBlock &MBB : Fn) {
1082 Modified |= HintDoLoopStartReg(MBB);
1083 Modified |= ReplaceConstByVPNOTs(MBB, DT);
1084 Modified |= ReplaceVCMPsByVPNOTs(MBB);
1085 Modified |= ReduceOldVCCRValueUses(MBB);
1086 Modified |= ConvertVPSEL(MBB);
1087 }
1088
1089 LLVM_DEBUG(dbgs() << "**************************************\n");
1090 return Modified;
1091 }
1092
1093 /// createMVETPAndVPTOptimisationsPass
createMVETPAndVPTOptimisationsPass()1094 FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
1095 return new MVETPAndVPTOptimisations();
1096 }
1097