//=== AArch64PostSelectOptimize.cpp ---------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass does post-instruction-selection optimizations in the GlobalISel // pipeline, before the rest of codegen runs. // //===----------------------------------------------------------------------===// #include "AArch64.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "aarch64-post-select-optimize" using namespace llvm; namespace { class AArch64PostSelectOptimize : public MachineFunctionPass { public: static char ID; AArch64PostSelectOptimize(); StringRef getPassName() const override { return "AArch64 Post Select Optimizer"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; private: bool optimizeNZCVDefs(MachineBasicBlock &MBB); bool doPeepholeOpts(MachineBasicBlock &MBB); /// Look for cross regclass copies that can be trivially eliminated. bool foldSimpleCrossClassCopies(MachineInstr &MI); }; } // end anonymous namespace void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } AArch64PostSelectOptimize::AArch64PostSelectOptimize() : MachineFunctionPass(ID) { initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry()); } unsigned getNonFlagSettingVariant(unsigned Opc) { switch (Opc) { default: return 0; case AArch64::SUBSXrr: return AArch64::SUBXrr; case AArch64::SUBSWrr: return AArch64::SUBWrr; case AArch64::SUBSXrs: return AArch64::SUBXrs; case AArch64::SUBSXri: return AArch64::SUBXri; case AArch64::SUBSWri: return AArch64::SUBWri; } } bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) { bool Changed = false; for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) { Changed |= foldSimpleCrossClassCopies(MI); } return Changed; } bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) { auto *MF = MI.getMF(); auto &MRI = MF->getRegInfo(); if (!MI.isCopy()) return false; if (MI.getOperand(1).getSubReg()) return false; // Don't deal with subreg copies Register Src = MI.getOperand(1).getReg(); Register Dst = MI.getOperand(0).getReg(); if (Src.isPhysical() || Dst.isPhysical()) return false; const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); if (SrcRC == DstRC) return false; if (SrcRC->hasSubClass(DstRC)) { // This is the case where the source class is a superclass of the dest, so // if the copy is the only user of the source, we can just constrain the // source reg to the dest class. if (!MRI.hasOneNonDBGUse(Src)) return false; // Only constrain single uses of the source. // Constrain to dst reg class as long as it's not a weird class that only // has a few registers. if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25)) return false; } else if (DstRC->hasSubClass(SrcRC)) { // This is the inverse case, where the destination class is a superclass of // the source. Here, if the copy is the only user, we can just constrain // the user of the copy to use the smaller class of the source. } else { return false; } MRI.replaceRegWith(Dst, Src); MI.eraseFromParent(); return true; } bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { // Consider the following code: // FCMPSrr %0, %1, implicit-def $nzcv // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv // FCMPSrr %0, %1, implicit-def $nzcv // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv // This kind of code where we have 2 FCMPs each feeding a CSEL can happen // when we have a single IR fcmp being used by two selects. During selection, // to ensure that there can be no clobbering of nzcv between the fcmp and the // csel, we have to generate an fcmp immediately before each csel is // selected. // However, often we can essentially CSE these together later in MachineCSE. // This doesn't work though if there are unrelated flag-setting instructions // in between the two FCMPs. In this case, the SUBS defines NZCV // but it doesn't have any users, being overwritten by the second FCMP. // // Our solution here is to try to convert flag setting operations between // a interval of identical FCMPs, so that CSE will be able to eliminate one. bool Changed = false; auto &MF = *MBB.getParent(); auto &Subtarget = MF.getSubtarget(); const auto &TII = Subtarget.getInstrInfo(); auto TRI = Subtarget.getRegisterInfo(); auto RBI = Subtarget.getRegBankInfo(); auto &MRI = MF.getRegInfo(); // The first step is to find the first and last FCMPs. If we have found // at least two, then set the limit of the bottom-up walk to the first FCMP // found since we're only interested in dealing with instructions between // them. MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr; for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) { if (MI.getOpcode() == AArch64::FCMPSrr || MI.getOpcode() == AArch64::FCMPDrr) { if (!FirstCmp) FirstCmp = &MI; else LastCmp = &MI; } } // In addition to converting flag-setting ops in fcmp ranges into non-flag // setting ops, across the whole basic block we also detect when nzcv // implicit-defs are dead, and mark them as dead. Peephole optimizations need // this information later. LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo()); LRU.addLiveOuts(MBB); bool NZCVDead = LRU.available(AArch64::NZCV); bool InsideCmpRange = false; for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { LRU.stepBackward(II); if (LastCmp) { // There's a range present in this block. // If we're inside an fcmp range, look for begin instruction. if (InsideCmpRange && &II == FirstCmp) InsideCmpRange = false; else if (&II == LastCmp) InsideCmpRange = true; } // Did this instruction define NZCV? bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV); if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) { // If we have a def and NZCV is dead, then we may convert this op. unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode()); int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV); if (DeadNZCVIdx != -1) { // If we're inside an fcmp range, then convert flag setting ops. if (InsideCmpRange && NewOpc) { LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting " "op in fcmp range: " << II); II.setDesc(TII->get(NewOpc)); II.removeOperand(DeadNZCVIdx); // Changing the opcode can result in differing regclass requirements, // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp. // Constrain the regclasses, possibly introducing a copy. constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(), II.getOperand(0), 0); Changed |= true; } else { // Otherwise, we just set the nzcv imp-def operand to be dead, so the // peephole optimizations can optimize them further. II.getOperand(DeadNZCVIdx).setIsDead(); } } } NZCVDead = NZCVDeadAtCurrInstr; } return Changed; } bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) return false; assert(MF.getProperties().hasProperty( MachineFunctionProperties::Property::Selected) && "Expected a selected MF"); bool Changed = false; for (auto &BB : MF) { Changed |= optimizeNZCVDefs(BB); Changed |= doPeepholeOpts(BB); } return Changed; } char AArch64PostSelectOptimize::ID = 0; INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE, "Optimize AArch64 selected instructions", false, false) INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE, "Optimize AArch64 selected instructions", false, false) namespace llvm { FunctionPass *createAArch64PostSelectOptimize() { return new AArch64PostSelectOptimize(); } } // end namespace llvm