1 //=== AArch64PostSelectOptimize.cpp ---------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass does post-instruction-selection optimizations in the GlobalISel 10 // pipeline, before the rest of codegen runs. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64.h" 15 #include "AArch64TargetMachine.h" 16 #include "MCTargetDesc/AArch64MCTargetDesc.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/CodeGen/GlobalISel/Utils.h" 19 #include "llvm/CodeGen/MachineBasicBlock.h" 20 #include "llvm/CodeGen/MachineFunctionPass.h" 21 #include "llvm/CodeGen/MachineInstr.h" 22 #include "llvm/CodeGen/MachineOperand.h" 23 #include "llvm/CodeGen/TargetPassConfig.h" 24 #include "llvm/Support/Debug.h" 25 #include "llvm/Support/ErrorHandling.h" 26 27 #define DEBUG_TYPE "aarch64-post-select-optimize" 28 29 using namespace llvm; 30 31 namespace { 32 class AArch64PostSelectOptimize : public MachineFunctionPass { 33 public: 34 static char ID; 35 36 AArch64PostSelectOptimize() : MachineFunctionPass(ID) {} 37 38 StringRef getPassName() const override { 39 return "AArch64 Post Select Optimizer"; 40 } 41 42 bool runOnMachineFunction(MachineFunction &MF) override; 43 44 void getAnalysisUsage(AnalysisUsage &AU) const override; 45 46 private: 47 bool optimizeNZCVDefs(MachineBasicBlock &MBB); 48 bool doPeepholeOpts(MachineBasicBlock &MBB); 49 /// Look for cross regclass copies that can be trivially eliminated. 50 bool foldSimpleCrossClassCopies(MachineInstr &MI); 51 bool foldCopyDup(MachineInstr &MI); 52 }; 53 } // end anonymous namespace 54 55 void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { 56 AU.addRequired<TargetPassConfig>(); 57 AU.setPreservesCFG(); 58 getSelectionDAGFallbackAnalysisUsage(AU); 59 MachineFunctionPass::getAnalysisUsage(AU); 60 } 61 62 unsigned getNonFlagSettingVariant(unsigned Opc) { 63 switch (Opc) { 64 default: 65 return 0; 66 case AArch64::SUBSXrr: 67 return AArch64::SUBXrr; 68 case AArch64::SUBSWrr: 69 return AArch64::SUBWrr; 70 case AArch64::SUBSXrs: 71 return AArch64::SUBXrs; 72 case AArch64::SUBSWrs: 73 return AArch64::SUBWrs; 74 case AArch64::SUBSXri: 75 return AArch64::SUBXri; 76 case AArch64::SUBSWri: 77 return AArch64::SUBWri; 78 case AArch64::ADDSXrr: 79 return AArch64::ADDXrr; 80 case AArch64::ADDSWrr: 81 return AArch64::ADDWrr; 82 case AArch64::ADDSXrs: 83 return AArch64::ADDXrs; 84 case AArch64::ADDSWrs: 85 return AArch64::ADDWrs; 86 case AArch64::ADDSXri: 87 return AArch64::ADDXri; 88 case AArch64::ADDSWri: 89 return AArch64::ADDWri; 90 case AArch64::SBCSXr: 91 return AArch64::SBCXr; 92 case AArch64::SBCSWr: 93 return AArch64::SBCWr; 94 case AArch64::ADCSXr: 95 return AArch64::ADCXr; 96 case AArch64::ADCSWr: 97 return AArch64::ADCWr; 98 } 99 } 100 101 bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) { 102 bool Changed = false; 103 for (auto &MI : make_early_inc_range(MBB)) { 104 bool CurrentIterChanged = foldSimpleCrossClassCopies(MI); 105 if (!CurrentIterChanged) 106 CurrentIterChanged |= foldCopyDup(MI); 107 Changed |= CurrentIterChanged; 108 } 109 return Changed; 110 } 111 112 bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) { 113 auto *MF = MI.getMF(); 114 auto &MRI = MF->getRegInfo(); 115 116 if (!MI.isCopy()) 117 return false; 118 119 if (MI.getOperand(1).getSubReg()) 120 return false; // Don't deal with subreg copies 121 122 Register Src = MI.getOperand(1).getReg(); 123 Register Dst = MI.getOperand(0).getReg(); 124 125 if (Src.isPhysical() || Dst.isPhysical()) 126 return false; 127 128 const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); 129 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 130 131 if (SrcRC == DstRC) 132 return false; 133 134 135 if (SrcRC->hasSubClass(DstRC)) { 136 // This is the case where the source class is a superclass of the dest, so 137 // if the copy is the only user of the source, we can just constrain the 138 // source reg to the dest class. 139 140 if (!MRI.hasOneNonDBGUse(Src)) 141 return false; // Only constrain single uses of the source. 142 143 // Constrain to dst reg class as long as it's not a weird class that only 144 // has a few registers. 145 if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25)) 146 return false; 147 } else if (DstRC->hasSubClass(SrcRC)) { 148 // This is the inverse case, where the destination class is a superclass of 149 // the source. Here, if the copy is the only user, we can just constrain 150 // the user of the copy to use the smaller class of the source. 151 } else { 152 return false; 153 } 154 155 MRI.replaceRegWith(Dst, Src); 156 MI.eraseFromParent(); 157 return true; 158 } 159 160 bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) { 161 if (!MI.isCopy()) 162 return false; 163 164 auto *MF = MI.getMF(); 165 auto &MRI = MF->getRegInfo(); 166 auto *TII = MF->getSubtarget().getInstrInfo(); 167 168 // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i). 169 // Here Dst is y and Src is the result of DUP. 170 Register Dst = MI.getOperand(0).getReg(); 171 Register Src = MI.getOperand(1).getReg(); 172 173 if (!Dst.isVirtual() || !Src.isVirtual()) 174 return false; 175 176 auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass, 177 const TargetRegisterClass *FPRRegClass, unsigned DUP, 178 unsigned UMOV) { 179 if (MRI.getRegClassOrNull(Dst) != GPRRegClass || 180 MRI.getRegClassOrNull(Src) != FPRRegClass) 181 return false; 182 183 // There is a special case when one of the uses is COPY(z:FPR, y:GPR). 184 // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can 185 // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is 186 // not worthwhile in that case. 187 for (auto &Use : MRI.use_nodbg_instructions(Dst)) { 188 if (!Use.isCopy()) 189 continue; 190 191 Register UseOp0 = Use.getOperand(0).getReg(); 192 Register UseOp1 = Use.getOperand(1).getReg(); 193 if (UseOp0.isPhysical() || UseOp1.isPhysical()) 194 return false; 195 196 if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass && 197 MRI.getRegClassOrNull(UseOp1) == GPRRegClass) 198 return false; 199 } 200 201 MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src); 202 if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src)) 203 return false; 204 205 Register DupSrc = SrcMI->getOperand(1).getReg(); 206 int64_t DupImm = SrcMI->getOperand(2).getImm(); 207 208 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst) 209 .addReg(DupSrc) 210 .addImm(DupImm); 211 SrcMI->eraseFromParent(); 212 MI.eraseFromParent(); 213 return true; 214 }; 215 216 return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass, 217 AArch64::DUPi32, AArch64::UMOVvi32) || 218 TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass, 219 AArch64::DUPi64, AArch64::UMOVvi64); 220 } 221 222 bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { 223 // If we find a dead NZCV implicit-def, we 224 // - try to convert the operation to a non-flag-setting equivalent 225 // - or mark the def as dead to aid later peephole optimizations. 226 227 // Use cases: 228 // 1) 229 // Consider the following code: 230 // FCMPSrr %0, %1, implicit-def $nzcv 231 // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv 232 // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv 233 // FCMPSrr %0, %1, implicit-def $nzcv 234 // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv 235 // This kind of code where we have 2 FCMPs each feeding a CSEL can happen 236 // when we have a single IR fcmp being used by two selects. During selection, 237 // to ensure that there can be no clobbering of nzcv between the fcmp and the 238 // csel, we have to generate an fcmp immediately before each csel is 239 // selected. 240 // However, often we can essentially CSE these together later in MachineCSE. 241 // This doesn't work though if there are unrelated flag-setting instructions 242 // in between the two FCMPs. In this case, the SUBS defines NZCV 243 // but it doesn't have any users, being overwritten by the second FCMP. 244 // 245 // 2) 246 // The instruction selector always emits the flag-setting variant of ADC/SBC 247 // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these 248 // instructions is never used, we can switch to the non-flag-setting variant. 249 250 bool Changed = false; 251 auto &MF = *MBB.getParent(); 252 auto &Subtarget = MF.getSubtarget(); 253 const auto &TII = Subtarget.getInstrInfo(); 254 auto TRI = Subtarget.getRegisterInfo(); 255 auto RBI = Subtarget.getRegBankInfo(); 256 auto &MRI = MF.getRegInfo(); 257 258 LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo()); 259 LRU.addLiveOuts(MBB); 260 261 for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { 262 bool NZCVDead = LRU.available(AArch64::NZCV); 263 if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) { 264 // The instruction defines NZCV, but NZCV is dead. 265 unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode()); 266 int DeadNZCVIdx = 267 II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr); 268 if (DeadNZCVIdx != -1) { 269 if (NewOpc) { 270 // If there is an equivalent non-flag-setting op, we convert. 271 LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting " 272 "op: " 273 << II); 274 II.setDesc(TII->get(NewOpc)); 275 II.removeOperand(DeadNZCVIdx); 276 // Changing the opcode can result in differing regclass requirements, 277 // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp. 278 // Constrain the regclasses, possibly introducing a copy. 279 constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(), 280 II.getOperand(0), 0); 281 Changed |= true; 282 } else { 283 // Otherwise, we just set the nzcv imp-def operand to be dead, so the 284 // peephole optimizations can optimize them further. 285 II.getOperand(DeadNZCVIdx).setIsDead(); 286 } 287 } 288 } 289 LRU.stepBackward(II); 290 } 291 return Changed; 292 } 293 294 bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) { 295 if (MF.getProperties().hasFailedISel()) 296 return false; 297 assert(MF.getProperties().hasSelected() && "Expected a selected MF"); 298 299 bool Changed = false; 300 for (auto &BB : MF) { 301 Changed |= optimizeNZCVDefs(BB); 302 Changed |= doPeepholeOpts(BB); 303 } 304 return Changed; 305 } 306 307 char AArch64PostSelectOptimize::ID = 0; 308 INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE, 309 "Optimize AArch64 selected instructions", 310 false, false) 311 INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE, 312 "Optimize AArch64 selected instructions", false, 313 false) 314 315 namespace llvm { 316 FunctionPass *createAArch64PostSelectOptimize() { 317 return new AArch64PostSelectOptimize(); 318 } 319 } // end namespace llvm 320