//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;

namespace {
class AArch64PostSelectOptimize : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostSelectOptimize();

  StringRef getPassName() const override {
    return "AArch64 Post Select Optimizer";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool optimizeNZCVDefs(MachineBasicBlock &MBB);
  bool doPeepholeOpts(MachineBasicBlock &MBB);
  /// Look for cross regclass copies that can be trivially eliminated.
  bool foldSimpleCrossClassCopies(MachineInstr &MI);
  bool foldCopyDup(MachineInstr &MI);
};
} // end anonymous namespace

void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostSelectOptimize::AArch64PostSelectOptimize()
    : MachineFunctionPass(ID) {
  initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
}

/// Return the opcode of the non-flag-setting equivalent of the flag-setting
/// opcode \p Opc, or 0 if no such equivalent exists.
unsigned getNonFlagSettingVariant(unsigned Opc) {
  switch (Opc) {
  default:
    return 0;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSXrs:
    return AArch64::SUBXrs;
  case AArch64::SUBSWrs:
    return AArch64::SUBWrs;
  case AArch64::SUBSXri:
    return AArch64::SUBXri;
  case AArch64::SUBSWri:
    return AArch64::SUBWri;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSXrs:
    return AArch64::ADDXrs;
  case AArch64::ADDSWrs:
    return AArch64::ADDWrs;
  case AArch64::ADDSXri:
    return AArch64::ADDXri;
  case AArch64::ADDSWri:
    return AArch64::ADDWri;
  case AArch64::SBCSXr:
    return AArch64::SBCXr;
  case AArch64::SBCSWr:
    return AArch64::SBCWr;
  case AArch64::ADCSXr:
    return AArch64::ADCXr;
  case AArch64::ADCSWr:
    return AArch64::ADCWr;
  }
}

bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (auto &MI : make_early_inc_range(MBB)) {
    bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
    if (!CurrentIterChanged)
      CurrentIterChanged |= foldCopyDup(MI);
    Changed |= CurrentIterChanged;
  }
  return Changed;
}

bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();

  if (!MI.isCopy())
    return false;

  if (MI.getOperand(1).getSubReg())
    return false; // Don't deal with subreg copies

  Register Src = MI.getOperand(1).getReg();
  Register Dst = MI.getOperand(0).getReg();

  if (Src.isPhysical() || Dst.isPhysical())
    return false;

  const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

  if (SrcRC == DstRC)
    return false;

  if (SrcRC->hasSubClass(DstRC)) {
    // This is the case where the source class is a superclass of the dest, so
    // if the copy is the only user of the source, we can just constrain the
    // source reg to the dest class.

    if (!MRI.hasOneNonDBGUse(Src))
      return false; // Only constrain single uses of the source.

    // Constrain to dst reg class as long as it's not a weird class that only
    // has a few registers.
    if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
      return false;
  } else if (DstRC->hasSubClass(SrcRC)) {
    // This is the inverse case, where the destination class is a superclass of
    // the source. Here, if the copy is the only user, we can just constrain
    // the user of the copy to use the smaller class of the source.
  } else {
    return false;
  }

  MRI.replaceRegWith(Dst, Src);
  MI.eraseFromParent();
  return true;
}

bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
  if (!MI.isCopy())
    return false;

  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();
  auto *TII = MF->getSubtarget().getInstrInfo();

  // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
  // Here Dst is y and Src is the result of DUP.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (!Dst.isVirtual() || !Src.isVirtual())
    return false;

  auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
                         const TargetRegisterClass *FPRRegClass, unsigned DUP,
                         unsigned UMOV) {
    if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
        MRI.getRegClassOrNull(Src) != FPRRegClass)
      return false;

    // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
    // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
    // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
    // not worthwhile in that case.
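    // A hypothetical MIR sketch of that case (register names invented for
    // illustration):
    //   %y:gpr32 = COPY %d      ; this COPY, where %d:fpr32 = DUPi32 %x, i
    //   %z:fpr32 = COPY %y      ; FPR use of the GPR copy
    // peephole-opt can collapse the whole chain into a single DUP, which is
    // better than the UMOV this transform would emit, so we bail out below.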
    for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
      if (!Use.isCopy())
        continue;

      Register UseOp0 = Use.getOperand(0).getReg();
      Register UseOp1 = Use.getOperand(1).getReg();
      if (UseOp0.isPhysical() || UseOp1.isPhysical())
        return false;

      if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
          MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
        return false;
    }

    MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
    if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
      return false;

    Register DupSrc = SrcMI->getOperand(1).getReg();
    int64_t DupImm = SrcMI->getOperand(2).getImm();

    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
        .addReg(DupSrc)
        .addImm(DupImm);
    SrcMI->eraseFromParent();
    MI.eraseFromParent();
    return true;
  };

  return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
                     AArch64::DUPi32, AArch64::UMOVvi32) ||
         TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
                     AArch64::DUPi64, AArch64::UMOVvi64);
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  // If we find a dead NZCV implicit-def, we
  // - try to convert the operation to a non-flag-setting equivalent, or
  // - mark the def as dead to aid later peephole optimizations.

  // Use cases:
  // 1)
  // Consider the following code:
  //   FCMPSrr %0, %1, implicit-def $nzcv
  //   %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  //   %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  //   FCMPSrr %0, %1, implicit-def $nzcv
  //   %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  // This kind of code, where we have two FCMPs each feeding a CSEL, can happen
  // when a single IR fcmp is used by two selects. During selection, to ensure
  // that there can be no clobbering of nzcv between the fcmp and the csel, we
  // have to generate an fcmp immediately before each csel is selected.
  // However, often we can essentially CSE these together later in MachineCSE.
  // This doesn't work though if there are unrelated flag-setting instructions
  // in between the two FCMPs. Here, the SUBS defines NZCV, but its def has no
  // users, being overwritten by the second FCMP.
  //
  // 2)
  // The instruction selector always emits the flag-setting variant of ADC/SBC
  // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
  // instructions is never used, we can switch to the non-flag-setting variant.

  bool Changed = false;
  auto &MF = *MBB.getParent();
  auto &Subtarget = MF.getSubtarget();
  const auto &TII = Subtarget.getInstrInfo();
  auto TRI = Subtarget.getRegisterInfo();
  auto RBI = Subtarget.getRegBankInfo();
  auto &MRI = MF.getRegInfo();

  LiveRegUnits LRU(*TRI);
  LRU.addLiveOuts(MBB);

  for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
    bool NZCVDead = LRU.available(AArch64::NZCV);
    if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
      // The instruction defines NZCV, but NZCV is dead.
      unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
      int DeadNZCVIdx =
          II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
      if (DeadNZCVIdx != -1) {
        if (NewOpc) {
          // If there is an equivalent non-flag-setting op, we convert.
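          // E.g. a SUBSWrr whose NZCV def is dead becomes a SUBWrr, and the
          // now-unused implicit-def of $nzcv is removed just below.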
          LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
                               "op: "
                            << II);
          II.setDesc(TII->get(NewOpc));
          II.removeOperand(DeadNZCVIdx);
          // Changing the opcode can result in differing regclass requirements,
          // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
          // Constrain the regclasses, possibly introducing a copy.
          constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
                                   II.getOperand(0), 0);
          Changed = true;
        } else {
          // Otherwise, we just set the nzcv imp-def operand to be dead, so
          // that later peephole optimizations can optimize it further.
          II.getOperand(DeadNZCVIdx).setIsDead();
        }
      }
    }
    LRU.stepBackward(II);
  }
  return Changed;
}

bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Selected) &&
         "Expected a selected MF");

  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= optimizeNZCVDefs(BB);
    Changed |= doPeepholeOpts(BB);
  }
  return Changed;
}

char AArch64PostSelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions", false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false, false)

namespace llvm {
FunctionPass *createAArch64PostSelectOptimize() {
  return new AArch64PostSelectOptimize();
}
} // end namespace llvm