1 //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual 11 /// flag bits. 12 /// 13 /// We have to do this by carefully analyzing and rewriting the usage of the 14 /// copied EFLAGS register because there is no general way to rematerialize the 15 /// entire EFLAGS register safely and efficiently. Using `popf` both forces 16 /// dynamic stack adjustment and can create correctness issues due to IF, TF, 17 /// and other non-status flags being overwritten. Using sequences involving 18 /// SAHF don't work on all x86 processors and are often quite slow compared to 19 /// directly testing a single status preserved in its own GPR. 20 /// 21 //===----------------------------------------------------------------------===// 22 23 #include "X86.h" 24 #include "X86InstrBuilder.h" 25 #include "X86InstrInfo.h" 26 #include "X86Subtarget.h" 27 #include "llvm/ADT/ArrayRef.h" 28 #include "llvm/ADT/DenseMap.h" 29 #include "llvm/ADT/PostOrderIterator.h" 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/ScopeExit.h" 32 #include "llvm/ADT/SmallPtrSet.h" 33 #include "llvm/ADT/SmallSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/SparseBitVector.h" 36 #include "llvm/ADT/Statistic.h" 37 #include "llvm/CodeGen/MachineBasicBlock.h" 38 #include "llvm/CodeGen/MachineConstantPool.h" 39 #include "llvm/CodeGen/MachineDominators.h" 40 #include "llvm/CodeGen/MachineFunction.h" 41 #include "llvm/CodeGen/MachineFunctionPass.h" 42 #include "llvm/CodeGen/MachineInstr.h" 43 #include "llvm/CodeGen/MachineInstrBuilder.h" 44 #include "llvm/CodeGen/MachineModuleInfo.h" 45 #include "llvm/CodeGen/MachineOperand.h" 46 #include "llvm/CodeGen/MachineRegisterInfo.h" 47 #include "llvm/CodeGen/MachineSSAUpdater.h" 48 #include "llvm/CodeGen/TargetInstrInfo.h" 49 #include "llvm/CodeGen/TargetRegisterInfo.h" 50 #include "llvm/CodeGen/TargetSchedule.h" 51 #include "llvm/CodeGen/TargetSubtargetInfo.h" 52 #include "llvm/IR/DebugLoc.h" 53 #include "llvm/MC/MCSchedule.h" 54 #include "llvm/Pass.h" 55 #include "llvm/Support/CommandLine.h" 56 #include "llvm/Support/Debug.h" 57 #include "llvm/Support/raw_ostream.h" 58 #include <algorithm> 59 #include <cassert> 60 #include <iterator> 61 #include <utility> 62 63 using namespace llvm; 64 65 #define PASS_KEY "x86-flags-copy-lowering" 66 #define DEBUG_TYPE PASS_KEY 67 68 STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); 69 STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); 70 STATISTIC(NumTestsInserted, "Number of test instructions inserted"); 71 STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); 72 73 namespace { 74 75 // Convenient array type for storing registers associated with each condition. 76 using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; 77 78 class X86FlagsCopyLoweringPass : public MachineFunctionPass { 79 public: 80 X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { } 81 82 StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } 83 bool runOnMachineFunction(MachineFunction &MF) override; 84 void getAnalysisUsage(AnalysisUsage &AU) const override; 85 86 /// Pass identification, replacement for typeid. 87 static char ID; 88 89 private: 90 MachineRegisterInfo *MRI = nullptr; 91 const X86Subtarget *Subtarget = nullptr; 92 const X86InstrInfo *TII = nullptr; 93 const TargetRegisterInfo *TRI = nullptr; 94 const TargetRegisterClass *PromoteRC = nullptr; 95 MachineDominatorTree *MDT = nullptr; 96 97 CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, 98 MachineBasicBlock::iterator CopyDefI); 99 100 Register promoteCondToReg(MachineBasicBlock &MBB, 101 MachineBasicBlock::iterator TestPos, 102 const DebugLoc &TestLoc, X86::CondCode Cond); 103 std::pair<unsigned, bool> getCondOrInverseInReg( 104 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 105 const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs); 106 void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, 107 const DebugLoc &Loc, unsigned Reg); 108 109 void rewriteArithmetic(MachineBasicBlock &TestMBB, 110 MachineBasicBlock::iterator TestPos, 111 const DebugLoc &TestLoc, MachineInstr &MI, 112 MachineOperand &FlagUse, CondRegArray &CondRegs); 113 void rewriteCMov(MachineBasicBlock &TestMBB, 114 MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc, 115 MachineInstr &CMovI, MachineOperand &FlagUse, 116 CondRegArray &CondRegs); 117 void rewriteFCMov(MachineBasicBlock &TestMBB, 118 MachineBasicBlock::iterator TestPos, 119 const DebugLoc &TestLoc, MachineInstr &CMovI, 120 MachineOperand &FlagUse, CondRegArray &CondRegs); 121 void rewriteCondJmp(MachineBasicBlock &TestMBB, 122 MachineBasicBlock::iterator TestPos, 123 const DebugLoc &TestLoc, MachineInstr &JmpI, 124 CondRegArray &CondRegs); 125 void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, 126 MachineInstr &CopyDefI); 127 void rewriteSetCC(MachineBasicBlock &TestMBB, 128 MachineBasicBlock::iterator TestPos, 129 const DebugLoc &TestLoc, MachineInstr &SetCCI, 130 MachineOperand &FlagUse, CondRegArray &CondRegs); 131 }; 132 133 } // end anonymous namespace 134 135 INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, 136 "X86 EFLAGS copy lowering", false, false) 137 INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, 138 "X86 EFLAGS copy lowering", false, false) 139 140 FunctionPass *llvm::createX86FlagsCopyLoweringPass() { 141 return new X86FlagsCopyLoweringPass(); 142 } 143 144 char X86FlagsCopyLoweringPass::ID = 0; 145 146 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { 147 AU.addRequired<MachineDominatorTree>(); 148 MachineFunctionPass::getAnalysisUsage(AU); 149 } 150 151 namespace { 152 /// An enumeration of the arithmetic instruction mnemonics which have 153 /// interesting flag semantics. 154 /// 155 /// We can map instruction opcodes into these mnemonics to make it easy to 156 /// dispatch with specific functionality. 157 enum class FlagArithMnemonic { 158 ADC, 159 ADCX, 160 ADOX, 161 RCL, 162 RCR, 163 SBB, 164 SETB, 165 }; 166 } // namespace 167 168 static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { 169 switch (Opcode) { 170 default: 171 report_fatal_error("No support for lowering a copy into EFLAGS when used " 172 "by this instruction!"); 173 174 #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ 175 case X86::MNEMONIC##8##SUFFIX: \ 176 case X86::MNEMONIC##16##SUFFIX: \ 177 case X86::MNEMONIC##32##SUFFIX: \ 178 case X86::MNEMONIC##64##SUFFIX: 179 180 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ 181 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ 182 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ 183 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ 184 LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ 185 case X86::MNEMONIC##8ri: \ 186 case X86::MNEMONIC##16ri8: \ 187 case X86::MNEMONIC##32ri8: \ 188 case X86::MNEMONIC##64ri8: \ 189 case X86::MNEMONIC##16ri: \ 190 case X86::MNEMONIC##32ri: \ 191 case X86::MNEMONIC##64ri32: \ 192 case X86::MNEMONIC##8mi: \ 193 case X86::MNEMONIC##16mi8: \ 194 case X86::MNEMONIC##32mi8: \ 195 case X86::MNEMONIC##64mi8: \ 196 case X86::MNEMONIC##16mi: \ 197 case X86::MNEMONIC##32mi: \ 198 case X86::MNEMONIC##64mi32: \ 199 case X86::MNEMONIC##8i8: \ 200 case X86::MNEMONIC##16i16: \ 201 case X86::MNEMONIC##32i32: \ 202 case X86::MNEMONIC##64i32: 203 204 LLVM_EXPAND_ADC_SBB_INSTR(ADC) 205 return FlagArithMnemonic::ADC; 206 207 LLVM_EXPAND_ADC_SBB_INSTR(SBB) 208 return FlagArithMnemonic::SBB; 209 210 #undef LLVM_EXPAND_ADC_SBB_INSTR 211 212 LLVM_EXPAND_INSTR_SIZES(RCL, rCL) 213 LLVM_EXPAND_INSTR_SIZES(RCL, r1) 214 LLVM_EXPAND_INSTR_SIZES(RCL, ri) 215 return FlagArithMnemonic::RCL; 216 217 LLVM_EXPAND_INSTR_SIZES(RCR, rCL) 218 LLVM_EXPAND_INSTR_SIZES(RCR, r1) 219 LLVM_EXPAND_INSTR_SIZES(RCR, ri) 220 return FlagArithMnemonic::RCR; 221 222 #undef LLVM_EXPAND_INSTR_SIZES 223 224 case X86::ADCX32rr: 225 case X86::ADCX64rr: 226 case X86::ADCX32rm: 227 case X86::ADCX64rm: 228 return FlagArithMnemonic::ADCX; 229 230 case X86::ADOX32rr: 231 case X86::ADOX64rr: 232 case X86::ADOX32rm: 233 case X86::ADOX64rm: 234 return FlagArithMnemonic::ADOX; 235 236 case X86::SETB_C32r: 237 case X86::SETB_C64r: 238 return FlagArithMnemonic::SETB; 239 } 240 } 241 242 static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, 243 MachineInstr &SplitI, 244 const X86InstrInfo &TII) { 245 MachineFunction &MF = *MBB.getParent(); 246 247 assert(SplitI.getParent() == &MBB && 248 "Split instruction must be in the split block!"); 249 assert(SplitI.isBranch() && 250 "Only designed to split a tail of branch instructions!"); 251 assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID && 252 "Must split on an actual jCC instruction!"); 253 254 // Dig out the previous instruction to the split point. 255 MachineInstr &PrevI = *std::prev(SplitI.getIterator()); 256 assert(PrevI.isBranch() && "Must split after a branch!"); 257 assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID && 258 "Must split after an actual jCC instruction!"); 259 assert(!std::prev(PrevI.getIterator())->isTerminator() && 260 "Must only have this one terminator prior to the split!"); 261 262 // Grab the one successor edge that will stay in `MBB`. 263 MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); 264 265 // Analyze the original block to see if we are actually splitting an edge 266 // into two edges. This can happen when we have multiple conditional jumps to 267 // the same successor. 268 bool IsEdgeSplit = 269 std::any_of(SplitI.getIterator(), MBB.instr_end(), 270 [&](MachineInstr &MI) { 271 assert(MI.isTerminator() && 272 "Should only have spliced terminators!"); 273 return llvm::any_of( 274 MI.operands(), [&](MachineOperand &MOp) { 275 return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; 276 }); 277 }) || 278 MBB.getFallThrough() == &UnsplitSucc; 279 280 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); 281 282 // Insert the new block immediately after the current one. Any existing 283 // fallthrough will be sunk into this new block anyways. 284 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); 285 286 // Splice the tail of instructions into the new block. 287 NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); 288 289 // Copy the necessary succesors (and their probability info) into the new 290 // block. 291 for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) 292 if (IsEdgeSplit || *SI != &UnsplitSucc) 293 NewMBB.copySuccessor(&MBB, SI); 294 // Normalize the probabilities if we didn't end up splitting the edge. 295 if (!IsEdgeSplit) 296 NewMBB.normalizeSuccProbs(); 297 298 // Now replace all of the moved successors in the original block with the new 299 // block. This will merge their probabilities. 300 for (MachineBasicBlock *Succ : NewMBB.successors()) 301 if (Succ != &UnsplitSucc) 302 MBB.replaceSuccessor(Succ, &NewMBB); 303 304 // We should always end up replacing at least one successor. 305 assert(MBB.isSuccessor(&NewMBB) && 306 "Failed to make the new block a successor!"); 307 308 // Now update all the PHIs. 309 for (MachineBasicBlock *Succ : NewMBB.successors()) { 310 for (MachineInstr &MI : *Succ) { 311 if (!MI.isPHI()) 312 break; 313 314 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; 315 OpIdx += 2) { 316 MachineOperand &OpV = MI.getOperand(OpIdx); 317 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); 318 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); 319 if (OpMBB.getMBB() != &MBB) 320 continue; 321 322 // Replace the operand for unsplit successors 323 if (!IsEdgeSplit || Succ != &UnsplitSucc) { 324 OpMBB.setMBB(&NewMBB); 325 326 // We have to continue scanning as there may be multiple entries in 327 // the PHI. 328 continue; 329 } 330 331 // When we have split the edge append a new successor. 332 MI.addOperand(MF, OpV); 333 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); 334 break; 335 } 336 } 337 } 338 339 return NewMBB; 340 } 341 342 static X86::CondCode getCondFromFCMOV(unsigned Opcode) { 343 switch (Opcode) { 344 default: return X86::COND_INVALID; 345 case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80: 346 return X86::COND_BE; 347 case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80: 348 return X86::COND_B; 349 case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80: 350 return X86::COND_E; 351 case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80: 352 return X86::COND_A; 353 case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80: 354 return X86::COND_AE; 355 case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80: 356 return X86::COND_NE; 357 case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80: 358 return X86::COND_NP; 359 case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80: 360 return X86::COND_P; 361 } 362 } 363 364 bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { 365 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() 366 << " **********\n"); 367 368 Subtarget = &MF.getSubtarget<X86Subtarget>(); 369 MRI = &MF.getRegInfo(); 370 TII = Subtarget->getInstrInfo(); 371 TRI = Subtarget->getRegisterInfo(); 372 MDT = &getAnalysis<MachineDominatorTree>(); 373 PromoteRC = &X86::GR8RegClass; 374 375 if (MF.begin() == MF.end()) 376 // Nothing to do for a degenerate empty function... 377 return false; 378 379 // Collect the copies in RPO so that when there are chains where a copy is in 380 // turn copied again we visit the first one first. This ensures we can find 381 // viable locations for testing the original EFLAGS that dominate all the 382 // uses across complex CFGs. 383 SmallVector<MachineInstr *, 4> Copies; 384 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); 385 for (MachineBasicBlock *MBB : RPOT) 386 for (MachineInstr &MI : *MBB) 387 if (MI.getOpcode() == TargetOpcode::COPY && 388 MI.getOperand(0).getReg() == X86::EFLAGS) 389 Copies.push_back(&MI); 390 391 for (MachineInstr *CopyI : Copies) { 392 MachineBasicBlock &MBB = *CopyI->getParent(); 393 394 MachineOperand &VOp = CopyI->getOperand(1); 395 assert(VOp.isReg() && 396 "The input to the copy for EFLAGS should always be a register!"); 397 MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); 398 if (CopyDefI.getOpcode() != TargetOpcode::COPY) { 399 // FIXME: The big likely candidate here are PHI nodes. We could in theory 400 // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard 401 // enough that it is probably better to change every other part of LLVM 402 // to avoid creating them. The issue is that once we have PHIs we won't 403 // know which original EFLAGS value we need to capture with our setCCs 404 // below. The end result will be computing a complete set of setCCs that 405 // we *might* want, computing them in every place where we copy *out* of 406 // EFLAGS and then doing SSA formation on all of them to insert necessary 407 // PHI nodes and consume those here. Then hoping that somehow we DCE the 408 // unnecessary ones. This DCE seems very unlikely to be successful and so 409 // we will almost certainly end up with a glut of dead setCC 410 // instructions. Until we have a motivating test case and fail to avoid 411 // it by changing other parts of LLVM's lowering, we refuse to handle 412 // this complex case here. 413 LLVM_DEBUG( 414 dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; 415 CopyDefI.dump()); 416 report_fatal_error( 417 "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); 418 } 419 420 auto Cleanup = make_scope_exit([&] { 421 // All uses of the EFLAGS copy are now rewritten, kill the copy into 422 // eflags and if dead the copy from. 423 CopyI->eraseFromParent(); 424 if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) 425 CopyDefI.eraseFromParent(); 426 ++NumCopiesEliminated; 427 }); 428 429 MachineOperand &DOp = CopyI->getOperand(0); 430 assert(DOp.isDef() && "Expected register def!"); 431 assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); 432 if (DOp.isDead()) 433 continue; 434 435 MachineBasicBlock *TestMBB = CopyDefI.getParent(); 436 auto TestPos = CopyDefI.getIterator(); 437 DebugLoc TestLoc = CopyDefI.getDebugLoc(); 438 439 LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); 440 441 // Walk up across live-in EFLAGS to find where they were actually def'ed. 442 // 443 // This copy's def may just be part of a region of blocks covered by 444 // a single def of EFLAGS and we want to find the top of that region where 445 // possible. 446 // 447 // This is essentially a search for a *candidate* reaching definition 448 // location. We don't need to ever find the actual reaching definition here, 449 // but we want to walk up the dominator tree to find the highest point which 450 // would be viable for such a definition. 451 auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin, 452 MachineBasicBlock::iterator End) { 453 // Scan backwards as we expect these to be relatively short and often find 454 // a clobber near the end. 455 return llvm::any_of( 456 llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) { 457 // Flag any instruction (other than the copy we are 458 // currently rewriting) that defs EFLAGS. 459 return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS); 460 }); 461 }; 462 auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB, 463 MachineBasicBlock *EndMBB) { 464 assert(MDT->dominates(BeginMBB, EndMBB) && 465 "Only support paths down the dominator tree!"); 466 SmallPtrSet<MachineBasicBlock *, 4> Visited; 467 SmallVector<MachineBasicBlock *, 4> Worklist; 468 // We terminate at the beginning. No need to scan it. 469 Visited.insert(BeginMBB); 470 Worklist.push_back(EndMBB); 471 do { 472 auto *MBB = Worklist.pop_back_val(); 473 for (auto *PredMBB : MBB->predecessors()) { 474 if (!Visited.insert(PredMBB).second) 475 continue; 476 if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end())) 477 return true; 478 // Enqueue this block to walk its predecessors. 479 Worklist.push_back(PredMBB); 480 } 481 } while (!Worklist.empty()); 482 // No clobber found along a path from the begin to end. 483 return false; 484 }; 485 while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() && 486 !HasEFLAGSClobber(TestMBB->begin(), TestPos)) { 487 // Find the nearest common dominator of the predecessors, as 488 // that will be the best candidate to hoist into. 489 MachineBasicBlock *HoistMBB = 490 std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(), 491 *TestMBB->pred_begin(), 492 [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) { 493 return MDT->findNearestCommonDominator(LHS, RHS); 494 }); 495 496 // Now we need to scan all predecessors that may be reached along paths to 497 // the hoist block. A clobber anywhere in any of these blocks the hoist. 498 // Note that this even handles loops because we require *no* clobbers. 499 if (HasEFLAGSClobberPath(HoistMBB, TestMBB)) 500 break; 501 502 // We also need the terminators to not sneakily clobber flags. 503 if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(), 504 HoistMBB->instr_end())) 505 break; 506 507 // We found a viable location, hoist our test position to it. 508 TestMBB = HoistMBB; 509 TestPos = TestMBB->getFirstTerminator()->getIterator(); 510 // Clear the debug location as it would just be confusing after hoisting. 511 TestLoc = DebugLoc(); 512 } 513 LLVM_DEBUG({ 514 auto DefIt = llvm::find_if( 515 llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)), 516 [&](MachineInstr &MI) { 517 return MI.findRegisterDefOperand(X86::EFLAGS); 518 }); 519 if (DefIt.base() != TestMBB->instr_begin()) { 520 dbgs() << " Using EFLAGS defined by: "; 521 DefIt->dump(); 522 } else { 523 dbgs() << " Using live-in flags for BB:\n"; 524 TestMBB->dump(); 525 } 526 }); 527 528 // While rewriting uses, we buffer jumps and rewrite them in a second pass 529 // because doing so will perturb the CFG that we are walking to find the 530 // uses in the first place. 531 SmallVector<MachineInstr *, 4> JmpIs; 532 533 // Gather the condition flags that have already been preserved in 534 // registers. We do this from scratch each time as we expect there to be 535 // very few of them and we expect to not revisit the same copy definition 536 // many times. If either of those change sufficiently we could build a map 537 // of these up front instead. 538 CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos); 539 540 // Collect the basic blocks we need to scan. Typically this will just be 541 // a single basic block but we may have to scan multiple blocks if the 542 // EFLAGS copy lives into successors. 543 SmallVector<MachineBasicBlock *, 2> Blocks; 544 SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks; 545 Blocks.push_back(&MBB); 546 547 do { 548 MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); 549 550 // Track when if/when we find a kill of the flags in this block. 551 bool FlagsKilled = false; 552 553 // In most cases, we walk from the beginning to the end of the block. But 554 // when the block is the same block as the copy is from, we will visit it 555 // twice. The first time we start from the copy and go to the end. The 556 // second time we start from the beginning and go to the copy. This lets 557 // us handle copies inside of cycles. 558 // FIXME: This loop is *super* confusing. This is at least in part 559 // a symptom of all of this routine needing to be refactored into 560 // documentable components. Once done, there may be a better way to write 561 // this loop. 562 for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB)) 563 ? std::next(CopyI->getIterator()) 564 : UseMBB.instr_begin(), 565 MIE = UseMBB.instr_end(); 566 MII != MIE;) { 567 MachineInstr &MI = *MII++; 568 // If we are in the original copy block and encounter either the copy 569 // def or the copy itself, break so that we don't re-process any part of 570 // the block or process the instructions in the range that was copied 571 // over. 572 if (&MI == CopyI || &MI == &CopyDefI) { 573 assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) && 574 "Should only encounter these on the second pass over the " 575 "original block."); 576 break; 577 } 578 579 MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); 580 if (!FlagUse) { 581 if (MI.findRegisterDefOperand(X86::EFLAGS)) { 582 // If EFLAGS are defined, it's as-if they were killed. We can stop 583 // scanning here. 584 // 585 // NB!!! Many instructions only modify some flags. LLVM currently 586 // models this as clobbering all flags, but if that ever changes 587 // this will need to be carefully updated to handle that more 588 // complex logic. 589 FlagsKilled = true; 590 break; 591 } 592 continue; 593 } 594 595 LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump()); 596 597 // Check the kill flag before we rewrite as that may change it. 598 if (FlagUse->isKill()) 599 FlagsKilled = true; 600 601 // Once we encounter a branch, the rest of the instructions must also be 602 // branches. We can't rewrite in place here, so we handle them below. 603 // 604 // Note that we don't have to handle tail calls here, even conditional 605 // tail calls, as those are not introduced into the X86 MI until post-RA 606 // branch folding or black placement. As a consequence, we get to deal 607 // with the simpler formulation of conditional branches followed by tail 608 // calls. 609 if (X86::getCondFromBranch(MI) != X86::COND_INVALID) { 610 auto JmpIt = MI.getIterator(); 611 do { 612 JmpIs.push_back(&*JmpIt); 613 ++JmpIt; 614 } while (JmpIt != UseMBB.instr_end() && 615 X86::getCondFromBranch(*JmpIt) != 616 X86::COND_INVALID); 617 break; 618 } 619 620 // Otherwise we can just rewrite in-place. 621 if (X86::getCondFromCMov(MI) != X86::COND_INVALID) { 622 rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 623 } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) { 624 rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 625 } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) { 626 rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); 627 } else if (MI.getOpcode() == TargetOpcode::COPY) { 628 rewriteCopy(MI, *FlagUse, CopyDefI); 629 } else { 630 // We assume all other instructions that use flags also def them. 631 assert(MI.findRegisterDefOperand(X86::EFLAGS) && 632 "Expected a def of EFLAGS for this instruction!"); 633 634 // NB!!! Several arithmetic instructions only *partially* update 635 // flags. Theoretically, we could generate MI code sequences that 636 // would rely on this fact and observe different flags independently. 637 // But currently LLVM models all of these instructions as clobbering 638 // all the flags in an undef way. We rely on that to simplify the 639 // logic. 640 FlagsKilled = true; 641 642 // Generically handle remaining uses as arithmetic instructions. 643 rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse, 644 CondRegs); 645 } 646 647 // If this was the last use of the flags, we're done. 648 if (FlagsKilled) 649 break; 650 } 651 652 // If the flags were killed, we're done with this block. 653 if (FlagsKilled) 654 continue; 655 656 // Otherwise we need to scan successors for ones where the flags live-in 657 // and queue those up for processing. 658 for (MachineBasicBlock *SuccMBB : UseMBB.successors()) 659 if (SuccMBB->isLiveIn(X86::EFLAGS) && 660 VisitedBlocks.insert(SuccMBB).second) { 661 // We currently don't do any PHI insertion and so we require that the 662 // test basic block dominates all of the use basic blocks. Further, we 663 // can't have a cycle from the test block back to itself as that would 664 // create a cycle requiring a PHI to break it. 665 // 666 // We could in theory do PHI insertion here if it becomes useful by 667 // just taking undef values in along every edge that we don't trace 668 // this EFLAGS copy along. This isn't as bad as fully general PHI 669 // insertion, but still seems like a great deal of complexity. 670 // 671 // Because it is theoretically possible that some earlier MI pass or 672 // other lowering transformation could induce this to happen, we do 673 // a hard check even in non-debug builds here. 674 if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) { 675 LLVM_DEBUG({ 676 dbgs() 677 << "ERROR: Encountered use that is not dominated by our test " 678 "basic block! Rewriting this would require inserting PHI " 679 "nodes to track the flag state across the CFG.\n\nTest " 680 "block:\n"; 681 TestMBB->dump(); 682 dbgs() << "Use block:\n"; 683 SuccMBB->dump(); 684 }); 685 report_fatal_error( 686 "Cannot lower EFLAGS copy when original copy def " 687 "does not dominate all uses."); 688 } 689 690 Blocks.push_back(SuccMBB); 691 692 // After this, EFLAGS will be recreated before each use. 693 SuccMBB->removeLiveIn(X86::EFLAGS); 694 } 695 } while (!Blocks.empty()); 696 697 // Now rewrite the jumps that use the flags. These we handle specially 698 // because if there are multiple jumps in a single basic block we'll have 699 // to do surgery on the CFG. 700 MachineBasicBlock *LastJmpMBB = nullptr; 701 for (MachineInstr *JmpI : JmpIs) { 702 // Past the first jump within a basic block we need to split the blocks 703 // apart. 704 if (JmpI->getParent() == LastJmpMBB) 705 splitBlock(*JmpI->getParent(), *JmpI, *TII); 706 else 707 LastJmpMBB = JmpI->getParent(); 708 709 rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs); 710 } 711 712 // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if 713 // the copy's def operand is itself a kill. 714 } 715 716 #ifndef NDEBUG 717 for (MachineBasicBlock &MBB : MF) 718 for (MachineInstr &MI : MBB) 719 if (MI.getOpcode() == TargetOpcode::COPY && 720 (MI.getOperand(0).getReg() == X86::EFLAGS || 721 MI.getOperand(1).getReg() == X86::EFLAGS)) { 722 LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; 723 MI.dump()); 724 llvm_unreachable("Unlowered EFLAGS copy!"); 725 } 726 #endif 727 728 return true; 729 } 730 731 /// Collect any conditions that have already been set in registers so that we 732 /// can re-use them rather than adding duplicates. 733 CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( 734 MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) { 735 CondRegArray CondRegs = {}; 736 737 // Scan backwards across the range of instructions with live EFLAGS. 738 for (MachineInstr &MI : 739 llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { 740 X86::CondCode Cond = X86::getCondFromSETCC(MI); 741 if (Cond != X86::COND_INVALID && !MI.mayStore() && 742 MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) { 743 assert(MI.getOperand(0).isDef() && 744 "A non-storing SETcc should always define a register!"); 745 CondRegs[Cond] = MI.getOperand(0).getReg(); 746 } 747 748 // Stop scanning when we see the first definition of the EFLAGS as prior to 749 // this we would potentially capture the wrong flag state. 750 if (MI.findRegisterDefOperand(X86::EFLAGS)) 751 break; 752 } 753 return CondRegs; 754 } 755 756 Register X86FlagsCopyLoweringPass::promoteCondToReg( 757 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 758 const DebugLoc &TestLoc, X86::CondCode Cond) { 759 Register Reg = MRI->createVirtualRegister(PromoteRC); 760 auto SetI = BuildMI(TestMBB, TestPos, TestLoc, 761 TII->get(X86::SETCCr), Reg).addImm(Cond); 762 (void)SetI; 763 LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump()); 764 ++NumSetCCsInserted; 765 return Reg; 766 } 767 768 std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg( 769 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 770 const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { 771 unsigned &CondReg = CondRegs[Cond]; 772 unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; 773 if (!CondReg && !InvCondReg) 774 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 775 776 if (CondReg) 777 return {CondReg, false}; 778 else 779 return {InvCondReg, true}; 780 } 781 782 void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, 783 MachineBasicBlock::iterator Pos, 784 const DebugLoc &Loc, unsigned Reg) { 785 auto TestI = 786 BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); 787 (void)TestI; 788 LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump()); 789 ++NumTestsInserted; 790 } 791 792 void X86FlagsCopyLoweringPass::rewriteArithmetic( 793 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 794 const DebugLoc &TestLoc, MachineInstr &MI, MachineOperand &FlagUse, 795 CondRegArray &CondRegs) { 796 // Arithmetic is either reading CF or OF. Figure out which condition we need 797 // to preserve in a register. 798 X86::CondCode Cond = X86::COND_INVALID; 799 800 // The addend to use to reset CF or OF when added to the flag value. 801 int Addend = 0; 802 803 switch (getMnemonicFromOpcode(MI.getOpcode())) { 804 case FlagArithMnemonic::ADC: 805 case FlagArithMnemonic::ADCX: 806 case FlagArithMnemonic::RCL: 807 case FlagArithMnemonic::RCR: 808 case FlagArithMnemonic::SBB: 809 case FlagArithMnemonic::SETB: 810 Cond = X86::COND_B; // CF == 1 811 // Set up an addend that when one is added will need a carry due to not 812 // having a higher bit available. 813 Addend = 255; 814 break; 815 816 case FlagArithMnemonic::ADOX: 817 Cond = X86::COND_O; // OF == 1 818 // Set up an addend that when one is added will turn from positive to 819 // negative and thus overflow in the signed domain. 820 Addend = 127; 821 break; 822 } 823 824 // Now get a register that contains the value of the flag input to the 825 // arithmetic. We require exactly this flag to simplify the arithmetic 826 // required to materialize it back into the flag. 827 unsigned &CondReg = CondRegs[Cond]; 828 if (!CondReg) 829 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 830 831 MachineBasicBlock &MBB = *MI.getParent(); 832 833 // Insert an instruction that will set the flag back to the desired value. 834 Register TmpReg = MRI->createVirtualRegister(PromoteRC); 835 auto AddI = 836 BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) 837 .addDef(TmpReg, RegState::Dead) 838 .addReg(CondReg) 839 .addImm(Addend); 840 (void)AddI; 841 LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump()); 842 ++NumAddsInserted; 843 FlagUse.setIsKill(true); 844 } 845 846 void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, 847 MachineBasicBlock::iterator TestPos, 848 const DebugLoc &TestLoc, 849 MachineInstr &CMovI, 850 MachineOperand &FlagUse, 851 CondRegArray &CondRegs) { 852 // First get the register containing this specific condition. 853 X86::CondCode Cond = X86::getCondFromCMov(CMovI); 854 unsigned CondReg; 855 bool Inverted; 856 std::tie(CondReg, Inverted) = 857 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 858 859 MachineBasicBlock &MBB = *CMovI.getParent(); 860 861 // Insert a direct test of the saved register. 862 insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); 863 864 // Rewrite the CMov to use the !ZF flag from the test, and then kill its use 865 // of the flags afterward. 866 CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1) 867 .setImm(Inverted ? X86::COND_E : X86::COND_NE); 868 FlagUse.setIsKill(true); 869 LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); 870 } 871 872 void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB, 873 MachineBasicBlock::iterator TestPos, 874 const DebugLoc &TestLoc, 875 MachineInstr &CMovI, 876 MachineOperand &FlagUse, 877 CondRegArray &CondRegs) { 878 // First get the register containing this specific condition. 879 X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode()); 880 unsigned CondReg; 881 bool Inverted; 882 std::tie(CondReg, Inverted) = 883 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 884 885 MachineBasicBlock &MBB = *CMovI.getParent(); 886 887 // Insert a direct test of the saved register. 888 insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); 889 890 auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) { 891 switch (Opcode) { 892 default: llvm_unreachable("Unexpected opcode!"); 893 case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32: 894 case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32: 895 case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32: 896 case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32: 897 return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32; 898 case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64: 899 case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64: 900 case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64: 901 case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64: 902 return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64; 903 case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80: 904 case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80: 905 case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80: 906 case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80: 907 return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80; 908 } 909 }; 910 911 // Rewrite the CMov to use the !ZF flag from the test. 912 CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted))); 913 FlagUse.setIsKill(true); 914 LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump()); 915 } 916 917 void X86FlagsCopyLoweringPass::rewriteCondJmp( 918 MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, 919 const DebugLoc &TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { 920 // First get the register containing this specific condition. 921 X86::CondCode Cond = X86::getCondFromBranch(JmpI); 922 unsigned CondReg; 923 bool Inverted; 924 std::tie(CondReg, Inverted) = 925 getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); 926 927 MachineBasicBlock &JmpMBB = *JmpI.getParent(); 928 929 // Insert a direct test of the saved register. 930 insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); 931 932 // Rewrite the jump to use the !ZF flag from the test, and kill its use of 933 // flags afterward. 934 JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE); 935 JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); 936 LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); 937 } 938 939 void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, 940 MachineOperand &FlagUse, 941 MachineInstr &CopyDefI) { 942 // Just replace this copy with the original copy def. 943 MRI->replaceRegWith(MI.getOperand(0).getReg(), 944 CopyDefI.getOperand(0).getReg()); 945 MI.eraseFromParent(); 946 } 947 948 void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, 949 MachineBasicBlock::iterator TestPos, 950 const DebugLoc &TestLoc, 951 MachineInstr &SetCCI, 952 MachineOperand &FlagUse, 953 CondRegArray &CondRegs) { 954 X86::CondCode Cond = X86::getCondFromSETCC(SetCCI); 955 // Note that we can't usefully rewrite this to the inverse without complex 956 // analysis of the users of the setCC. Largely we rely on duplicates which 957 // could have been avoided already being avoided here. 958 unsigned &CondReg = CondRegs[Cond]; 959 if (!CondReg) 960 CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); 961 962 // Rewriting a register def is trivial: we just replace the register and 963 // remove the setcc. 964 if (!SetCCI.mayStore()) { 965 assert(SetCCI.getOperand(0).isReg() && 966 "Cannot have a non-register defined operand to SETcc!"); 967 Register OldReg = SetCCI.getOperand(0).getReg(); 968 // Drop Kill flags on the old register before replacing. CondReg may have 969 // a longer live range. 970 MRI->clearKillFlags(OldReg); 971 MRI->replaceRegWith(OldReg, CondReg); 972 SetCCI.eraseFromParent(); 973 return; 974 } 975 976 // Otherwise, we need to emit a store. 977 auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), 978 SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); 979 // Copy the address operands. 980 for (int i = 0; i < X86::AddrNumOperands; ++i) 981 MIB.add(SetCCI.getOperand(i)); 982 983 MIB.addReg(CondReg); 984 985 MIB.setMemRefs(SetCCI.memoperands()); 986 987 SetCCI.eraseFromParent(); 988 } 989