//===- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Lowers COPY nodes of EFLAGS by directly extracting and preserving
/// individual flag bits.
///
/// We have to do this by carefully analyzing and rewriting the usage of the
/// copied EFLAGS register because there is no general way to rematerialize the
/// entire EFLAGS register safely and efficiently. Using `popf` both forces
/// dynamic stack adjustment and can create correctness issues due to IF, TF,
/// and other non-status flags being overwritten. Sequences involving SAHF
/// don't work on all x86 processors and are often quite slow compared to
/// directly testing a single status flag preserved in its own GPR.
///
//===----------------------------------------------------------------------===//
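
// A rough sketch of the rewrite this pass performs (illustrative MIR only, not
// taken from an actual test case):
//
//   %flags:gr32 = COPY $eflags            ; capture the flags into a vreg
//   ...                                   ; something clobbers $eflags here
//   $eflags = COPY %flags                 ; restore the flags
//   JCC_1 %bb.target, 4 /*COND_E*/, implicit $eflags
//
// becomes, roughly:
//
//   %zf:gr8 = SETCCr 4 /*COND_E*/, implicit $eflags  ; save ZF while it is live
//   ...
//   TEST8rr %zf, %zf, implicit-def $eflags           ; re-create a usable flag
//   JCC_1 %bb.target, 5 /*COND_NE*/, implicit $eflags
//
// Each condition that is actually consumed is captured into a GPR with SETcc
// while the original EFLAGS value is still live, and every use of the copied
// flags is rewritten to test the saved GPR instead.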

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define PASS_KEY "x86-flags-copy-lowering"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");

namespace {

// Convenient array type for storing registers associated with each condition.
using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;

class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
  X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86Subtarget *Subtarget = nullptr;
  const X86InstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  const TargetRegisterClass *PromoteRC = nullptr;
  MachineDominatorTree *MDT = nullptr;

  CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator CopyDefI);

  Register promoteCondToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator TestPos,
                            const DebugLoc &TestLoc, X86::CondCode Cond);
  std::pair<unsigned, bool> getCondOrInverseInReg(
      MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
      const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs);
  void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
                  const DebugLoc &Loc, unsigned Reg);

  void rewriteArithmetic(MachineBasicBlock &TestMBB,
                         MachineBasicBlock::iterator TestPos,
                         const DebugLoc &TestLoc, MachineInstr &MI,
                         MachineOperand &FlagUse, CondRegArray &CondRegs);
  void rewriteCMov(MachineBasicBlock &TestMBB,
                   MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
                   MachineInstr &CMovI, MachineOperand &FlagUse,
                   CondRegArray &CondRegs);
  void rewriteFCMov(MachineBasicBlock &TestMBB,
                    MachineBasicBlock::iterator TestPos,
                    const DebugLoc &TestLoc, MachineInstr &CMovI,
                    MachineOperand &FlagUse, CondRegArray &CondRegs);
  void rewriteCondJmp(MachineBasicBlock &TestMBB,
                      MachineBasicBlock::iterator TestPos,
                      const DebugLoc &TestLoc, MachineInstr &JmpI,
                      CondRegArray &CondRegs);
  void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
                   MachineInstr &CopyDefI);
  void rewriteSetCC(MachineBasicBlock &TestMBB,
                    MachineBasicBlock::iterator TestPos,
                    const DebugLoc &TestLoc, MachineInstr &SetCCI,
                    MachineOperand &FlagUse, CondRegArray &CondRegs);
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                      "X86 EFLAGS copy lowering", false, false)
INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                    "X86 EFLAGS copy lowering", false, false)

FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
  return new X86FlagsCopyLoweringPass();
}

char X86FlagsCopyLoweringPass::ID = 0;

void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

namespace {
/// An enumeration of the arithmetic instruction mnemonics which have
/// interesting flag semantics.
///
/// We can map instruction opcodes into these mnemonics to make it easy to
/// dispatch with specific functionality.
enum class FlagArithMnemonic {
  ADC,
  RCL,
  RCR,
  SBB,
  SETB,
};
} // namespace

static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    report_fatal_error("No support for lowering a copy into EFLAGS when used "
                       "by this instruction!");

#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
  case X86::MNEMONIC##8##SUFFIX:                                               \
  case X86::MNEMONIC##16##SUFFIX:                                              \
  case X86::MNEMONIC##32##SUFFIX:                                              \
  case X86::MNEMONIC##64##SUFFIX:

#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
  case X86::MNEMONIC##8ri:                                                     \
  case X86::MNEMONIC##16ri8:                                                   \
  case X86::MNEMONIC##32ri8:                                                   \
  case X86::MNEMONIC##64ri8:                                                   \
  case X86::MNEMONIC##16ri:                                                    \
  case X86::MNEMONIC##32ri:                                                    \
  case X86::MNEMONIC##64ri32:                                                  \
  case X86::MNEMONIC##8mi:                                                     \
  case X86::MNEMONIC##16mi8:                                                   \
  case X86::MNEMONIC##32mi8:                                                   \
  case X86::MNEMONIC##64mi8:                                                   \
  case X86::MNEMONIC##16mi:                                                    \
  case X86::MNEMONIC##32mi:                                                    \
  case X86::MNEMONIC##64mi32:                                                  \
  case X86::MNEMONIC##8i8:                                                     \
  case X86::MNEMONIC##16i16:                                                   \
  case X86::MNEMONIC##32i32:                                                   \
  case X86::MNEMONIC##64i32:

    LLVM_EXPAND_ADC_SBB_INSTR(ADC)
    return FlagArithMnemonic::ADC;

    LLVM_EXPAND_ADC_SBB_INSTR(SBB)
    return FlagArithMnemonic::SBB;

#undef LLVM_EXPAND_ADC_SBB_INSTR

    LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCL, r1)
    LLVM_EXPAND_INSTR_SIZES(RCL, ri)
    return FlagArithMnemonic::RCL;

    LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCR, r1)
    LLVM_EXPAND_INSTR_SIZES(RCR, ri)
    return FlagArithMnemonic::RCR;

#undef LLVM_EXPAND_INSTR_SIZES

  case X86::SETB_C32r:
  case X86::SETB_C64r:
    return FlagArithMnemonic::SETB;
  }
}

static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
                                     MachineInstr &SplitI,
                                     const X86InstrInfo &TII) {
  MachineFunction &MF = *MBB.getParent();

  assert(SplitI.getParent() == &MBB &&
         "Split instruction must be in the split block!");
  assert(SplitI.isBranch() &&
         "Only designed to split a tail of branch instructions!");
  assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID &&
         "Must split on an actual jCC instruction!");

  // Dig out the previous instruction to the split point.
  MachineInstr &PrevI = *std::prev(SplitI.getIterator());
  assert(PrevI.isBranch() && "Must split after a branch!");
  assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID &&
         "Must split after an actual jCC instruction!");
  assert(!std::prev(PrevI.getIterator())->isTerminator() &&
         "Must only have this one terminator prior to the split!");

  // Grab the one successor edge that will stay in `MBB`.
  MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();

  // Analyze the original block to see if we are actually splitting an edge
  // into two edges. This can happen when we have multiple conditional jumps to
  // the same successor.
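  // For example (illustrative sketch): if the terminators are
  //   JCC_1 %bb.a, 4; JCC_1 %bb.a, 2; JMP_1 %bb.b
  // and we split before the second JCC, both the original block and the new
  // block end up branching to %bb.a, so the single edge to %bb.a becomes two
  // edges and %bb.a's PHIs need an extra incoming entry for the new block.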
  bool IsEdgeSplit =
      std::any_of(SplitI.getIterator(), MBB.instr_end(),
                  [&](MachineInstr &MI) {
                    assert(MI.isTerminator() &&
                           "Should only have spliced terminators!");
                    return llvm::any_of(
                        MI.operands(), [&](MachineOperand &MOp) {
                          return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
                        });
                  }) ||
      MBB.getFallThrough() == &UnsplitSucc;

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // Insert the new block immediately after the current one. Any existing
  // fallthrough will be sunk into this new block anyways.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Splice the tail of instructions into the new block.
  NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());

  // Copy the necessary successors (and their probability info) into the new
  // block.
  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
    if (IsEdgeSplit || *SI != &UnsplitSucc)
      NewMBB.copySuccessor(&MBB, SI);
  // Normalize the probabilities if we didn't end up splitting the edge.
  if (!IsEdgeSplit)
    NewMBB.normalizeSuccProbs();

  // Now replace all of the moved successors in the original block with the new
  // block. This will merge their probabilities.
  for (MachineBasicBlock *Succ : NewMBB.successors())
    if (Succ != &UnsplitSucc)
      MBB.replaceSuccessor(Succ, &NewMBB);

  // We should always end up replacing at least one successor.
  assert(MBB.isSuccessor(&NewMBB) &&
         "Failed to make the new block a successor!");

  // Now update all the PHIs.
  for (MachineBasicBlock *Succ : NewMBB.successors()) {
    for (MachineInstr &MI : *Succ) {
      if (!MI.isPHI())
        break;

      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2) {
        MachineOperand &OpV = MI.getOperand(OpIdx);
        MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
        assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
        if (OpMBB.getMBB() != &MBB)
          continue;

        // Replace the operand for unsplit successors
        if (!IsEdgeSplit || Succ != &UnsplitSucc) {
          OpMBB.setMBB(&NewMBB);

          // We have to continue scanning as there may be multiple entries in
          // the PHI.
          continue;
        }

        // When we have split the edge, append a new successor entry.
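        // For example (illustrative sketch), a PHI in the shared successor
        // such as
        //   %p = PHI %v, %bb.orig, ...
        // becomes
        //   %p = PHI %v, %bb.orig, ..., %v, %bb.new
        // reusing the same incoming value along the freshly split edge.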
        MI.addOperand(MF, OpV);
        MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
        break;
      }
    }
  }

  return NewMBB;
}

static X86::CondCode getCondFromFCMOV(unsigned Opcode) {
  switch (Opcode) {
  default: return X86::COND_INVALID;
  case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80:
    return X86::COND_BE;
  case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80:
    return X86::COND_B;
  case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80:
    return X86::COND_E;
  case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80:
    return X86::COND_A;
  case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80:
    return X86::COND_AE;
  case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80:
    return X86::COND_NE;
  case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80:
    return X86::COND_NP;
  case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80:
    return X86::COND_P;
  }
}

bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  Subtarget = &MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  PromoteRC = &X86::GR8RegClass;

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // Collect the copies in RPO so that when there are chains where a copy is in
  // turn copied again we visit the first one first. This ensures we can find
  // viable locations for testing the original EFLAGS that dominate all the
  // uses across complex CFGs.
  SmallVector<MachineInstr *, 4> Copies;
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT)
    for (MachineInstr &MI : *MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          MI.getOperand(0).getReg() == X86::EFLAGS)
        Copies.push_back(&MI);

  for (MachineInstr *CopyI : Copies) {
    MachineBasicBlock &MBB = *CopyI->getParent();

    MachineOperand &VOp = CopyI->getOperand(1);
    assert(VOp.isReg() &&
           "The input to the copy for EFLAGS should always be a register!");
    MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
    if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
      // FIXME: The big likely candidates here are PHI nodes. We could in
      // theory handle PHI nodes, but it gets really, really hard. Insanely
      // hard. Hard enough that it is probably better to change every other
      // part of LLVM to avoid creating them. The issue is that once we have
      // PHIs we won't know which original EFLAGS value we need to capture
      // with our setCCs below. The end result will be computing a complete
      // set of setCCs that we *might* want, computing them in every place
      // where we copy *out* of EFLAGS and then doing SSA formation on all of
      // them to insert necessary PHI nodes and consume those here. Then
      // hoping that somehow we DCE the unnecessary ones. This DCE seems very
      // unlikely to be successful and so we will almost certainly end up with
      // a glut of dead setCC instructions. Until we have a motivating test
      // case and fail to avoid it by changing other parts of LLVM's lowering,
      // we refuse to handle this complex case here.
      LLVM_DEBUG(
          dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
          CopyDefI.dump());
      report_fatal_error(
          "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
    }

    auto Cleanup = make_scope_exit([&] {
      // All uses of the EFLAGS copy are now rewritten, kill the copy into
      // eflags and, if dead, the copy from eflags.
      CopyI->eraseFromParent();
      if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
        CopyDefI.eraseFromParent();
      ++NumCopiesEliminated;
    });

    MachineOperand &DOp = CopyI->getOperand(0);
    assert(DOp.isDef() && "Expected register def!");
    assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
    if (DOp.isDead())
      continue;

    MachineBasicBlock *TestMBB = CopyDefI.getParent();
    auto TestPos = CopyDefI.getIterator();
    DebugLoc TestLoc = CopyDefI.getDebugLoc();

    LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());

    // Walk up across live-in EFLAGS to find where they were actually def'ed.
    //
    // This copy's def may just be part of a region of blocks covered by
    // a single def of EFLAGS and we want to find the top of that region where
    // possible.
    //
    // This is essentially a search for a *candidate* reaching definition
    // location. We don't need to ever find the actual reaching definition
    // here, but we want to walk up the dominator tree to find the highest
    // point which would be viable for such a definition.
    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
                                MachineBasicBlock::iterator End) {
      // Scan backwards as we expect these to be relatively short and often
      // find a clobber near the end.
      return llvm::any_of(
          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
            // Flag any instruction (other than the copy we are
            // currently rewriting) that defs EFLAGS.
            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
          });
    };
    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
                                    MachineBasicBlock *EndMBB) {
      assert(MDT->dominates(BeginMBB, EndMBB) &&
             "Only support paths down the dominator tree!");
      SmallPtrSet<MachineBasicBlock *, 4> Visited;
      SmallVector<MachineBasicBlock *, 4> Worklist;
      // We terminate at the beginning. No need to scan it.
      Visited.insert(BeginMBB);
      Worklist.push_back(EndMBB);
      do {
        auto *MBB = Worklist.pop_back_val();
        for (auto *PredMBB : MBB->predecessors()) {
          if (!Visited.insert(PredMBB).second)
            continue;
          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
            return true;
          // Enqueue this block to walk its predecessors.
          Worklist.push_back(PredMBB);
        }
      } while (!Worklist.empty());
      // No clobber found along a path from the begin to end.
      return false;
    };
    while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
           !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
      // Find the nearest common dominator of the predecessors, as
      // that will be the best candidate to hoist into.
      MachineBasicBlock *HoistMBB =
          std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
                          *TestMBB->pred_begin(),
                          [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
                            return MDT->findNearestCommonDominator(LHS, RHS);
                          });

      // Now we need to scan all predecessors that may be reached along paths
      // to the hoist block. A clobber anywhere in any of these blocks defeats
      // the hoist.
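      // For example (sketch): if some predecessor reachable between HoistMBB
      // and TestMBB contains an instruction with `implicit-def $eflags` (say
      // a SUB feeding a different branch), hoisting the SETccs up to HoistMBB
      // would capture that later value rather than the flags this copy needs.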
      // Note that this even handles loops because we require *no* clobbers.
      if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
        break;

      // We also need the terminators to not sneakily clobber flags.
      if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
                           HoistMBB->instr_end()))
        break;

      // We found a viable location, hoist our test position to it.
      TestMBB = HoistMBB;
      TestPos = TestMBB->getFirstTerminator()->getIterator();
      // Clear the debug location as it would just be confusing after hoisting.
      TestLoc = DebugLoc();
    }
    LLVM_DEBUG({
      auto DefIt = llvm::find_if(
          llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
          [&](MachineInstr &MI) {
            return MI.findRegisterDefOperand(X86::EFLAGS);
          });
      if (DefIt.base() != TestMBB->instr_begin()) {
        dbgs() << " Using EFLAGS defined by: ";
        DefIt->dump();
      } else {
        dbgs() << " Using live-in flags for BB:\n";
        TestMBB->dump();
      }
    });

    // While rewriting uses, we buffer jumps and rewrite them in a second pass
    // because doing so will perturb the CFG that we are walking to find the
    // uses in the first place.
    SmallVector<MachineInstr *, 4> JmpIs;

    // Gather the condition flags that have already been preserved in
    // registers. We do this from scratch each time as we expect there to be
    // very few of them and we expect to not revisit the same copy definition
    // many times. If either of those change sufficiently we could build a map
    // of these up front instead.
    CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);

    // Collect the basic blocks we need to scan. Typically this will just be
    // a single basic block but we may have to scan multiple blocks if the
    // EFLAGS copy lives into successors.
    SmallVector<MachineBasicBlock *, 2> Blocks;
    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
    Blocks.push_back(&MBB);

    do {
      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

      // Track if/when we find a kill of the flags in this block.
      bool FlagsKilled = false;

      // In most cases, we walk from the beginning to the end of the block. But
      // when the block is the same block as the copy is from, we will visit it
      // twice. The first time we start from the copy and go to the end. The
      // second time we start from the beginning and go to the copy. This lets
      // us handle copies inside of cycles.
      // FIXME: This loop is *super* confusing. This is at least in part
      // a symptom of all of this routine needing to be refactored into
      // documentable components. Once done, there may be a better way to write
      // this loop.
      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
                          ? std::next(CopyI->getIterator())
                          : UseMBB.instr_begin(),
                MIE = UseMBB.instr_end();
           MII != MIE;) {
        MachineInstr &MI = *MII++;
        // If we are in the original copy block and encounter either the copy
        // def or the copy itself, break so that we don't re-process any part
        // of the block or process the instructions in the range that was
        // copied over.
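        // For example (sketch): if the copy block is a loop header that is
        // reached again from its own latch, the second visit walks from the
        // block's start and stops right here once it reaches CopyDefI/CopyI,
        // so no instruction is rewritten twice.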
        if (&MI == CopyI || &MI == &CopyDefI) {
          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
                 "Should only encounter these on the second pass over the "
                 "original block.");
          break;
        }

        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
        if (!FlagUse) {
          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
            // If EFLAGS are defined, it's as-if they were killed. We can stop
            // scanning here.
            //
            // NB!!! Many instructions only modify some flags. LLVM currently
            // models this as clobbering all flags, but if that ever changes
            // this will need to be carefully updated to handle that more
            // complex logic.
            FlagsKilled = true;
            break;
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());

        // Check the kill flag before we rewrite as that may change it.
        if (FlagUse->isKill())
          FlagsKilled = true;

        // Once we encounter a branch, the rest of the instructions must also
        // be branches. We can't rewrite in place here, so we handle them
        // below.
        //
        // Note that we don't have to handle tail calls here, even conditional
        // tail calls, as those are not introduced into the X86 MI until
        // post-RA branch folding or block placement. As a consequence, we get
        // to deal with the simpler formulation of conditional branches
        // followed by tail calls.
        if (X86::getCondFromBranch(MI) != X86::COND_INVALID) {
          auto JmpIt = MI.getIterator();
          do {
            JmpIs.push_back(&*JmpIt);
            ++JmpIt;
          } while (JmpIt != UseMBB.instr_end() &&
                   X86::getCondFromBranch(*JmpIt) != X86::COND_INVALID);
          break;
        }

        // Otherwise we can just rewrite in-place.
        if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
          rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
          rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
          rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (MI.getOpcode() == TargetOpcode::COPY) {
          rewriteCopy(MI, *FlagUse, CopyDefI);
        } else {
          // We assume all other instructions that use flags also def them.
          assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                 "Expected a def of EFLAGS for this instruction!");

          // NB!!! Several arithmetic instructions only *partially* update
          // flags. Theoretically, we could generate MI code sequences that
          // would rely on this fact and observe different flags independently.
          // But currently LLVM models all of these instructions as clobbering
          // all the flags in an undef way. We rely on that to simplify the
          // logic.
          FlagsKilled = true;

          // Generically handle remaining uses as arithmetic instructions.
          rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                            CondRegs);
        }

        // If this was the last use of the flags, we're done.
        if (FlagsKilled)
          break;
      }

      // If the flags were killed, we're done with this block.
      if (FlagsKilled)
        continue;

      // Otherwise we need to scan successors for ones where the flags live-in
      // and queue those up for processing.
      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
            VisitedBlocks.insert(SuccMBB).second) {
          // We currently don't do any PHI insertion and so we require that the
          // test basic block dominates all of the use basic blocks. Further,
          // we can't have a cycle from the test block back to itself as that
          // would create a cycle requiring a PHI to break it.
          //
          // We could in theory do PHI insertion here if it becomes useful by
          // just taking undef values in along every edge that we don't trace
          // this EFLAGS copy along. This isn't as bad as fully general PHI
          // insertion, but still seems like a great deal of complexity.
          //
          // Because it is theoretically possible that some earlier MI pass or
          // other lowering transformation could induce this to happen, we do
          // a hard check even in non-debug builds here.
          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
            LLVM_DEBUG({
              dbgs()
                  << "ERROR: Encountered use that is not dominated by our test "
                     "basic block! Rewriting this would require inserting PHI "
                     "nodes to track the flag state across the CFG.\n\nTest "
                     "block:\n";
              TestMBB->dump();
              dbgs() << "Use block:\n";
              SuccMBB->dump();
            });
            report_fatal_error(
                "Cannot lower EFLAGS copy when original copy def "
                "does not dominate all uses.");
          }

          Blocks.push_back(SuccMBB);

          // After this, EFLAGS will be recreated before each use.
          SuccMBB->removeLiveIn(X86::EFLAGS);
        }
    } while (!Blocks.empty());

    // Now rewrite the jumps that use the flags. These we handle specially
    // because if there are multiple jumps in a single basic block we'll have
    // to do surgery on the CFG.
    MachineBasicBlock *LastJmpMBB = nullptr;
    for (MachineInstr *JmpI : JmpIs) {
      // Past the first jump within a basic block we need to split the blocks
      // apart.
      if (JmpI->getParent() == LastJmpMBB)
        splitBlock(*JmpI->getParent(), *JmpI, *TII);
      else
        LastJmpMBB = JmpI->getParent();

      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
    }

    // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
    // the copy's def operand is itself a kill.
  }

#ifndef NDEBUG
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          (MI.getOperand(0).getReg() == X86::EFLAGS ||
           MI.getOperand(1).getReg() == X86::EFLAGS)) {
        LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
                   MI.dump());
        llvm_unreachable("Unlowered EFLAGS copy!");
      }
#endif

  return true;
}

/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
  CondRegArray CondRegs = {};

  // Scan backwards across the range of instructions with live EFLAGS.
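  // For example (sketch): if the instruction stream already contains
  //   %c:gr8 = SETCCr 2 /*COND_B*/, implicit $eflags
  // before TestPos, we record %c for COND_B and later reuse it instead of
  // inserting a duplicate SETcc.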
  for (MachineInstr &MI :
       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
    X86::CondCode Cond = X86::getCondFromSETCC(MI);
    if (Cond != X86::COND_INVALID && !MI.mayStore() &&
        MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) {
      assert(MI.getOperand(0).isDef() &&
             "A non-storing SETcc should always define a register!");
      CondRegs[Cond] = MI.getOperand(0).getReg();
    }

    // Stop scanning when we see the first definition of the EFLAGS as prior to
    // this we would potentially capture the wrong flag state.
    if (MI.findRegisterDefOperand(X86::EFLAGS))
      break;
  }
  return CondRegs;
}

Register X86FlagsCopyLoweringPass::promoteCondToReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    const DebugLoc &TestLoc, X86::CondCode Cond) {
  Register Reg = MRI->createVirtualRegister(PromoteRC);
  auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
                      TII->get(X86::SETCCr), Reg).addImm(Cond);
  (void)SetI;
  LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
  ++NumSetCCsInserted;
  return Reg;
}

std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    const DebugLoc &TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
  unsigned &CondReg = CondRegs[Cond];
  unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
  if (!CondReg && !InvCondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  if (CondReg)
    return {CondReg, false};
  else
    return {InvCondReg, true};
}

void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator Pos,
                                          const DebugLoc &Loc, unsigned Reg) {
  auto TestI =
      BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
  (void)TestI;
  LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump());
  ++NumTestsInserted;
}

void X86FlagsCopyLoweringPass::rewriteArithmetic(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    const DebugLoc &TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // Arithmetic is either reading CF or OF. Figure out which condition we need
  // to preserve in a register.
  X86::CondCode Cond = X86::COND_INVALID;

  // The addend that, when added to the saved flag value, re-creates CF or OF.
  int Addend = 0;

  switch (getMnemonicFromOpcode(MI.getOpcode())) {
  case FlagArithMnemonic::ADC:
  case FlagArithMnemonic::RCL:
  case FlagArithMnemonic::RCR:
  case FlagArithMnemonic::SBB:
  case FlagArithMnemonic::SETB:
    Cond = X86::COND_B; // CF == 1
    // Set up an addend such that adding one to it produces a carry, since the
    // byte has no higher bit available.
    Addend = 255;
    break;
  }

  // Now get a register that contains the value of the flag input to the
  // arithmetic. We require exactly this flag to simplify the arithmetic
  // required to materialize it back into the flag.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  MachineBasicBlock &MBB = *MI.getParent();

  // Insert an instruction that will set the flag back to the desired value.
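  // Worked example (sketch): CondReg holds the SETB result, i.e. 0 or 1.
  // ADD8ri CondReg, 255 computes 0 + 255 = 255 (no carry, CF = 0) or
  // 1 + 255 = 256 (carry out of the byte, CF = 1), so CF is restored to the
  // originally captured value; the byte result itself is dead.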
  Register TmpReg = MRI->createVirtualRegister(PromoteRC);
  auto AddI =
      BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
          .addDef(TmpReg, RegState::Dead)
          .addReg(CondReg)
          .addImm(Addend);
  (void)AddI;
  LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
  ++NumAddsInserted;
  FlagUse.setIsKill(true);
}

void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
                                           MachineBasicBlock::iterator TestPos,
                                           const DebugLoc &TestLoc,
                                           MachineInstr &CMovI,
                                           MachineOperand &FlagUse,
                                           CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = X86::getCondFromCMov(CMovI);
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &MBB = *CMovI.getParent();

  // Insert a direct test of the saved register.
  insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);

  // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
  // of the flags afterward.
  CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
      .setImm(Inverted ? X86::COND_E : X86::COND_NE);
  FlagUse.setIsKill(true);
  LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
}

void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
                                            MachineBasicBlock::iterator TestPos,
                                            const DebugLoc &TestLoc,
                                            MachineInstr &CMovI,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode());
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &MBB = *CMovI.getParent();

  // Insert a direct test of the saved register.
  insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);

  auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode!");
    case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32:
    case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32:
    case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32:
    case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32:
      return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32;
    case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64:
    case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64:
    case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64:
    case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64:
      return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64;
    case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80:
    case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80:
    case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80:
    case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80:
      return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80;
    }
  };

  // Rewrite the CMov to use the !ZF flag from the test.
  CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted)));
  FlagUse.setIsKill(true);
  LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump());
}

void X86FlagsCopyLoweringPass::rewriteCondJmp(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    const DebugLoc &TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
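  // For example (sketch), with a saved `%c:gr8 = SETCCr 2 /*COND_B*/`, a
  // branch `JCC_1 %bb.target, 2 /*COND_B*/` becomes
  //   TEST8rr %c, %c, implicit-def $eflags
  //   JCC_1 %bb.target, 5 /*COND_NE*/, implicit $eflags
  // (or COND_E if only the inverse condition was available in a register).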
  X86::CondCode Cond = X86::getCondFromBranch(JmpI);
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &JmpMBB = *JmpI.getParent();

  // Insert a direct test of the saved register.
  insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);

  // Rewrite the jump to use the !ZF flag from the test, and kill its use of
  // flags afterward.
  JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
  JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
}

void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
                                           MachineOperand &FlagUse,
                                           MachineInstr &CopyDefI) {
  // Just replace this copy with the original copy def.
  MRI->replaceRegWith(MI.getOperand(0).getReg(),
                      CopyDefI.getOperand(0).getReg());
  MI.eraseFromParent();
}

void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
                                            MachineBasicBlock::iterator TestPos,
                                            const DebugLoc &TestLoc,
                                            MachineInstr &SetCCI,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
  X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
  // Note that we can't usefully rewrite this to the inverse without complex
  // analysis of the users of the setCC. Largely we rely on duplicates that
  // could have been avoided having already been avoided here.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  // Rewriting a register def is trivial: we just replace the register and
  // remove the setcc.
  if (!SetCCI.mayStore()) {
    assert(SetCCI.getOperand(0).isReg() &&
           "Cannot have a non-register defined operand to SETcc!");
    Register OldReg = SetCCI.getOperand(0).getReg();
    // Drop Kill flags on the old register before replacing. CondReg may have
    // a longer live range.
    MRI->clearKillFlags(OldReg);
    MRI->replaceRegWith(OldReg, CondReg);
    SetCCI.eraseFromParent();
    return;
  }

  // Otherwise, we need to emit a store.
  auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
                     SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
  // Copy the address operands.
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(SetCCI.getOperand(i));

  MIB.addReg(CondReg);

  MIB.setMemRefs(SetCCI.memoperands());

  SetCCI.eraseFromParent();
}