1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the X86 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "X86InstrInfo.h" 14 #include "X86.h" 15 #include "X86InstrBuilder.h" 16 #include "X86InstrFoldTables.h" 17 #include "X86MachineFunctionInfo.h" 18 #include "X86Subtarget.h" 19 #include "X86TargetMachine.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/Sequence.h" 22 #include "llvm/CodeGen/LiveIntervals.h" 23 #include "llvm/CodeGen/LivePhysRegs.h" 24 #include "llvm/CodeGen/LiveVariables.h" 25 #include "llvm/CodeGen/MachineCombinerPattern.h" 26 #include "llvm/CodeGen/MachineConstantPool.h" 27 #include "llvm/CodeGen/MachineDominators.h" 28 #include "llvm/CodeGen/MachineFrameInfo.h" 29 #include "llvm/CodeGen/MachineInstr.h" 30 #include "llvm/CodeGen/MachineInstrBuilder.h" 31 #include "llvm/CodeGen/MachineModuleInfo.h" 32 #include "llvm/CodeGen/MachineOperand.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/CodeGen/StackMaps.h" 35 #include "llvm/IR/DebugInfoMetadata.h" 36 #include "llvm/IR/DerivedTypes.h" 37 #include "llvm/IR/Function.h" 38 #include "llvm/IR/InstrTypes.h" 39 #include "llvm/IR/Module.h" 40 #include "llvm/MC/MCAsmInfo.h" 41 #include "llvm/MC/MCExpr.h" 42 #include "llvm/MC/MCInst.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/Support/ErrorHandling.h" 46 #include "llvm/Support/raw_ostream.h" 47 #include "llvm/Target/TargetOptions.h" 48 #include <optional> 49 50 using namespace llvm; 51 52 #define DEBUG_TYPE "x86-instr-info" 53 54 #define GET_INSTRINFO_CTOR_DTOR 55 #include "X86GenInstrInfo.inc" 56 57 static cl::opt<bool> 58 NoFusing("disable-spill-fusing", 59 cl::desc("Disable fusing of spill code into instructions"), 60 cl::Hidden); 61 static cl::opt<bool> 62 PrintFailedFusing("print-failed-fuse-candidates", 63 cl::desc("Print instructions that the allocator wants to" 64 " fuse, but the X86 backend currently can't"), 65 cl::Hidden); 66 static cl::opt<bool> 67 ReMatPICStubLoad("remat-pic-stub-load", 68 cl::desc("Re-materialize load from stub in PIC mode"), 69 cl::init(false), cl::Hidden); 70 static cl::opt<unsigned> 71 PartialRegUpdateClearance("partial-reg-update-clearance", 72 cl::desc("Clearance between two register writes " 73 "for inserting XOR to avoid partial " 74 "register update"), 75 cl::init(64), cl::Hidden); 76 static cl::opt<unsigned> UndefRegClearance( 77 "undef-reg-clearance", 78 cl::desc("How many idle instructions we would like before " 79 "certain undef register reads"), 80 cl::init(128), cl::Hidden); 81 82 // Pin the vtable to this file. 83 void X86InstrInfo::anchor() {} 84 85 X86InstrInfo::X86InstrInfo(X86Subtarget &STI) 86 : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 87 : X86::ADJCALLSTACKDOWN32), 88 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 89 : X86::ADJCALLSTACKUP32), 90 X86::CATCHRET, (STI.is64Bit() ? 
X86::RET64 : X86::RET32)), 91 Subtarget(STI), RI(STI.getTargetTriple()) {} 92 93 const TargetRegisterClass * 94 X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, 95 const TargetRegisterInfo *TRI, 96 const MachineFunction &MF) const { 97 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF); 98 // If the target does not have egpr, then r16-r31 will be resereved for all 99 // instructions. 100 if (!RC || !Subtarget.hasEGPR()) 101 return RC; 102 103 if (X86II::canUseApxExtendedReg(MCID)) 104 return RC; 105 106 switch (RC->getID()) { 107 default: 108 return RC; 109 case X86::GR8RegClassID: 110 return &X86::GR8_NOREX2RegClass; 111 case X86::GR16RegClassID: 112 return &X86::GR16_NOREX2RegClass; 113 case X86::GR32RegClassID: 114 return &X86::GR32_NOREX2RegClass; 115 case X86::GR64RegClassID: 116 return &X86::GR64_NOREX2RegClass; 117 case X86::GR32_NOSPRegClassID: 118 return &X86::GR32_NOREX2_NOSPRegClass; 119 case X86::GR64_NOSPRegClassID: 120 return &X86::GR64_NOREX2_NOSPRegClass; 121 } 122 } 123 124 bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 125 Register &SrcReg, Register &DstReg, 126 unsigned &SubIdx) const { 127 switch (MI.getOpcode()) { 128 default: 129 break; 130 case X86::MOVSX16rr8: 131 case X86::MOVZX16rr8: 132 case X86::MOVSX32rr8: 133 case X86::MOVZX32rr8: 134 case X86::MOVSX64rr8: 135 if (!Subtarget.is64Bit()) 136 // It's not always legal to reference the low 8-bit of the larger 137 // register in 32-bit mode. 138 return false; 139 [[fallthrough]]; 140 case X86::MOVSX32rr16: 141 case X86::MOVZX32rr16: 142 case X86::MOVSX64rr16: 143 case X86::MOVSX64rr32: { 144 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) 145 // Be conservative. 146 return false; 147 SrcReg = MI.getOperand(1).getReg(); 148 DstReg = MI.getOperand(0).getReg(); 149 switch (MI.getOpcode()) { 150 default: 151 llvm_unreachable("Unreachable!"); 152 case X86::MOVSX16rr8: 153 case X86::MOVZX16rr8: 154 case X86::MOVSX32rr8: 155 case X86::MOVZX32rr8: 156 case X86::MOVSX64rr8: 157 SubIdx = X86::sub_8bit; 158 break; 159 case X86::MOVSX32rr16: 160 case X86::MOVZX32rr16: 161 case X86::MOVSX64rr16: 162 SubIdx = X86::sub_16bit; 163 break; 164 case X86::MOVSX64rr32: 165 SubIdx = X86::sub_32bit; 166 break; 167 } 168 return true; 169 } 170 } 171 return false; 172 } 173 174 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { 175 if (MI.mayLoad() || MI.mayStore()) 176 return false; 177 178 // Some target-independent operations that trivially lower to data-invariant 179 // instructions. 180 if (MI.isCopyLike() || MI.isInsertSubreg()) 181 return true; 182 183 unsigned Opcode = MI.getOpcode(); 184 using namespace X86; 185 // On x86 it is believed that imul is constant time w.r.t. the loaded data. 186 // However, they set flags and are perhaps the most surprisingly constant 187 // time operations so we call them out here separately. 188 if (isIMUL(Opcode)) 189 return true; 190 // Bit scanning and counting instructions that are somewhat surprisingly 191 // constant time as they scan across bits and do other fairly complex 192 // operations like popcnt, but are believed to be constant time on x86. 193 // However, these set flags. 194 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) || 195 isTZCNT(Opcode)) 196 return true; 197 // Bit manipulation instructions are effectively combinations of basic 198 // arithmetic ops, and should still execute in constant time. These also 199 // set flags. 
200 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) || 201 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) || 202 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) || 203 isTZMSK(Opcode)) 204 return true; 205 // Bit extracting and clearing instructions should execute in constant time, 206 // and set flags. 207 if (isBEXTR(Opcode) || isBZHI(Opcode)) 208 return true; 209 // Shift and rotate. 210 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) || 211 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode)) 212 return true; 213 // Basic arithmetic is constant time on the input but does set flags. 214 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) || 215 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode)) 216 return true; 217 // Arithmetic with just 32-bit and 64-bit variants and no immediates. 218 if (isANDN(Opcode)) 219 return true; 220 // Unary arithmetic operations. 221 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode)) 222 return true; 223 // Unlike other arithmetic, NOT doesn't set EFLAGS. 224 if (isNOT(Opcode)) 225 return true; 226 // Various move instructions used to zero or sign extend things. Note that we 227 // intentionally don't support the _NOREX variants as we can't handle that 228 // register constraint anyways. 229 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode)) 230 return true; 231 // Arithmetic instructions that are both constant time and don't set flags. 232 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode)) 233 return true; 234 // LEA doesn't actually access memory, and its arithmetic is constant time. 235 if (isLEA(Opcode)) 236 return true; 237 // By default, assume that the instruction is not data invariant. 238 return false; 239 } 240 241 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { 242 switch (MI.getOpcode()) { 243 default: 244 // By default, assume that the load will immediately leak. 245 return false; 246 247 // On x86 it is believed that imul is constant time w.r.t. the loaded data. 248 // However, they set flags and are perhaps the most surprisingly constant 249 // time operations so we call them out here separately. 250 case X86::IMUL16rm: 251 case X86::IMUL16rmi: 252 case X86::IMUL32rm: 253 case X86::IMUL32rmi: 254 case X86::IMUL64rm: 255 case X86::IMUL64rmi32: 256 257 // Bit scanning and counting instructions that are somewhat surprisingly 258 // constant time as they scan across bits and do other fairly complex 259 // operations like popcnt, but are believed to be constant time on x86. 260 // However, these set flags. 261 case X86::BSF16rm: 262 case X86::BSF32rm: 263 case X86::BSF64rm: 264 case X86::BSR16rm: 265 case X86::BSR32rm: 266 case X86::BSR64rm: 267 case X86::LZCNT16rm: 268 case X86::LZCNT32rm: 269 case X86::LZCNT64rm: 270 case X86::POPCNT16rm: 271 case X86::POPCNT32rm: 272 case X86::POPCNT64rm: 273 case X86::TZCNT16rm: 274 case X86::TZCNT32rm: 275 case X86::TZCNT64rm: 276 277 // Bit manipulation instructions are effectively combinations of basic 278 // arithmetic ops, and should still execute in constant time. These also 279 // set flags. 
280 case X86::BLCFILL32rm: 281 case X86::BLCFILL64rm: 282 case X86::BLCI32rm: 283 case X86::BLCI64rm: 284 case X86::BLCIC32rm: 285 case X86::BLCIC64rm: 286 case X86::BLCMSK32rm: 287 case X86::BLCMSK64rm: 288 case X86::BLCS32rm: 289 case X86::BLCS64rm: 290 case X86::BLSFILL32rm: 291 case X86::BLSFILL64rm: 292 case X86::BLSI32rm: 293 case X86::BLSI64rm: 294 case X86::BLSIC32rm: 295 case X86::BLSIC64rm: 296 case X86::BLSMSK32rm: 297 case X86::BLSMSK64rm: 298 case X86::BLSR32rm: 299 case X86::BLSR64rm: 300 case X86::TZMSK32rm: 301 case X86::TZMSK64rm: 302 303 // Bit extracting and clearing instructions should execute in constant time, 304 // and set flags. 305 case X86::BEXTR32rm: 306 case X86::BEXTR64rm: 307 case X86::BEXTRI32mi: 308 case X86::BEXTRI64mi: 309 case X86::BZHI32rm: 310 case X86::BZHI64rm: 311 312 // Basic arithmetic is constant time on the input but does set flags. 313 case X86::ADC8rm: 314 case X86::ADC16rm: 315 case X86::ADC32rm: 316 case X86::ADC64rm: 317 case X86::ADD8rm: 318 case X86::ADD16rm: 319 case X86::ADD32rm: 320 case X86::ADD64rm: 321 case X86::AND8rm: 322 case X86::AND16rm: 323 case X86::AND32rm: 324 case X86::AND64rm: 325 case X86::ANDN32rm: 326 case X86::ANDN64rm: 327 case X86::OR8rm: 328 case X86::OR16rm: 329 case X86::OR32rm: 330 case X86::OR64rm: 331 case X86::SBB8rm: 332 case X86::SBB16rm: 333 case X86::SBB32rm: 334 case X86::SBB64rm: 335 case X86::SUB8rm: 336 case X86::SUB16rm: 337 case X86::SUB32rm: 338 case X86::SUB64rm: 339 case X86::XOR8rm: 340 case X86::XOR16rm: 341 case X86::XOR32rm: 342 case X86::XOR64rm: 343 344 // Integer multiply w/o affecting flags is still believed to be constant 345 // time on x86. Called out separately as this is among the most surprising 346 // instructions to exhibit that behavior. 347 case X86::MULX32rm: 348 case X86::MULX64rm: 349 350 // Arithmetic instructions that are both constant time and don't set flags. 351 case X86::RORX32mi: 352 case X86::RORX64mi: 353 case X86::SARX32rm: 354 case X86::SARX64rm: 355 case X86::SHLX32rm: 356 case X86::SHLX64rm: 357 case X86::SHRX32rm: 358 case X86::SHRX64rm: 359 360 // Conversions are believed to be constant time and don't set flags. 361 case X86::CVTTSD2SI64rm: 362 case X86::VCVTTSD2SI64rm: 363 case X86::VCVTTSD2SI64Zrm: 364 case X86::CVTTSD2SIrm: 365 case X86::VCVTTSD2SIrm: 366 case X86::VCVTTSD2SIZrm: 367 case X86::CVTTSS2SI64rm: 368 case X86::VCVTTSS2SI64rm: 369 case X86::VCVTTSS2SI64Zrm: 370 case X86::CVTTSS2SIrm: 371 case X86::VCVTTSS2SIrm: 372 case X86::VCVTTSS2SIZrm: 373 case X86::CVTSI2SDrm: 374 case X86::VCVTSI2SDrm: 375 case X86::VCVTSI2SDZrm: 376 case X86::CVTSI2SSrm: 377 case X86::VCVTSI2SSrm: 378 case X86::VCVTSI2SSZrm: 379 case X86::CVTSI642SDrm: 380 case X86::VCVTSI642SDrm: 381 case X86::VCVTSI642SDZrm: 382 case X86::CVTSI642SSrm: 383 case X86::VCVTSI642SSrm: 384 case X86::VCVTSI642SSZrm: 385 case X86::CVTSS2SDrm: 386 case X86::VCVTSS2SDrm: 387 case X86::VCVTSS2SDZrm: 388 case X86::CVTSD2SSrm: 389 case X86::VCVTSD2SSrm: 390 case X86::VCVTSD2SSZrm: 391 // AVX512 added unsigned integer conversions. 392 case X86::VCVTTSD2USI64Zrm: 393 case X86::VCVTTSD2USIZrm: 394 case X86::VCVTTSS2USI64Zrm: 395 case X86::VCVTTSS2USIZrm: 396 case X86::VCVTUSI2SDZrm: 397 case X86::VCVTUSI642SDZrm: 398 case X86::VCVTUSI2SSZrm: 399 case X86::VCVTUSI642SSZrm: 400 401 // Loads to register don't set flags. 
402 case X86::MOV8rm: 403 case X86::MOV8rm_NOREX: 404 case X86::MOV16rm: 405 case X86::MOV32rm: 406 case X86::MOV64rm: 407 case X86::MOVSX16rm8: 408 case X86::MOVSX32rm16: 409 case X86::MOVSX32rm8: 410 case X86::MOVSX32rm8_NOREX: 411 case X86::MOVSX64rm16: 412 case X86::MOVSX64rm32: 413 case X86::MOVSX64rm8: 414 case X86::MOVZX16rm8: 415 case X86::MOVZX32rm16: 416 case X86::MOVZX32rm8: 417 case X86::MOVZX32rm8_NOREX: 418 case X86::MOVZX64rm16: 419 case X86::MOVZX64rm8: 420 return true; 421 } 422 } 423 424 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { 425 const MachineFunction *MF = MI.getParent()->getParent(); 426 const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); 427 428 if (isFrameInstr(MI)) { 429 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); 430 SPAdj -= getFrameAdjustment(MI); 431 if (!isFrameSetup(MI)) 432 SPAdj = -SPAdj; 433 return SPAdj; 434 } 435 436 // To know whether a call adjusts the stack, we need information 437 // that is bound to the following ADJCALLSTACKUP pseudo. 438 // Look for the next ADJCALLSTACKUP that follows the call. 439 if (MI.isCall()) { 440 const MachineBasicBlock *MBB = MI.getParent(); 441 auto I = ++MachineBasicBlock::const_iterator(MI); 442 for (auto E = MBB->end(); I != E; ++I) { 443 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall()) 444 break; 445 } 446 447 // If we could not find a frame destroy opcode, then it has already 448 // been simplified, so we don't care. 449 if (I->getOpcode() != getCallFrameDestroyOpcode()) 450 return 0; 451 452 return -(I->getOperand(1).getImm()); 453 } 454 455 // Currently handle only PUSHes we can reasonably expect to see 456 // in call sequences 457 switch (MI.getOpcode()) { 458 default: 459 return 0; 460 case X86::PUSH32r: 461 case X86::PUSH32rmm: 462 case X86::PUSH32rmr: 463 case X86::PUSH32i: 464 return 4; 465 case X86::PUSH64r: 466 case X86::PUSH64rmm: 467 case X86::PUSH64rmr: 468 case X86::PUSH64i32: 469 return 8; 470 } 471 } 472 473 /// Return true and the FrameIndex if the specified 474 /// operand and follow operands form a reference to the stack frame. 
475 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, 476 int &FrameIndex) const { 477 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() && 478 MI.getOperand(Op + X86::AddrScaleAmt).isImm() && 479 MI.getOperand(Op + X86::AddrIndexReg).isReg() && 480 MI.getOperand(Op + X86::AddrDisp).isImm() && 481 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 && 482 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 && 483 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) { 484 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex(); 485 return true; 486 } 487 return false; 488 } 489 490 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) { 491 switch (Opcode) { 492 default: 493 return false; 494 case X86::MOV8rm: 495 case X86::KMOVBkm: 496 case X86::KMOVBkm_EVEX: 497 MemBytes = 1; 498 return true; 499 case X86::MOV16rm: 500 case X86::KMOVWkm: 501 case X86::KMOVWkm_EVEX: 502 case X86::VMOVSHZrm: 503 case X86::VMOVSHZrm_alt: 504 MemBytes = 2; 505 return true; 506 case X86::MOV32rm: 507 case X86::MOVSSrm: 508 case X86::MOVSSrm_alt: 509 case X86::VMOVSSrm: 510 case X86::VMOVSSrm_alt: 511 case X86::VMOVSSZrm: 512 case X86::VMOVSSZrm_alt: 513 case X86::KMOVDkm: 514 case X86::KMOVDkm_EVEX: 515 MemBytes = 4; 516 return true; 517 case X86::MOV64rm: 518 case X86::LD_Fp64m: 519 case X86::MOVSDrm: 520 case X86::MOVSDrm_alt: 521 case X86::VMOVSDrm: 522 case X86::VMOVSDrm_alt: 523 case X86::VMOVSDZrm: 524 case X86::VMOVSDZrm_alt: 525 case X86::MMX_MOVD64rm: 526 case X86::MMX_MOVQ64rm: 527 case X86::KMOVQkm: 528 case X86::KMOVQkm_EVEX: 529 MemBytes = 8; 530 return true; 531 case X86::MOVAPSrm: 532 case X86::MOVUPSrm: 533 case X86::MOVAPDrm: 534 case X86::MOVUPDrm: 535 case X86::MOVDQArm: 536 case X86::MOVDQUrm: 537 case X86::VMOVAPSrm: 538 case X86::VMOVUPSrm: 539 case X86::VMOVAPDrm: 540 case X86::VMOVUPDrm: 541 case X86::VMOVDQArm: 542 case X86::VMOVDQUrm: 543 case X86::VMOVAPSZ128rm: 544 case X86::VMOVUPSZ128rm: 545 case X86::VMOVAPSZ128rm_NOVLX: 546 case X86::VMOVUPSZ128rm_NOVLX: 547 case X86::VMOVAPDZ128rm: 548 case X86::VMOVUPDZ128rm: 549 case X86::VMOVDQU8Z128rm: 550 case X86::VMOVDQU16Z128rm: 551 case X86::VMOVDQA32Z128rm: 552 case X86::VMOVDQU32Z128rm: 553 case X86::VMOVDQA64Z128rm: 554 case X86::VMOVDQU64Z128rm: 555 MemBytes = 16; 556 return true; 557 case X86::VMOVAPSYrm: 558 case X86::VMOVUPSYrm: 559 case X86::VMOVAPDYrm: 560 case X86::VMOVUPDYrm: 561 case X86::VMOVDQAYrm: 562 case X86::VMOVDQUYrm: 563 case X86::VMOVAPSZ256rm: 564 case X86::VMOVUPSZ256rm: 565 case X86::VMOVAPSZ256rm_NOVLX: 566 case X86::VMOVUPSZ256rm_NOVLX: 567 case X86::VMOVAPDZ256rm: 568 case X86::VMOVUPDZ256rm: 569 case X86::VMOVDQU8Z256rm: 570 case X86::VMOVDQU16Z256rm: 571 case X86::VMOVDQA32Z256rm: 572 case X86::VMOVDQU32Z256rm: 573 case X86::VMOVDQA64Z256rm: 574 case X86::VMOVDQU64Z256rm: 575 MemBytes = 32; 576 return true; 577 case X86::VMOVAPSZrm: 578 case X86::VMOVUPSZrm: 579 case X86::VMOVAPDZrm: 580 case X86::VMOVUPDZrm: 581 case X86::VMOVDQU8Zrm: 582 case X86::VMOVDQU16Zrm: 583 case X86::VMOVDQA32Zrm: 584 case X86::VMOVDQU32Zrm: 585 case X86::VMOVDQA64Zrm: 586 case X86::VMOVDQU64Zrm: 587 MemBytes = 64; 588 return true; 589 } 590 } 591 592 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) { 593 switch (Opcode) { 594 default: 595 return false; 596 case X86::MOV8mr: 597 case X86::KMOVBmk: 598 case X86::KMOVBmk_EVEX: 599 MemBytes = 1; 600 return true; 601 case X86::MOV16mr: 602 case X86::KMOVWmk: 603 case X86::KMOVWmk_EVEX: 604 case X86::VMOVSHZmr: 605 MemBytes = 2; 606 return 
true; 607 case X86::MOV32mr: 608 case X86::MOVSSmr: 609 case X86::VMOVSSmr: 610 case X86::VMOVSSZmr: 611 case X86::KMOVDmk: 612 case X86::KMOVDmk_EVEX: 613 MemBytes = 4; 614 return true; 615 case X86::MOV64mr: 616 case X86::ST_FpP64m: 617 case X86::MOVSDmr: 618 case X86::VMOVSDmr: 619 case X86::VMOVSDZmr: 620 case X86::MMX_MOVD64mr: 621 case X86::MMX_MOVQ64mr: 622 case X86::MMX_MOVNTQmr: 623 case X86::KMOVQmk: 624 case X86::KMOVQmk_EVEX: 625 MemBytes = 8; 626 return true; 627 case X86::MOVAPSmr: 628 case X86::MOVUPSmr: 629 case X86::MOVAPDmr: 630 case X86::MOVUPDmr: 631 case X86::MOVDQAmr: 632 case X86::MOVDQUmr: 633 case X86::VMOVAPSmr: 634 case X86::VMOVUPSmr: 635 case X86::VMOVAPDmr: 636 case X86::VMOVUPDmr: 637 case X86::VMOVDQAmr: 638 case X86::VMOVDQUmr: 639 case X86::VMOVUPSZ128mr: 640 case X86::VMOVAPSZ128mr: 641 case X86::VMOVUPSZ128mr_NOVLX: 642 case X86::VMOVAPSZ128mr_NOVLX: 643 case X86::VMOVUPDZ128mr: 644 case X86::VMOVAPDZ128mr: 645 case X86::VMOVDQA32Z128mr: 646 case X86::VMOVDQU32Z128mr: 647 case X86::VMOVDQA64Z128mr: 648 case X86::VMOVDQU64Z128mr: 649 case X86::VMOVDQU8Z128mr: 650 case X86::VMOVDQU16Z128mr: 651 MemBytes = 16; 652 return true; 653 case X86::VMOVUPSYmr: 654 case X86::VMOVAPSYmr: 655 case X86::VMOVUPDYmr: 656 case X86::VMOVAPDYmr: 657 case X86::VMOVDQUYmr: 658 case X86::VMOVDQAYmr: 659 case X86::VMOVUPSZ256mr: 660 case X86::VMOVAPSZ256mr: 661 case X86::VMOVUPSZ256mr_NOVLX: 662 case X86::VMOVAPSZ256mr_NOVLX: 663 case X86::VMOVUPDZ256mr: 664 case X86::VMOVAPDZ256mr: 665 case X86::VMOVDQU8Z256mr: 666 case X86::VMOVDQU16Z256mr: 667 case X86::VMOVDQA32Z256mr: 668 case X86::VMOVDQU32Z256mr: 669 case X86::VMOVDQA64Z256mr: 670 case X86::VMOVDQU64Z256mr: 671 MemBytes = 32; 672 return true; 673 case X86::VMOVUPSZmr: 674 case X86::VMOVAPSZmr: 675 case X86::VMOVUPDZmr: 676 case X86::VMOVAPDZmr: 677 case X86::VMOVDQU8Zmr: 678 case X86::VMOVDQU16Zmr: 679 case X86::VMOVDQA32Zmr: 680 case X86::VMOVDQU32Zmr: 681 case X86::VMOVDQA64Zmr: 682 case X86::VMOVDQU64Zmr: 683 MemBytes = 64; 684 return true; 685 } 686 return false; 687 } 688 689 Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 690 int &FrameIndex) const { 691 unsigned Dummy; 692 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy); 693 } 694 695 Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 696 int &FrameIndex, 697 unsigned &MemBytes) const { 698 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes)) 699 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) 700 return MI.getOperand(0).getReg(); 701 return 0; 702 } 703 704 Register X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, 705 int &FrameIndex) const { 706 unsigned Dummy; 707 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) { 708 unsigned Reg; 709 if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) 710 return Reg; 711 // Check for post-frame index elimination operations 712 SmallVector<const MachineMemOperand *, 1> Accesses; 713 if (hasLoadFromStackSlot(MI, Accesses)) { 714 FrameIndex = 715 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) 716 ->getFrameIndex(); 717 return MI.getOperand(0).getReg(); 718 } 719 } 720 return 0; 721 } 722 723 Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 724 int &FrameIndex) const { 725 unsigned Dummy; 726 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy); 727 } 728 729 Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 730 int &FrameIndex, 731 unsigned &MemBytes) const { 732 if 
(isFrameStoreOpcode(MI.getOpcode(), MemBytes)) 733 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 && 734 isFrameOperand(MI, 0, FrameIndex)) 735 return MI.getOperand(X86::AddrNumOperands).getReg(); 736 return 0; 737 } 738 739 Register X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, 740 int &FrameIndex) const { 741 unsigned Dummy; 742 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) { 743 unsigned Reg; 744 if ((Reg = isStoreToStackSlot(MI, FrameIndex))) 745 return Reg; 746 // Check for post-frame index elimination operations 747 SmallVector<const MachineMemOperand *, 1> Accesses; 748 if (hasStoreToStackSlot(MI, Accesses)) { 749 FrameIndex = 750 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) 751 ->getFrameIndex(); 752 return MI.getOperand(X86::AddrNumOperands).getReg(); 753 } 754 } 755 return 0; 756 } 757 758 /// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. 759 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { 760 // Don't waste compile time scanning use-def chains of physregs. 761 if (!BaseReg.isVirtual()) 762 return false; 763 bool isPICBase = false; 764 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) { 765 if (DefMI.getOpcode() != X86::MOVPC32r) 766 return false; 767 assert(!isPICBase && "More than one PIC base?"); 768 isPICBase = true; 769 } 770 return isPICBase; 771 } 772 773 bool X86InstrInfo::isReallyTriviallyReMaterializable( 774 const MachineInstr &MI) const { 775 switch (MI.getOpcode()) { 776 default: 777 // This function should only be called for opcodes with the ReMaterializable 778 // flag set. 779 llvm_unreachable("Unknown rematerializable operation!"); 780 break; 781 case X86::IMPLICIT_DEF: 782 // Defer to generic logic. 783 break; 784 case X86::LOAD_STACK_GUARD: 785 case X86::LD_Fp032: 786 case X86::LD_Fp064: 787 case X86::LD_Fp080: 788 case X86::LD_Fp132: 789 case X86::LD_Fp164: 790 case X86::LD_Fp180: 791 case X86::AVX1_SETALLONES: 792 case X86::AVX2_SETALLONES: 793 case X86::AVX512_128_SET0: 794 case X86::AVX512_256_SET0: 795 case X86::AVX512_512_SET0: 796 case X86::AVX512_512_SETALLONES: 797 case X86::AVX512_FsFLD0SD: 798 case X86::AVX512_FsFLD0SH: 799 case X86::AVX512_FsFLD0SS: 800 case X86::AVX512_FsFLD0F128: 801 case X86::AVX_SET0: 802 case X86::FsFLD0SD: 803 case X86::FsFLD0SS: 804 case X86::FsFLD0SH: 805 case X86::FsFLD0F128: 806 case X86::KSET0D: 807 case X86::KSET0Q: 808 case X86::KSET0W: 809 case X86::KSET1D: 810 case X86::KSET1Q: 811 case X86::KSET1W: 812 case X86::MMX_SET0: 813 case X86::MOV32ImmSExti8: 814 case X86::MOV32r0: 815 case X86::MOV32r1: 816 case X86::MOV32r_1: 817 case X86::MOV32ri64: 818 case X86::MOV64ImmSExti8: 819 case X86::V_SET0: 820 case X86::V_SETALLONES: 821 case X86::MOV16ri: 822 case X86::MOV32ri: 823 case X86::MOV64ri: 824 case X86::MOV64ri32: 825 case X86::MOV8ri: 826 case X86::PTILEZEROV: 827 return true; 828 829 case X86::MOV8rm: 830 case X86::MOV8rm_NOREX: 831 case X86::MOV16rm: 832 case X86::MOV32rm: 833 case X86::MOV64rm: 834 case X86::MOVSSrm: 835 case X86::MOVSSrm_alt: 836 case X86::MOVSDrm: 837 case X86::MOVSDrm_alt: 838 case X86::MOVAPSrm: 839 case X86::MOVUPSrm: 840 case X86::MOVAPDrm: 841 case X86::MOVUPDrm: 842 case X86::MOVDQArm: 843 case X86::MOVDQUrm: 844 case X86::VMOVSSrm: 845 case X86::VMOVSSrm_alt: 846 case X86::VMOVSDrm: 847 case X86::VMOVSDrm_alt: 848 case X86::VMOVAPSrm: 849 case X86::VMOVUPSrm: 850 case X86::VMOVAPDrm: 851 case X86::VMOVUPDrm: 852 case X86::VMOVDQArm: 853 case X86::VMOVDQUrm: 854 case 
X86::VMOVAPSYrm: 855 case X86::VMOVUPSYrm: 856 case X86::VMOVAPDYrm: 857 case X86::VMOVUPDYrm: 858 case X86::VMOVDQAYrm: 859 case X86::VMOVDQUYrm: 860 case X86::MMX_MOVD64rm: 861 case X86::MMX_MOVQ64rm: 862 case X86::VBROADCASTSSrm: 863 case X86::VBROADCASTSSYrm: 864 case X86::VBROADCASTSDYrm: 865 // AVX-512 866 case X86::VPBROADCASTBZ128rm: 867 case X86::VPBROADCASTBZ256rm: 868 case X86::VPBROADCASTBZrm: 869 case X86::VBROADCASTF32X2Z256rm: 870 case X86::VBROADCASTF32X2Zrm: 871 case X86::VBROADCASTI32X2Z128rm: 872 case X86::VBROADCASTI32X2Z256rm: 873 case X86::VBROADCASTI32X2Zrm: 874 case X86::VPBROADCASTWZ128rm: 875 case X86::VPBROADCASTWZ256rm: 876 case X86::VPBROADCASTWZrm: 877 case X86::VPBROADCASTDZ128rm: 878 case X86::VPBROADCASTDZ256rm: 879 case X86::VPBROADCASTDZrm: 880 case X86::VBROADCASTSSZ128rm: 881 case X86::VBROADCASTSSZ256rm: 882 case X86::VBROADCASTSSZrm: 883 case X86::VPBROADCASTQZ128rm: 884 case X86::VPBROADCASTQZ256rm: 885 case X86::VPBROADCASTQZrm: 886 case X86::VBROADCASTSDZ256rm: 887 case X86::VBROADCASTSDZrm: 888 case X86::VMOVSSZrm: 889 case X86::VMOVSSZrm_alt: 890 case X86::VMOVSDZrm: 891 case X86::VMOVSDZrm_alt: 892 case X86::VMOVSHZrm: 893 case X86::VMOVSHZrm_alt: 894 case X86::VMOVAPDZ128rm: 895 case X86::VMOVAPDZ256rm: 896 case X86::VMOVAPDZrm: 897 case X86::VMOVAPSZ128rm: 898 case X86::VMOVAPSZ256rm: 899 case X86::VMOVAPSZ128rm_NOVLX: 900 case X86::VMOVAPSZ256rm_NOVLX: 901 case X86::VMOVAPSZrm: 902 case X86::VMOVDQA32Z128rm: 903 case X86::VMOVDQA32Z256rm: 904 case X86::VMOVDQA32Zrm: 905 case X86::VMOVDQA64Z128rm: 906 case X86::VMOVDQA64Z256rm: 907 case X86::VMOVDQA64Zrm: 908 case X86::VMOVDQU16Z128rm: 909 case X86::VMOVDQU16Z256rm: 910 case X86::VMOVDQU16Zrm: 911 case X86::VMOVDQU32Z128rm: 912 case X86::VMOVDQU32Z256rm: 913 case X86::VMOVDQU32Zrm: 914 case X86::VMOVDQU64Z128rm: 915 case X86::VMOVDQU64Z256rm: 916 case X86::VMOVDQU64Zrm: 917 case X86::VMOVDQU8Z128rm: 918 case X86::VMOVDQU8Z256rm: 919 case X86::VMOVDQU8Zrm: 920 case X86::VMOVUPDZ128rm: 921 case X86::VMOVUPDZ256rm: 922 case X86::VMOVUPDZrm: 923 case X86::VMOVUPSZ128rm: 924 case X86::VMOVUPSZ256rm: 925 case X86::VMOVUPSZ128rm_NOVLX: 926 case X86::VMOVUPSZ256rm_NOVLX: 927 case X86::VMOVUPSZrm: { 928 // Loads from constant pools are trivially rematerializable. 929 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && 930 MI.getOperand(1 + X86::AddrScaleAmt).isImm() && 931 MI.getOperand(1 + X86::AddrIndexReg).isReg() && 932 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && 933 MI.isDereferenceableInvariantLoad()) { 934 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); 935 if (BaseReg == 0 || BaseReg == X86::RIP) 936 return true; 937 // Allow re-materialization of PIC load. 938 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) { 939 const MachineFunction &MF = *MI.getParent()->getParent(); 940 const MachineRegisterInfo &MRI = MF.getRegInfo(); 941 if (regIsPICBase(BaseReg, MRI)) 942 return true; 943 } 944 } 945 break; 946 } 947 948 case X86::LEA32r: 949 case X86::LEA64r: { 950 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() && 951 MI.getOperand(1 + X86::AddrIndexReg).isReg() && 952 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && 953 !MI.getOperand(1 + X86::AddrDisp).isReg()) { 954 // lea fi#, lea GV, etc. are all rematerializable. 955 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) 956 return true; 957 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); 958 if (BaseReg == 0) 959 return true; 960 // Allow re-materialization of lea PICBase + x. 
961 const MachineFunction &MF = *MI.getParent()->getParent(); 962 const MachineRegisterInfo &MRI = MF.getRegInfo(); 963 if (regIsPICBase(BaseReg, MRI)) 964 return true; 965 } 966 break; 967 } 968 } 969 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); 970 } 971 972 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, 973 MachineBasicBlock::iterator I, 974 Register DestReg, unsigned SubIdx, 975 const MachineInstr &Orig, 976 const TargetRegisterInfo &TRI) const { 977 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); 978 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != 979 MachineBasicBlock::LQR_Dead) { 980 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side 981 // effects. 982 int Value; 983 switch (Orig.getOpcode()) { 984 case X86::MOV32r0: 985 Value = 0; 986 break; 987 case X86::MOV32r1: 988 Value = 1; 989 break; 990 case X86::MOV32r_1: 991 Value = -1; 992 break; 993 default: 994 llvm_unreachable("Unexpected instruction!"); 995 } 996 997 const DebugLoc &DL = Orig.getDebugLoc(); 998 BuildMI(MBB, I, DL, get(X86::MOV32ri)) 999 .add(Orig.getOperand(0)) 1000 .addImm(Value); 1001 } else { 1002 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); 1003 MBB.insert(I, MI); 1004 } 1005 1006 MachineInstr &NewMI = *std::prev(I); 1007 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI); 1008 } 1009 1010 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. 1011 bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { 1012 for (const MachineOperand &MO : MI.operands()) { 1013 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && 1014 !MO.isDead()) { 1015 return true; 1016 } 1017 } 1018 return false; 1019 } 1020 1021 /// Check whether the shift count for a machine operand is non-zero. 1022 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI, 1023 unsigned ShiftAmtOperandIdx) { 1024 // The shift count is six bits with the REX.W prefix and five bits without. 1025 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31; 1026 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm(); 1027 return Imm & ShiftCountMask; 1028 } 1029 1030 /// Check whether the given shift count is appropriate 1031 /// can be represented by a LEA instruction. 1032 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { 1033 // Left shift instructions can be transformed into load-effective-address 1034 // instructions if we can encode them appropriately. 1035 // A LEA instruction utilizes a SIB byte to encode its scale factor. 1036 // The SIB.scale field is two bits wide which means that we can encode any 1037 // shift amount less than 4. 1038 return ShAmt < 4 && ShAmt > 0; 1039 } 1040 1041 static bool findRedundantFlagInstr(MachineInstr &CmpInstr, 1042 MachineInstr &CmpValDefInstr, 1043 const MachineRegisterInfo *MRI, 1044 MachineInstr **AndInstr, 1045 const TargetRegisterInfo *TRI, 1046 bool &NoSignFlag, bool &ClearsOverflowFlag) { 1047 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG && 1048 CmpInstr.getOpcode() == X86::TEST64rr) && 1049 !(CmpValDefInstr.getOpcode() == X86::COPY && 1050 CmpInstr.getOpcode() == X86::TEST16rr)) 1051 return false; 1052 1053 // CmpInstr is a TEST16rr/TEST64rr instruction, and 1054 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two 1055 // registers are identical. 
1056 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) && 1057 "CmpInstr is an analyzable TEST16rr/TEST64rr, and " 1058 "`X86InstrInfo::analyzeCompare` requires two reg operands are the" 1059 "same."); 1060 1061 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that 1062 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case 1063 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is 1064 // redundant. 1065 assert( 1066 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) && 1067 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr " 1068 "is a user of COPY sub16bit."); 1069 MachineInstr *VregDefInstr = nullptr; 1070 if (CmpInstr.getOpcode() == X86::TEST16rr) { 1071 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual()) 1072 return false; 1073 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg()); 1074 if (!VregDefInstr) 1075 return false; 1076 // We can only remove test when AND32ri or AND64ri32 whose imm can fit 16bit 1077 // size, others 32/64 bit ops would test higher bits which test16rr don't 1078 // want to. 1079 if (!((VregDefInstr->getOpcode() == X86::AND32ri || 1080 VregDefInstr->getOpcode() == X86::AND64ri32) && 1081 isUInt<16>(VregDefInstr->getOperand(2).getImm()))) 1082 return false; 1083 } 1084 1085 if (CmpInstr.getOpcode() == X86::TEST64rr) { 1086 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is 1087 // typically 0. 1088 if (CmpValDefInstr.getOperand(1).getImm() != 0) 1089 return false; 1090 1091 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically 1092 // sub_32bit or sub_xmm. 1093 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit) 1094 return false; 1095 1096 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg()); 1097 } 1098 1099 assert(VregDefInstr && "Must have a definition (SSA)"); 1100 1101 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB 1102 // to simplify the subsequent analysis. 1103 // 1104 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of 1105 // `CmpValDefInstr.getParent()`, this could be handled. 1106 if (VregDefInstr->getParent() != CmpValDefInstr.getParent()) 1107 return false; 1108 1109 if (X86::isAND(VregDefInstr->getOpcode())) { 1110 // Get a sequence of instructions like 1111 // %reg = and* ... // Set EFLAGS 1112 // ... // EFLAGS not changed 1113 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit 1114 // test64rr %extended_reg, %extended_reg, implicit-def $eflags 1115 // or 1116 // %reg = and32* ... 1117 // ... // EFLAGS not changed. 1118 // %src_reg = copy %reg.sub_16bit:gr32 1119 // test16rr %src_reg, %src_reg, implicit-def $eflags 1120 // 1121 // If subsequent readers use a subset of bits that don't change 1122 // after `and*` instructions, it's likely that the test64rr could 1123 // be optimized away. 1124 for (const MachineInstr &Instr : 1125 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)), 1126 MachineBasicBlock::iterator(CmpValDefInstr))) { 1127 // There are instructions between 'VregDefInstr' and 1128 // 'CmpValDefInstr' that modifies EFLAGS. 1129 if (Instr.modifiesRegister(X86::EFLAGS, TRI)) 1130 return false; 1131 } 1132 1133 *AndInstr = VregDefInstr; 1134 1135 // AND instruction will essentially update SF and clear OF, so 1136 // NoSignFlag should be false in the sense that SF is modified by `AND`. 
1137 // 1138 // However, the implementation artifically sets `NoSignFlag` to true 1139 // to poison the SF bit; that is to say, if SF is looked at later, the 1140 // optimization (to erase TEST64rr) will be disabled. 1141 // 1142 // The reason to poison SF bit is that SF bit value could be different 1143 // in the `AND` and `TEST` operation; signed bit is not known for `AND`, 1144 // and is known to be 0 as a result of `TEST64rr`. 1145 // 1146 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into 1147 // the AND instruction and using the static information to guide peephole 1148 // optimization if possible. For example, it's possible to fold a 1149 // conditional move into a copy if the relevant EFLAG bits could be deduced 1150 // from an immediate operand of and operation. 1151 // 1152 NoSignFlag = true; 1153 // ClearsOverflowFlag is true for AND operation (no surprise). 1154 ClearsOverflowFlag = true; 1155 return true; 1156 } 1157 return false; 1158 } 1159 1160 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, 1161 unsigned Opc, bool AllowSP, Register &NewSrc, 1162 bool &isKill, MachineOperand &ImplicitOp, 1163 LiveVariables *LV, LiveIntervals *LIS) const { 1164 MachineFunction &MF = *MI.getParent()->getParent(); 1165 const TargetRegisterClass *RC; 1166 if (AllowSP) { 1167 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; 1168 } else { 1169 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; 1170 } 1171 Register SrcReg = Src.getReg(); 1172 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr); 1173 1174 // For both LEA64 and LEA32 the register already has essentially the right 1175 // type (32-bit or 64-bit) we may just need to forbid SP. 1176 if (Opc != X86::LEA64_32r) { 1177 NewSrc = SrcReg; 1178 assert(!Src.isUndef() && "Undef op doesn't need optimization"); 1179 1180 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) 1181 return false; 1182 1183 return true; 1184 } 1185 1186 // This is for an LEA64_32r and incoming registers are 32-bit. One way or 1187 // another we need to add 64-bit registers to the final MI. 1188 if (SrcReg.isPhysical()) { 1189 ImplicitOp = Src; 1190 ImplicitOp.setImplicit(); 1191 1192 NewSrc = getX86SubSuperRegister(SrcReg, 64); 1193 assert(NewSrc.isValid() && "Invalid Operand"); 1194 assert(!Src.isUndef() && "Undef op doesn't need optimization"); 1195 } else { 1196 // Virtual register of the wrong class, we have to create a temporary 64-bit 1197 // vreg to feed into the LEA. 1198 NewSrc = MF.getRegInfo().createVirtualRegister(RC); 1199 MachineInstr *Copy = 1200 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) 1201 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) 1202 .addReg(SrcReg, getKillRegState(isKill)); 1203 1204 // Which is obviously going to be dead after we're done with it. 1205 isKill = true; 1206 1207 if (LV) 1208 LV->replaceKillInstruction(SrcReg, MI, *Copy); 1209 1210 if (LIS) { 1211 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy); 1212 SlotIndex Idx = LIS->getInstructionIndex(MI); 1213 LiveInterval &LI = LIS->getInterval(SrcReg); 1214 LiveRange::Segment *S = LI.getSegmentContaining(Idx); 1215 if (S->end.getBaseIndex() == Idx) 1216 S->end = CopyIdx.getRegSlot(); 1217 } 1218 } 1219 1220 // We've set all the parameters without issue. 
1221 return true; 1222 } 1223 1224 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, 1225 MachineInstr &MI, 1226 LiveVariables *LV, 1227 LiveIntervals *LIS, 1228 bool Is8BitOp) const { 1229 // We handle 8-bit adds and various 16-bit opcodes in the switch below. 1230 MachineBasicBlock &MBB = *MI.getParent(); 1231 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); 1232 assert((Is8BitOp || 1233 RegInfo.getTargetRegisterInfo()->getRegSizeInBits( 1234 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && 1235 "Unexpected type for LEA transform"); 1236 1237 // TODO: For a 32-bit target, we need to adjust the LEA variables with 1238 // something like this: 1239 // Opcode = X86::LEA32r; 1240 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); 1241 // OutRegLEA = 1242 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass) 1243 // : RegInfo.createVirtualRegister(&X86::GR32RegClass); 1244 if (!Subtarget.is64Bit()) 1245 return nullptr; 1246 1247 unsigned Opcode = X86::LEA64_32r; 1248 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); 1249 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); 1250 Register InRegLEA2; 1251 1252 // Build and insert into an implicit UNDEF value. This is OK because 1253 // we will be shifting and then extracting the lower 8/16-bits. 1254 // This has the potential to cause partial register stall. e.g. 1255 // movw (%rbp,%rcx,2), %dx 1256 // leal -65(%rdx), %esi 1257 // But testing has shown this *does* help performance in 64-bit mode (at 1258 // least on modern x86 machines). 1259 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1260 Register Dest = MI.getOperand(0).getReg(); 1261 Register Src = MI.getOperand(1).getReg(); 1262 Register Src2; 1263 bool IsDead = MI.getOperand(0).isDead(); 1264 bool IsKill = MI.getOperand(1).isKill(); 1265 unsigned SubReg = Is8BitOp ? 
X86::sub_8bit : X86::sub_16bit; 1266 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); 1267 MachineInstr *ImpDef = 1268 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); 1269 MachineInstr *InsMI = 1270 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) 1271 .addReg(InRegLEA, RegState::Define, SubReg) 1272 .addReg(Src, getKillRegState(IsKill)); 1273 MachineInstr *ImpDef2 = nullptr; 1274 MachineInstr *InsMI2 = nullptr; 1275 1276 MachineInstrBuilder MIB = 1277 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); 1278 switch (MIOpc) { 1279 default: 1280 llvm_unreachable("Unreachable!"); 1281 case X86::SHL8ri: 1282 case X86::SHL16ri: { 1283 unsigned ShAmt = MI.getOperand(2).getImm(); 1284 MIB.addReg(0) 1285 .addImm(1LL << ShAmt) 1286 .addReg(InRegLEA, RegState::Kill) 1287 .addImm(0) 1288 .addReg(0); 1289 break; 1290 } 1291 case X86::INC8r: 1292 case X86::INC16r: 1293 addRegOffset(MIB, InRegLEA, true, 1); 1294 break; 1295 case X86::DEC8r: 1296 case X86::DEC16r: 1297 addRegOffset(MIB, InRegLEA, true, -1); 1298 break; 1299 case X86::ADD8ri: 1300 case X86::ADD8ri_DB: 1301 case X86::ADD16ri: 1302 case X86::ADD16ri_DB: 1303 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); 1304 break; 1305 case X86::ADD8rr: 1306 case X86::ADD8rr_DB: 1307 case X86::ADD16rr: 1308 case X86::ADD16rr_DB: { 1309 Src2 = MI.getOperand(2).getReg(); 1310 bool IsKill2 = MI.getOperand(2).isKill(); 1311 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); 1312 if (Src == Src2) { 1313 // ADD8rr/ADD16rr killed %reg1028, %reg1028 1314 // just a single insert_subreg. 1315 addRegReg(MIB, InRegLEA, true, InRegLEA, false); 1316 } else { 1317 if (Subtarget.is64Bit()) 1318 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); 1319 else 1320 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); 1321 // Build and insert into an implicit UNDEF value. This is OK because 1322 // we will be shifting and then extracting the lower 8/16-bits. 1323 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), 1324 InRegLEA2); 1325 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) 1326 .addReg(InRegLEA2, RegState::Define, SubReg) 1327 .addReg(Src2, getKillRegState(IsKill2)); 1328 addRegReg(MIB, InRegLEA, true, InRegLEA2, true); 1329 } 1330 if (LV && IsKill2 && InsMI2) 1331 LV->replaceKillInstruction(Src2, MI, *InsMI2); 1332 break; 1333 } 1334 } 1335 1336 MachineInstr *NewMI = MIB; 1337 MachineInstr *ExtMI = 1338 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) 1339 .addReg(Dest, RegState::Define | getDeadRegState(IsDead)) 1340 .addReg(OutRegLEA, RegState::Kill, SubReg); 1341 1342 if (LV) { 1343 // Update live variables. 
1344 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI); 1345 if (InRegLEA2) 1346 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI); 1347 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI); 1348 if (IsKill) 1349 LV->replaceKillInstruction(Src, MI, *InsMI); 1350 if (IsDead) 1351 LV->replaceKillInstruction(Dest, MI, *ExtMI); 1352 } 1353 1354 if (LIS) { 1355 LIS->InsertMachineInstrInMaps(*ImpDef); 1356 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI); 1357 if (ImpDef2) 1358 LIS->InsertMachineInstrInMaps(*ImpDef2); 1359 SlotIndex Ins2Idx; 1360 if (InsMI2) 1361 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2); 1362 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI); 1363 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI); 1364 LIS->getInterval(InRegLEA); 1365 LIS->getInterval(OutRegLEA); 1366 if (InRegLEA2) 1367 LIS->getInterval(InRegLEA2); 1368 1369 // Move the use of Src up to InsMI. 1370 LiveInterval &SrcLI = LIS->getInterval(Src); 1371 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx); 1372 if (SrcSeg->end == NewIdx.getRegSlot()) 1373 SrcSeg->end = InsIdx.getRegSlot(); 1374 1375 if (InsMI2) { 1376 // Move the use of Src2 up to InsMI2. 1377 LiveInterval &Src2LI = LIS->getInterval(Src2); 1378 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx); 1379 if (Src2Seg->end == NewIdx.getRegSlot()) 1380 Src2Seg->end = Ins2Idx.getRegSlot(); 1381 } 1382 1383 // Move the definition of Dest down to ExtMI. 1384 LiveInterval &DestLI = LIS->getInterval(Dest); 1385 LiveRange::Segment *DestSeg = 1386 DestLI.getSegmentContaining(NewIdx.getRegSlot()); 1387 assert(DestSeg->start == NewIdx.getRegSlot() && 1388 DestSeg->valno->def == NewIdx.getRegSlot()); 1389 DestSeg->start = ExtIdx.getRegSlot(); 1390 DestSeg->valno->def = ExtIdx.getRegSlot(); 1391 } 1392 1393 return ExtMI; 1394 } 1395 1396 /// This method must be implemented by targets that 1397 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target 1398 /// may be able to convert a two-address instruction into a true 1399 /// three-address instruction on demand. This allows the X86 target (for 1400 /// example) to convert ADD and SHL instructions into LEA instructions if they 1401 /// would require register copies due to two-addressness. 1402 /// 1403 /// This method returns a null pointer if the transformation cannot be 1404 /// performed, otherwise it returns the new instruction. 1405 /// 1406 MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, 1407 LiveVariables *LV, 1408 LiveIntervals *LIS) const { 1409 // The following opcodes also sets the condition code register(s). Only 1410 // convert them to equivalent lea if the condition code register def's 1411 // are dead! 1412 if (hasLiveCondCodeDef(MI)) 1413 return nullptr; 1414 1415 MachineFunction &MF = *MI.getParent()->getParent(); 1416 // All instructions input are two-addr instructions. Get the known operands. 1417 const MachineOperand &Dest = MI.getOperand(0); 1418 const MachineOperand &Src = MI.getOperand(1); 1419 1420 // Ideally, operations with undef should be folded before we get here, but we 1421 // can't guarantee it. Bail out because optimizing undefs is a waste of time. 1422 // Without this, we have to forward undef state to new register operands to 1423 // avoid machine verifier errors. 
1424 if (Src.isUndef()) 1425 return nullptr; 1426 if (MI.getNumOperands() > 2) 1427 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef()) 1428 return nullptr; 1429 1430 MachineInstr *NewMI = nullptr; 1431 Register SrcReg, SrcReg2; 1432 bool Is64Bit = Subtarget.is64Bit(); 1433 1434 bool Is8BitOp = false; 1435 unsigned NumRegOperands = 2; 1436 unsigned MIOpc = MI.getOpcode(); 1437 switch (MIOpc) { 1438 default: 1439 llvm_unreachable("Unreachable!"); 1440 case X86::SHL64ri: { 1441 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); 1442 unsigned ShAmt = getTruncatedShiftCount(MI, 2); 1443 if (!isTruncatedShiftCountForLEA(ShAmt)) 1444 return nullptr; 1445 1446 // LEA can't handle RSP. 1447 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( 1448 Src.getReg(), &X86::GR64_NOSPRegClass)) 1449 return nullptr; 1450 1451 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) 1452 .add(Dest) 1453 .addReg(0) 1454 .addImm(1LL << ShAmt) 1455 .add(Src) 1456 .addImm(0) 1457 .addReg(0); 1458 break; 1459 } 1460 case X86::SHL32ri: { 1461 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); 1462 unsigned ShAmt = getTruncatedShiftCount(MI, 2); 1463 if (!isTruncatedShiftCountForLEA(ShAmt)) 1464 return nullptr; 1465 1466 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; 1467 1468 // LEA can't handle ESP. 1469 bool isKill; 1470 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1471 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, 1472 ImplicitOp, LV, LIS)) 1473 return nullptr; 1474 1475 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1476 .add(Dest) 1477 .addReg(0) 1478 .addImm(1LL << ShAmt) 1479 .addReg(SrcReg, getKillRegState(isKill)) 1480 .addImm(0) 1481 .addReg(0); 1482 if (ImplicitOp.getReg() != 0) 1483 MIB.add(ImplicitOp); 1484 NewMI = MIB; 1485 1486 // Add kills if classifyLEAReg created a new register. 1487 if (LV && SrcReg != Src.getReg()) 1488 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1489 break; 1490 } 1491 case X86::SHL8ri: 1492 Is8BitOp = true; 1493 [[fallthrough]]; 1494 case X86::SHL16ri: { 1495 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); 1496 unsigned ShAmt = getTruncatedShiftCount(MI, 2); 1497 if (!isTruncatedShiftCountForLEA(ShAmt)) 1498 return nullptr; 1499 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); 1500 } 1501 case X86::INC64r: 1502 case X86::INC32r: { 1503 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); 1504 unsigned Opc = MIOpc == X86::INC64r 1505 ? X86::LEA64r 1506 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); 1507 bool isKill; 1508 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1509 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, 1510 ImplicitOp, LV, LIS)) 1511 return nullptr; 1512 1513 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1514 .add(Dest) 1515 .addReg(SrcReg, getKillRegState(isKill)); 1516 if (ImplicitOp.getReg() != 0) 1517 MIB.add(ImplicitOp); 1518 1519 NewMI = addOffset(MIB, 1); 1520 1521 // Add kills if classifyLEAReg created a new register. 1522 if (LV && SrcReg != Src.getReg()) 1523 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1524 break; 1525 } 1526 case X86::DEC64r: 1527 case X86::DEC32r: { 1528 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); 1529 unsigned Opc = MIOpc == X86::DEC64r 1530 ? X86::LEA64r 1531 : (Is64Bit ? 
X86::LEA64_32r : X86::LEA32r); 1532 1533 bool isKill; 1534 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1535 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, 1536 ImplicitOp, LV, LIS)) 1537 return nullptr; 1538 1539 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1540 .add(Dest) 1541 .addReg(SrcReg, getKillRegState(isKill)); 1542 if (ImplicitOp.getReg() != 0) 1543 MIB.add(ImplicitOp); 1544 1545 NewMI = addOffset(MIB, -1); 1546 1547 // Add kills if classifyLEAReg created a new register. 1548 if (LV && SrcReg != Src.getReg()) 1549 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1550 break; 1551 } 1552 case X86::DEC8r: 1553 case X86::INC8r: 1554 Is8BitOp = true; 1555 [[fallthrough]]; 1556 case X86::DEC16r: 1557 case X86::INC16r: 1558 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); 1559 case X86::ADD64rr: 1560 case X86::ADD64rr_DB: 1561 case X86::ADD32rr: 1562 case X86::ADD32rr_DB: { 1563 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); 1564 unsigned Opc; 1565 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) 1566 Opc = X86::LEA64r; 1567 else 1568 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; 1569 1570 const MachineOperand &Src2 = MI.getOperand(2); 1571 bool isKill2; 1572 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); 1573 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2, 1574 ImplicitOp2, LV, LIS)) 1575 return nullptr; 1576 1577 bool isKill; 1578 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1579 if (Src.getReg() == Src2.getReg()) { 1580 // Don't call classify LEAReg a second time on the same register, in case 1581 // the first call inserted a COPY from Src2 and marked it as killed. 1582 isKill = isKill2; 1583 SrcReg = SrcReg2; 1584 } else { 1585 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, 1586 ImplicitOp, LV, LIS)) 1587 return nullptr; 1588 } 1589 1590 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest); 1591 if (ImplicitOp.getReg() != 0) 1592 MIB.add(ImplicitOp); 1593 if (ImplicitOp2.getReg() != 0) 1594 MIB.add(ImplicitOp2); 1595 1596 NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); 1597 1598 // Add kills if classifyLEAReg created a new register. 1599 if (LV) { 1600 if (SrcReg2 != Src2.getReg()) 1601 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI); 1602 if (SrcReg != SrcReg2 && SrcReg != Src.getReg()) 1603 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1604 } 1605 NumRegOperands = 3; 1606 break; 1607 } 1608 case X86::ADD8rr: 1609 case X86::ADD8rr_DB: 1610 Is8BitOp = true; 1611 [[fallthrough]]; 1612 case X86::ADD16rr: 1613 case X86::ADD16rr_DB: 1614 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); 1615 case X86::ADD64ri32: 1616 case X86::ADD64ri32_DB: 1617 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); 1618 NewMI = addOffset( 1619 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src), 1620 MI.getOperand(2)); 1621 break; 1622 case X86::ADD32ri: 1623 case X86::ADD32ri_DB: { 1624 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); 1625 unsigned Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; 1626 1627 bool isKill; 1628 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1629 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, 1630 ImplicitOp, LV, LIS)) 1631 return nullptr; 1632 1633 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1634 .add(Dest) 1635 .addReg(SrcReg, getKillRegState(isKill)); 1636 if (ImplicitOp.getReg() != 0) 1637 MIB.add(ImplicitOp); 1638 1639 NewMI = addOffset(MIB, MI.getOperand(2)); 1640 1641 // Add kills if classifyLEAReg created a new register. 1642 if (LV && SrcReg != Src.getReg()) 1643 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1644 break; 1645 } 1646 case X86::ADD8ri: 1647 case X86::ADD8ri_DB: 1648 Is8BitOp = true; 1649 [[fallthrough]]; 1650 case X86::ADD16ri: 1651 case X86::ADD16ri_DB: 1652 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); 1653 case X86::SUB8ri: 1654 case X86::SUB16ri: 1655 /// FIXME: Support these similar to ADD8ri/ADD16ri*. 1656 return nullptr; 1657 case X86::SUB32ri: { 1658 if (!MI.getOperand(2).isImm()) 1659 return nullptr; 1660 int64_t Imm = MI.getOperand(2).getImm(); 1661 if (!isInt<32>(-Imm)) 1662 return nullptr; 1663 1664 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); 1665 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; 1666 1667 bool isKill; 1668 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); 1669 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, 1670 ImplicitOp, LV, LIS)) 1671 return nullptr; 1672 1673 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1674 .add(Dest) 1675 .addReg(SrcReg, getKillRegState(isKill)); 1676 if (ImplicitOp.getReg() != 0) 1677 MIB.add(ImplicitOp); 1678 1679 NewMI = addOffset(MIB, -Imm); 1680 1681 // Add kills if classifyLEAReg created a new register. 
1682 if (LV && SrcReg != Src.getReg()) 1683 LV->getVarInfo(SrcReg).Kills.push_back(NewMI); 1684 break; 1685 } 1686 1687 case X86::SUB64ri32: { 1688 if (!MI.getOperand(2).isImm()) 1689 return nullptr; 1690 int64_t Imm = MI.getOperand(2).getImm(); 1691 if (!isInt<32>(-Imm)) 1692 return nullptr; 1693 1694 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); 1695 1696 MachineInstrBuilder MIB = 1697 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src); 1698 NewMI = addOffset(MIB, -Imm); 1699 break; 1700 } 1701 1702 case X86::VMOVDQU8Z128rmk: 1703 case X86::VMOVDQU8Z256rmk: 1704 case X86::VMOVDQU8Zrmk: 1705 case X86::VMOVDQU16Z128rmk: 1706 case X86::VMOVDQU16Z256rmk: 1707 case X86::VMOVDQU16Zrmk: 1708 case X86::VMOVDQU32Z128rmk: 1709 case X86::VMOVDQA32Z128rmk: 1710 case X86::VMOVDQU32Z256rmk: 1711 case X86::VMOVDQA32Z256rmk: 1712 case X86::VMOVDQU32Zrmk: 1713 case X86::VMOVDQA32Zrmk: 1714 case X86::VMOVDQU64Z128rmk: 1715 case X86::VMOVDQA64Z128rmk: 1716 case X86::VMOVDQU64Z256rmk: 1717 case X86::VMOVDQA64Z256rmk: 1718 case X86::VMOVDQU64Zrmk: 1719 case X86::VMOVDQA64Zrmk: 1720 case X86::VMOVUPDZ128rmk: 1721 case X86::VMOVAPDZ128rmk: 1722 case X86::VMOVUPDZ256rmk: 1723 case X86::VMOVAPDZ256rmk: 1724 case X86::VMOVUPDZrmk: 1725 case X86::VMOVAPDZrmk: 1726 case X86::VMOVUPSZ128rmk: 1727 case X86::VMOVAPSZ128rmk: 1728 case X86::VMOVUPSZ256rmk: 1729 case X86::VMOVAPSZ256rmk: 1730 case X86::VMOVUPSZrmk: 1731 case X86::VMOVAPSZrmk: 1732 case X86::VBROADCASTSDZ256rmk: 1733 case X86::VBROADCASTSDZrmk: 1734 case X86::VBROADCASTSSZ128rmk: 1735 case X86::VBROADCASTSSZ256rmk: 1736 case X86::VBROADCASTSSZrmk: 1737 case X86::VPBROADCASTDZ128rmk: 1738 case X86::VPBROADCASTDZ256rmk: 1739 case X86::VPBROADCASTDZrmk: 1740 case X86::VPBROADCASTQZ128rmk: 1741 case X86::VPBROADCASTQZ256rmk: 1742 case X86::VPBROADCASTQZrmk: { 1743 unsigned Opc; 1744 switch (MIOpc) { 1745 default: 1746 llvm_unreachable("Unreachable!"); 1747 case X86::VMOVDQU8Z128rmk: 1748 Opc = X86::VPBLENDMBZ128rmk; 1749 break; 1750 case X86::VMOVDQU8Z256rmk: 1751 Opc = X86::VPBLENDMBZ256rmk; 1752 break; 1753 case X86::VMOVDQU8Zrmk: 1754 Opc = X86::VPBLENDMBZrmk; 1755 break; 1756 case X86::VMOVDQU16Z128rmk: 1757 Opc = X86::VPBLENDMWZ128rmk; 1758 break; 1759 case X86::VMOVDQU16Z256rmk: 1760 Opc = X86::VPBLENDMWZ256rmk; 1761 break; 1762 case X86::VMOVDQU16Zrmk: 1763 Opc = X86::VPBLENDMWZrmk; 1764 break; 1765 case X86::VMOVDQU32Z128rmk: 1766 Opc = X86::VPBLENDMDZ128rmk; 1767 break; 1768 case X86::VMOVDQU32Z256rmk: 1769 Opc = X86::VPBLENDMDZ256rmk; 1770 break; 1771 case X86::VMOVDQU32Zrmk: 1772 Opc = X86::VPBLENDMDZrmk; 1773 break; 1774 case X86::VMOVDQU64Z128rmk: 1775 Opc = X86::VPBLENDMQZ128rmk; 1776 break; 1777 case X86::VMOVDQU64Z256rmk: 1778 Opc = X86::VPBLENDMQZ256rmk; 1779 break; 1780 case X86::VMOVDQU64Zrmk: 1781 Opc = X86::VPBLENDMQZrmk; 1782 break; 1783 case X86::VMOVUPDZ128rmk: 1784 Opc = X86::VBLENDMPDZ128rmk; 1785 break; 1786 case X86::VMOVUPDZ256rmk: 1787 Opc = X86::VBLENDMPDZ256rmk; 1788 break; 1789 case X86::VMOVUPDZrmk: 1790 Opc = X86::VBLENDMPDZrmk; 1791 break; 1792 case X86::VMOVUPSZ128rmk: 1793 Opc = X86::VBLENDMPSZ128rmk; 1794 break; 1795 case X86::VMOVUPSZ256rmk: 1796 Opc = X86::VBLENDMPSZ256rmk; 1797 break; 1798 case X86::VMOVUPSZrmk: 1799 Opc = X86::VBLENDMPSZrmk; 1800 break; 1801 case X86::VMOVDQA32Z128rmk: 1802 Opc = X86::VPBLENDMDZ128rmk; 1803 break; 1804 case X86::VMOVDQA32Z256rmk: 1805 Opc = X86::VPBLENDMDZ256rmk; 1806 break; 1807 case X86::VMOVDQA32Zrmk: 1808 Opc = X86::VPBLENDMDZrmk; 1809 break; 
1810 case X86::VMOVDQA64Z128rmk: 1811 Opc = X86::VPBLENDMQZ128rmk; 1812 break; 1813 case X86::VMOVDQA64Z256rmk: 1814 Opc = X86::VPBLENDMQZ256rmk; 1815 break; 1816 case X86::VMOVDQA64Zrmk: 1817 Opc = X86::VPBLENDMQZrmk; 1818 break; 1819 case X86::VMOVAPDZ128rmk: 1820 Opc = X86::VBLENDMPDZ128rmk; 1821 break; 1822 case X86::VMOVAPDZ256rmk: 1823 Opc = X86::VBLENDMPDZ256rmk; 1824 break; 1825 case X86::VMOVAPDZrmk: 1826 Opc = X86::VBLENDMPDZrmk; 1827 break; 1828 case X86::VMOVAPSZ128rmk: 1829 Opc = X86::VBLENDMPSZ128rmk; 1830 break; 1831 case X86::VMOVAPSZ256rmk: 1832 Opc = X86::VBLENDMPSZ256rmk; 1833 break; 1834 case X86::VMOVAPSZrmk: 1835 Opc = X86::VBLENDMPSZrmk; 1836 break; 1837 case X86::VBROADCASTSDZ256rmk: 1838 Opc = X86::VBLENDMPDZ256rmbk; 1839 break; 1840 case X86::VBROADCASTSDZrmk: 1841 Opc = X86::VBLENDMPDZrmbk; 1842 break; 1843 case X86::VBROADCASTSSZ128rmk: 1844 Opc = X86::VBLENDMPSZ128rmbk; 1845 break; 1846 case X86::VBROADCASTSSZ256rmk: 1847 Opc = X86::VBLENDMPSZ256rmbk; 1848 break; 1849 case X86::VBROADCASTSSZrmk: 1850 Opc = X86::VBLENDMPSZrmbk; 1851 break; 1852 case X86::VPBROADCASTDZ128rmk: 1853 Opc = X86::VPBLENDMDZ128rmbk; 1854 break; 1855 case X86::VPBROADCASTDZ256rmk: 1856 Opc = X86::VPBLENDMDZ256rmbk; 1857 break; 1858 case X86::VPBROADCASTDZrmk: 1859 Opc = X86::VPBLENDMDZrmbk; 1860 break; 1861 case X86::VPBROADCASTQZ128rmk: 1862 Opc = X86::VPBLENDMQZ128rmbk; 1863 break; 1864 case X86::VPBROADCASTQZ256rmk: 1865 Opc = X86::VPBLENDMQZ256rmbk; 1866 break; 1867 case X86::VPBROADCASTQZrmk: 1868 Opc = X86::VPBLENDMQZrmbk; 1869 break; 1870 } 1871 1872 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 1873 .add(Dest) 1874 .add(MI.getOperand(2)) 1875 .add(Src) 1876 .add(MI.getOperand(3)) 1877 .add(MI.getOperand(4)) 1878 .add(MI.getOperand(5)) 1879 .add(MI.getOperand(6)) 1880 .add(MI.getOperand(7)); 1881 NumRegOperands = 4; 1882 break; 1883 } 1884 1885 case X86::VMOVDQU8Z128rrk: 1886 case X86::VMOVDQU8Z256rrk: 1887 case X86::VMOVDQU8Zrrk: 1888 case X86::VMOVDQU16Z128rrk: 1889 case X86::VMOVDQU16Z256rrk: 1890 case X86::VMOVDQU16Zrrk: 1891 case X86::VMOVDQU32Z128rrk: 1892 case X86::VMOVDQA32Z128rrk: 1893 case X86::VMOVDQU32Z256rrk: 1894 case X86::VMOVDQA32Z256rrk: 1895 case X86::VMOVDQU32Zrrk: 1896 case X86::VMOVDQA32Zrrk: 1897 case X86::VMOVDQU64Z128rrk: 1898 case X86::VMOVDQA64Z128rrk: 1899 case X86::VMOVDQU64Z256rrk: 1900 case X86::VMOVDQA64Z256rrk: 1901 case X86::VMOVDQU64Zrrk: 1902 case X86::VMOVDQA64Zrrk: 1903 case X86::VMOVUPDZ128rrk: 1904 case X86::VMOVAPDZ128rrk: 1905 case X86::VMOVUPDZ256rrk: 1906 case X86::VMOVAPDZ256rrk: 1907 case X86::VMOVUPDZrrk: 1908 case X86::VMOVAPDZrrk: 1909 case X86::VMOVUPSZ128rrk: 1910 case X86::VMOVAPSZ128rrk: 1911 case X86::VMOVUPSZ256rrk: 1912 case X86::VMOVAPSZ256rrk: 1913 case X86::VMOVUPSZrrk: 1914 case X86::VMOVAPSZrrk: { 1915 unsigned Opc; 1916 switch (MIOpc) { 1917 default: 1918 llvm_unreachable("Unreachable!"); 1919 case X86::VMOVDQU8Z128rrk: 1920 Opc = X86::VPBLENDMBZ128rrk; 1921 break; 1922 case X86::VMOVDQU8Z256rrk: 1923 Opc = X86::VPBLENDMBZ256rrk; 1924 break; 1925 case X86::VMOVDQU8Zrrk: 1926 Opc = X86::VPBLENDMBZrrk; 1927 break; 1928 case X86::VMOVDQU16Z128rrk: 1929 Opc = X86::VPBLENDMWZ128rrk; 1930 break; 1931 case X86::VMOVDQU16Z256rrk: 1932 Opc = X86::VPBLENDMWZ256rrk; 1933 break; 1934 case X86::VMOVDQU16Zrrk: 1935 Opc = X86::VPBLENDMWZrrk; 1936 break; 1937 case X86::VMOVDQU32Z128rrk: 1938 Opc = X86::VPBLENDMDZ128rrk; 1939 break; 1940 case X86::VMOVDQU32Z256rrk: 1941 Opc = X86::VPBLENDMDZ256rrk; 1942 break; 1943 case 
X86::VMOVDQU32Zrrk: 1944 Opc = X86::VPBLENDMDZrrk; 1945 break; 1946 case X86::VMOVDQU64Z128rrk: 1947 Opc = X86::VPBLENDMQZ128rrk; 1948 break; 1949 case X86::VMOVDQU64Z256rrk: 1950 Opc = X86::VPBLENDMQZ256rrk; 1951 break; 1952 case X86::VMOVDQU64Zrrk: 1953 Opc = X86::VPBLENDMQZrrk; 1954 break; 1955 case X86::VMOVUPDZ128rrk: 1956 Opc = X86::VBLENDMPDZ128rrk; 1957 break; 1958 case X86::VMOVUPDZ256rrk: 1959 Opc = X86::VBLENDMPDZ256rrk; 1960 break; 1961 case X86::VMOVUPDZrrk: 1962 Opc = X86::VBLENDMPDZrrk; 1963 break; 1964 case X86::VMOVUPSZ128rrk: 1965 Opc = X86::VBLENDMPSZ128rrk; 1966 break; 1967 case X86::VMOVUPSZ256rrk: 1968 Opc = X86::VBLENDMPSZ256rrk; 1969 break; 1970 case X86::VMOVUPSZrrk: 1971 Opc = X86::VBLENDMPSZrrk; 1972 break; 1973 case X86::VMOVDQA32Z128rrk: 1974 Opc = X86::VPBLENDMDZ128rrk; 1975 break; 1976 case X86::VMOVDQA32Z256rrk: 1977 Opc = X86::VPBLENDMDZ256rrk; 1978 break; 1979 case X86::VMOVDQA32Zrrk: 1980 Opc = X86::VPBLENDMDZrrk; 1981 break; 1982 case X86::VMOVDQA64Z128rrk: 1983 Opc = X86::VPBLENDMQZ128rrk; 1984 break; 1985 case X86::VMOVDQA64Z256rrk: 1986 Opc = X86::VPBLENDMQZ256rrk; 1987 break; 1988 case X86::VMOVDQA64Zrrk: 1989 Opc = X86::VPBLENDMQZrrk; 1990 break; 1991 case X86::VMOVAPDZ128rrk: 1992 Opc = X86::VBLENDMPDZ128rrk; 1993 break; 1994 case X86::VMOVAPDZ256rrk: 1995 Opc = X86::VBLENDMPDZ256rrk; 1996 break; 1997 case X86::VMOVAPDZrrk: 1998 Opc = X86::VBLENDMPDZrrk; 1999 break; 2000 case X86::VMOVAPSZ128rrk: 2001 Opc = X86::VBLENDMPSZ128rrk; 2002 break; 2003 case X86::VMOVAPSZ256rrk: 2004 Opc = X86::VBLENDMPSZ256rrk; 2005 break; 2006 case X86::VMOVAPSZrrk: 2007 Opc = X86::VBLENDMPSZrrk; 2008 break; 2009 } 2010 2011 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) 2012 .add(Dest) 2013 .add(MI.getOperand(2)) 2014 .add(Src) 2015 .add(MI.getOperand(3)); 2016 NumRegOperands = 4; 2017 break; 2018 } 2019 } 2020 2021 if (!NewMI) 2022 return nullptr; 2023 2024 if (LV) { // Update live variables 2025 for (unsigned I = 0; I < NumRegOperands; ++I) { 2026 MachineOperand &Op = MI.getOperand(I); 2027 if (Op.isReg() && (Op.isDead() || Op.isKill())) 2028 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI); 2029 } 2030 } 2031 2032 MachineBasicBlock &MBB = *MI.getParent(); 2033 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst 2034 2035 if (LIS) { 2036 LIS->ReplaceMachineInstrInMaps(MI, *NewMI); 2037 if (SrcReg) 2038 LIS->getInterval(SrcReg); 2039 if (SrcReg2) 2040 LIS->getInterval(SrcReg2); 2041 } 2042 2043 return NewMI; 2044 } 2045 2046 /// This determines which of three possible cases of a three source commute 2047 /// the source indexes correspond to taking into account any mask operands. 2048 /// All prevents commuting a passthru operand. Returns -1 if the commute isn't 2049 /// possible. 2050 /// Case 0 - Possible to commute the first and second operands. 2051 /// Case 1 - Possible to commute the first and third operands. 2052 /// Case 2 - Possible to commute the second and third operands. 2053 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, 2054 unsigned SrcOpIdx2) { 2055 // Put the lowest index to SrcOpIdx1 to simplify the checks below. 
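  // (For reference: an unmasked three-source op keeps its commutable vector
  // sources at operand indices 1, 2 and 3; k-masked forms carry the mask at
  // index 2, which is why Op2 and Op3 are bumped below.)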
2056 if (SrcOpIdx1 > SrcOpIdx2) 2057 std::swap(SrcOpIdx1, SrcOpIdx2); 2058 2059 unsigned Op1 = 1, Op2 = 2, Op3 = 3; 2060 if (X86II::isKMasked(TSFlags)) { 2061 Op2++; 2062 Op3++; 2063 } 2064 2065 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2) 2066 return 0; 2067 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3) 2068 return 1; 2069 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3) 2070 return 2; 2071 llvm_unreachable("Unknown three src commute case."); 2072 } 2073 2074 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( 2075 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, 2076 const X86InstrFMA3Group &FMA3Group) const { 2077 2078 unsigned Opc = MI.getOpcode(); 2079 2080 // TODO: Commuting the 1st operand of FMA*_Int requires some additional 2081 // analysis. The commute optimization is legal only if all users of FMA*_Int 2082 // use only the lowest element of the FMA*_Int instruction. Such analysis are 2083 // not implemented yet. So, just return 0 in that case. 2084 // When such analysis are available this place will be the right place for 2085 // calling it. 2086 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) && 2087 "Intrinsic instructions can't commute operand 1"); 2088 2089 // Determine which case this commute is or if it can't be done. 2090 unsigned Case = 2091 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); 2092 assert(Case < 3 && "Unexpected case number!"); 2093 2094 // Define the FMA forms mapping array that helps to map input FMA form 2095 // to output FMA form to preserve the operation semantics after 2096 // commuting the operands. 2097 const unsigned Form132Index = 0; 2098 const unsigned Form213Index = 1; 2099 const unsigned Form231Index = 2; 2100 static const unsigned FormMapping[][3] = { 2101 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; 2102 // FMA132 A, C, b; ==> FMA231 C, A, b; 2103 // FMA213 B, A, c; ==> FMA213 A, B, c; 2104 // FMA231 C, A, b; ==> FMA132 A, C, b; 2105 {Form231Index, Form213Index, Form132Index}, 2106 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; 2107 // FMA132 A, c, B; ==> FMA132 B, c, A; 2108 // FMA213 B, a, C; ==> FMA231 C, a, B; 2109 // FMA231 C, a, B; ==> FMA213 B, a, C; 2110 {Form132Index, Form231Index, Form213Index}, 2111 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; 2112 // FMA132 a, C, B; ==> FMA213 a, B, C; 2113 // FMA213 b, A, C; ==> FMA132 b, C, A; 2114 // FMA231 c, A, B; ==> FMA231 c, B, A; 2115 {Form213Index, Form132Index, Form231Index}}; 2116 2117 unsigned FMAForms[3]; 2118 FMAForms[0] = FMA3Group.get132Opcode(); 2119 FMAForms[1] = FMA3Group.get213Opcode(); 2120 FMAForms[2] = FMA3Group.get231Opcode(); 2121 2122 // Everything is ready, just adjust the FMA opcode and return it. 2123 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++) 2124 if (Opc == FMAForms[FormIndex]) 2125 return FMAForms[FormMapping[Case][FormIndex]]; 2126 2127 llvm_unreachable("Illegal FMA3 format"); 2128 } 2129 2130 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, 2131 unsigned SrcOpIdx2) { 2132 // Determine which case this commute is or if it can't be done. 2133 unsigned Case = 2134 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); 2135 assert(Case < 3 && "Unexpected case value!"); 2136 2137 // For each case we need to swap two pairs of bits in the final immediate. 2138 static const uint8_t SwapMasks[3][4] = { 2139 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5. 2140 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6. 2141 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6. 
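      // (Sketch of why these masks work: immediate bit (a<<2 | b<<1 | c) holds
      // the ternlog result for input bits a/b/c taken from sources 1/2/3, so
      // swapping two sources exchanges exactly the bit positions where those
      // two inputs differ, e.g. 2<->4 and 3<->5 for case 0 above.)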
2142 }; 2143 2144 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); 2145 // Clear out the bits we are swapping. 2146 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] | 2147 SwapMasks[Case][2] | SwapMasks[Case][3]); 2148 // If the immediate had a bit of the pair set, then set the opposite bit. 2149 if (Imm & SwapMasks[Case][0]) 2150 NewImm |= SwapMasks[Case][1]; 2151 if (Imm & SwapMasks[Case][1]) 2152 NewImm |= SwapMasks[Case][0]; 2153 if (Imm & SwapMasks[Case][2]) 2154 NewImm |= SwapMasks[Case][3]; 2155 if (Imm & SwapMasks[Case][3]) 2156 NewImm |= SwapMasks[Case][2]; 2157 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm); 2158 } 2159 2160 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be 2161 // commuted. 2162 static bool isCommutableVPERMV3Instruction(unsigned Opcode) { 2163 #define VPERM_CASES(Suffix) \ 2164 case X86::VPERMI2##Suffix##Z128rr: \ 2165 case X86::VPERMT2##Suffix##Z128rr: \ 2166 case X86::VPERMI2##Suffix##Z256rr: \ 2167 case X86::VPERMT2##Suffix##Z256rr: \ 2168 case X86::VPERMI2##Suffix##Zrr: \ 2169 case X86::VPERMT2##Suffix##Zrr: \ 2170 case X86::VPERMI2##Suffix##Z128rm: \ 2171 case X86::VPERMT2##Suffix##Z128rm: \ 2172 case X86::VPERMI2##Suffix##Z256rm: \ 2173 case X86::VPERMT2##Suffix##Z256rm: \ 2174 case X86::VPERMI2##Suffix##Zrm: \ 2175 case X86::VPERMT2##Suffix##Zrm: \ 2176 case X86::VPERMI2##Suffix##Z128rrkz: \ 2177 case X86::VPERMT2##Suffix##Z128rrkz: \ 2178 case X86::VPERMI2##Suffix##Z256rrkz: \ 2179 case X86::VPERMT2##Suffix##Z256rrkz: \ 2180 case X86::VPERMI2##Suffix##Zrrkz: \ 2181 case X86::VPERMT2##Suffix##Zrrkz: \ 2182 case X86::VPERMI2##Suffix##Z128rmkz: \ 2183 case X86::VPERMT2##Suffix##Z128rmkz: \ 2184 case X86::VPERMI2##Suffix##Z256rmkz: \ 2185 case X86::VPERMT2##Suffix##Z256rmkz: \ 2186 case X86::VPERMI2##Suffix##Zrmkz: \ 2187 case X86::VPERMT2##Suffix##Zrmkz: 2188 2189 #define VPERM_CASES_BROADCAST(Suffix) \ 2190 VPERM_CASES(Suffix) \ 2191 case X86::VPERMI2##Suffix##Z128rmb: \ 2192 case X86::VPERMT2##Suffix##Z128rmb: \ 2193 case X86::VPERMI2##Suffix##Z256rmb: \ 2194 case X86::VPERMT2##Suffix##Z256rmb: \ 2195 case X86::VPERMI2##Suffix##Zrmb: \ 2196 case X86::VPERMT2##Suffix##Zrmb: \ 2197 case X86::VPERMI2##Suffix##Z128rmbkz: \ 2198 case X86::VPERMT2##Suffix##Z128rmbkz: \ 2199 case X86::VPERMI2##Suffix##Z256rmbkz: \ 2200 case X86::VPERMT2##Suffix##Z256rmbkz: \ 2201 case X86::VPERMI2##Suffix##Zrmbkz: \ 2202 case X86::VPERMT2##Suffix##Zrmbkz: 2203 2204 switch (Opcode) { 2205 default: 2206 return false; 2207 VPERM_CASES(B) 2208 VPERM_CASES_BROADCAST(D) 2209 VPERM_CASES_BROADCAST(PD) 2210 VPERM_CASES_BROADCAST(PS) 2211 VPERM_CASES_BROADCAST(Q) 2212 VPERM_CASES(W) 2213 return true; 2214 } 2215 #undef VPERM_CASES_BROADCAST 2216 #undef VPERM_CASES 2217 } 2218 2219 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching 2220 // from the I opcode to the T opcode and vice versa. 
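// (Roughly: the T form ties a table operand and takes the indices as its
// second operand, while the I form ties the index operand instead; commuting
// those two register operands therefore yields the same shuffle under the
// opposite opcode, so only the mnemonic needs to change.)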
2221 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { 2222 #define VPERM_CASES(Orig, New) \ 2223 case X86::Orig##Z128rr: \ 2224 return X86::New##Z128rr; \ 2225 case X86::Orig##Z128rrkz: \ 2226 return X86::New##Z128rrkz; \ 2227 case X86::Orig##Z128rm: \ 2228 return X86::New##Z128rm; \ 2229 case X86::Orig##Z128rmkz: \ 2230 return X86::New##Z128rmkz; \ 2231 case X86::Orig##Z256rr: \ 2232 return X86::New##Z256rr; \ 2233 case X86::Orig##Z256rrkz: \ 2234 return X86::New##Z256rrkz; \ 2235 case X86::Orig##Z256rm: \ 2236 return X86::New##Z256rm; \ 2237 case X86::Orig##Z256rmkz: \ 2238 return X86::New##Z256rmkz; \ 2239 case X86::Orig##Zrr: \ 2240 return X86::New##Zrr; \ 2241 case X86::Orig##Zrrkz: \ 2242 return X86::New##Zrrkz; \ 2243 case X86::Orig##Zrm: \ 2244 return X86::New##Zrm; \ 2245 case X86::Orig##Zrmkz: \ 2246 return X86::New##Zrmkz; 2247 2248 #define VPERM_CASES_BROADCAST(Orig, New) \ 2249 VPERM_CASES(Orig, New) \ 2250 case X86::Orig##Z128rmb: \ 2251 return X86::New##Z128rmb; \ 2252 case X86::Orig##Z128rmbkz: \ 2253 return X86::New##Z128rmbkz; \ 2254 case X86::Orig##Z256rmb: \ 2255 return X86::New##Z256rmb; \ 2256 case X86::Orig##Z256rmbkz: \ 2257 return X86::New##Z256rmbkz; \ 2258 case X86::Orig##Zrmb: \ 2259 return X86::New##Zrmb; \ 2260 case X86::Orig##Zrmbkz: \ 2261 return X86::New##Zrmbkz; 2262 2263 switch (Opcode) { 2264 VPERM_CASES(VPERMI2B, VPERMT2B) 2265 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) 2266 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) 2267 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) 2268 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) 2269 VPERM_CASES(VPERMI2W, VPERMT2W) 2270 VPERM_CASES(VPERMT2B, VPERMI2B) 2271 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) 2272 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) 2273 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) 2274 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) 2275 VPERM_CASES(VPERMT2W, VPERMI2W) 2276 } 2277 2278 llvm_unreachable("Unreachable!"); 2279 #undef VPERM_CASES_BROADCAST 2280 #undef VPERM_CASES 2281 } 2282 2283 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 2284 unsigned OpIdx1, 2285 unsigned OpIdx2) const { 2286 auto CloneIfNew = [&](MachineInstr &MI) { 2287 return std::exchange(NewMI, false) 2288 ? 
MI.getParent()->getParent()->CloneMachineInstr(&MI) 2289 : &MI; 2290 }; 2291 MachineInstr *WorkingMI = nullptr; 2292 unsigned Opc = MI.getOpcode(); 2293 2294 #define CASE_ND(OP) \ 2295 case X86::OP: \ 2296 case X86::OP##_ND: 2297 2298 switch (Opc) { 2299 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I) 2300 CASE_ND(SHRD16rri8) 2301 CASE_ND(SHLD16rri8) 2302 CASE_ND(SHRD32rri8) 2303 CASE_ND(SHLD32rri8) 2304 CASE_ND(SHRD64rri8) 2305 CASE_ND(SHLD64rri8) { 2306 unsigned Size; 2307 switch (Opc) { 2308 default: 2309 llvm_unreachable("Unreachable!"); 2310 #define FROM_TO_SIZE(A, B, S) \ 2311 case X86::A: \ 2312 Opc = X86::B; \ 2313 Size = S; \ 2314 break; \ 2315 case X86::A##_ND: \ 2316 Opc = X86::B##_ND; \ 2317 Size = S; \ 2318 break; \ 2319 case X86::B: \ 2320 Opc = X86::A; \ 2321 Size = S; \ 2322 break; \ 2323 case X86::B##_ND: \ 2324 Opc = X86::A##_ND; \ 2325 Size = S; \ 2326 break; 2327 2328 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16) 2329 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32) 2330 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64) 2331 #undef FROM_TO_SIZE 2332 } 2333 WorkingMI = CloneIfNew(MI); 2334 WorkingMI->setDesc(get(Opc)); 2335 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm()); 2336 break; 2337 } 2338 case X86::PFSUBrr: 2339 case X86::PFSUBRrr: 2340 // PFSUB x, y: x = x - y 2341 // PFSUBR x, y: x = y - x 2342 WorkingMI = CloneIfNew(MI); 2343 WorkingMI->setDesc( 2344 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr)); 2345 break; 2346 case X86::BLENDPDrri: 2347 case X86::BLENDPSrri: 2348 case X86::VBLENDPDrri: 2349 case X86::VBLENDPSrri: 2350 // If we're optimizing for size, try to use MOVSD/MOVSS. 2351 if (MI.getParent()->getParent()->getFunction().hasOptSize()) { 2352 unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03: 0x0F; 2353 if ((MI.getOperand(3).getImm() ^ Mask) == 1) { 2354 #define FROM_TO(FROM, TO) \ 2355 case X86::FROM: \ 2356 Opc = X86::TO; \ 2357 break; 2358 switch (Opc) { 2359 default: 2360 llvm_unreachable("Unreachable!"); 2361 FROM_TO(BLENDPDrri, MOVSDrr) 2362 FROM_TO(BLENDPSrri, MOVSSrr) 2363 FROM_TO(VBLENDPDrri, VMOVSDrr) 2364 FROM_TO(VBLENDPSrri, VMOVSSrr) 2365 } 2366 WorkingMI = CloneIfNew(MI); 2367 WorkingMI->setDesc(get(Opc)); 2368 WorkingMI->removeOperand(3); 2369 break; 2370 } 2371 #undef FROM_TO 2372 } 2373 [[fallthrough]]; 2374 case X86::PBLENDWrri: 2375 case X86::VBLENDPDYrri: 2376 case X86::VBLENDPSYrri: 2377 case X86::VPBLENDDrri: 2378 case X86::VPBLENDWrri: 2379 case X86::VPBLENDDYrri: 2380 case X86::VPBLENDWYrri: { 2381 int8_t Mask; 2382 switch (Opc) { 2383 default: 2384 llvm_unreachable("Unreachable!"); 2385 case X86::BLENDPDrri: 2386 Mask = (int8_t)0x03; 2387 break; 2388 case X86::BLENDPSrri: 2389 Mask = (int8_t)0x0F; 2390 break; 2391 case X86::PBLENDWrri: 2392 Mask = (int8_t)0xFF; 2393 break; 2394 case X86::VBLENDPDrri: 2395 Mask = (int8_t)0x03; 2396 break; 2397 case X86::VBLENDPSrri: 2398 Mask = (int8_t)0x0F; 2399 break; 2400 case X86::VBLENDPDYrri: 2401 Mask = (int8_t)0x0F; 2402 break; 2403 case X86::VBLENDPSYrri: 2404 Mask = (int8_t)0xFF; 2405 break; 2406 case X86::VPBLENDDrri: 2407 Mask = (int8_t)0x0F; 2408 break; 2409 case X86::VPBLENDWrri: 2410 Mask = (int8_t)0xFF; 2411 break; 2412 case X86::VPBLENDDYrri: 2413 Mask = (int8_t)0xFF; 2414 break; 2415 case X86::VPBLENDWYrri: 2416 Mask = (int8_t)0xFF; 2417 break; 2418 } 2419 // Only the least significant bits of Imm are used. 2420 // Using int8_t to ensure it will be sign extended to the int64_t that 2421 // setImm takes in order to match isel behavior. 
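    // E.g. for BLENDPSrri an immediate of 0b0101 takes lanes 0 and 2 from the
    // second source; after the sources are swapped the equivalent selection is
    // Mask ^ Imm = 0b1010.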
2422 int8_t Imm = MI.getOperand(3).getImm() & Mask; 2423 WorkingMI = CloneIfNew(MI); 2424 WorkingMI->getOperand(3).setImm(Mask ^ Imm); 2425 break; 2426 } 2427 case X86::INSERTPSrr: 2428 case X86::VINSERTPSrr: 2429 case X86::VINSERTPSZrr: { 2430 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); 2431 unsigned ZMask = Imm & 15; 2432 unsigned DstIdx = (Imm >> 4) & 3; 2433 unsigned SrcIdx = (Imm >> 6) & 3; 2434 2435 // We can commute insertps if we zero 2 of the elements, the insertion is 2436 // "inline" and we don't override the insertion with a zero. 2437 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 && 2438 llvm::popcount(ZMask) == 2) { 2439 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15); 2440 assert(AltIdx < 4 && "Illegal insertion index"); 2441 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; 2442 WorkingMI = CloneIfNew(MI); 2443 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm); 2444 break; 2445 } 2446 return nullptr; 2447 } 2448 case X86::MOVSDrr: 2449 case X86::MOVSSrr: 2450 case X86::VMOVSDrr: 2451 case X86::VMOVSSrr: { 2452 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. 2453 if (Subtarget.hasSSE41()) { 2454 unsigned Mask; 2455 switch (Opc) { 2456 default: 2457 llvm_unreachable("Unreachable!"); 2458 case X86::MOVSDrr: 2459 Opc = X86::BLENDPDrri; 2460 Mask = 0x02; 2461 break; 2462 case X86::MOVSSrr: 2463 Opc = X86::BLENDPSrri; 2464 Mask = 0x0E; 2465 break; 2466 case X86::VMOVSDrr: 2467 Opc = X86::VBLENDPDrri; 2468 Mask = 0x02; 2469 break; 2470 case X86::VMOVSSrr: 2471 Opc = X86::VBLENDPSrri; 2472 Mask = 0x0E; 2473 break; 2474 } 2475 2476 WorkingMI = CloneIfNew(MI); 2477 WorkingMI->setDesc(get(Opc)); 2478 WorkingMI->addOperand(MachineOperand::CreateImm(Mask)); 2479 break; 2480 } 2481 2482 WorkingMI = CloneIfNew(MI); 2483 WorkingMI->setDesc(get(X86::SHUFPDrri)); 2484 WorkingMI->addOperand(MachineOperand::CreateImm(0x02)); 2485 break; 2486 } 2487 case X86::SHUFPDrri: { 2488 // Commute to MOVSD. 2489 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); 2490 WorkingMI = CloneIfNew(MI); 2491 WorkingMI->setDesc(get(X86::MOVSDrr)); 2492 WorkingMI->removeOperand(3); 2493 break; 2494 } 2495 case X86::PCLMULQDQrri: 2496 case X86::VPCLMULQDQrri: 2497 case X86::VPCLMULQDQYrri: 2498 case X86::VPCLMULQDQZrri: 2499 case X86::VPCLMULQDQZ128rri: 2500 case X86::VPCLMULQDQZ256rri: { 2501 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] 2502 // SRC2 64bits = Imm[4] ? 
SRC2[127:64] : SRC2[63:0] 2503 unsigned Imm = MI.getOperand(3).getImm(); 2504 unsigned Src1Hi = Imm & 0x01; 2505 unsigned Src2Hi = Imm & 0x10; 2506 WorkingMI = CloneIfNew(MI); 2507 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); 2508 break; 2509 } 2510 case X86::VPCMPBZ128rri: 2511 case X86::VPCMPUBZ128rri: 2512 case X86::VPCMPBZ256rri: 2513 case X86::VPCMPUBZ256rri: 2514 case X86::VPCMPBZrri: 2515 case X86::VPCMPUBZrri: 2516 case X86::VPCMPDZ128rri: 2517 case X86::VPCMPUDZ128rri: 2518 case X86::VPCMPDZ256rri: 2519 case X86::VPCMPUDZ256rri: 2520 case X86::VPCMPDZrri: 2521 case X86::VPCMPUDZrri: 2522 case X86::VPCMPQZ128rri: 2523 case X86::VPCMPUQZ128rri: 2524 case X86::VPCMPQZ256rri: 2525 case X86::VPCMPUQZ256rri: 2526 case X86::VPCMPQZrri: 2527 case X86::VPCMPUQZrri: 2528 case X86::VPCMPWZ128rri: 2529 case X86::VPCMPUWZ128rri: 2530 case X86::VPCMPWZ256rri: 2531 case X86::VPCMPUWZ256rri: 2532 case X86::VPCMPWZrri: 2533 case X86::VPCMPUWZrri: 2534 case X86::VPCMPBZ128rrik: 2535 case X86::VPCMPUBZ128rrik: 2536 case X86::VPCMPBZ256rrik: 2537 case X86::VPCMPUBZ256rrik: 2538 case X86::VPCMPBZrrik: 2539 case X86::VPCMPUBZrrik: 2540 case X86::VPCMPDZ128rrik: 2541 case X86::VPCMPUDZ128rrik: 2542 case X86::VPCMPDZ256rrik: 2543 case X86::VPCMPUDZ256rrik: 2544 case X86::VPCMPDZrrik: 2545 case X86::VPCMPUDZrrik: 2546 case X86::VPCMPQZ128rrik: 2547 case X86::VPCMPUQZ128rrik: 2548 case X86::VPCMPQZ256rrik: 2549 case X86::VPCMPUQZ256rrik: 2550 case X86::VPCMPQZrrik: 2551 case X86::VPCMPUQZrrik: 2552 case X86::VPCMPWZ128rrik: 2553 case X86::VPCMPUWZ128rrik: 2554 case X86::VPCMPWZ256rrik: 2555 case X86::VPCMPUWZ256rrik: 2556 case X86::VPCMPWZrrik: 2557 case X86::VPCMPUWZrrik: 2558 WorkingMI = CloneIfNew(MI); 2559 // Flip comparison mode immediate (if necessary). 2560 WorkingMI->getOperand(MI.getNumOperands() - 1) 2561 .setImm(X86::getSwappedVPCMPImm( 2562 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7)); 2563 break; 2564 case X86::VPCOMBri: 2565 case X86::VPCOMUBri: 2566 case X86::VPCOMDri: 2567 case X86::VPCOMUDri: 2568 case X86::VPCOMQri: 2569 case X86::VPCOMUQri: 2570 case X86::VPCOMWri: 2571 case X86::VPCOMUWri: 2572 WorkingMI = CloneIfNew(MI); 2573 // Flip comparison mode immediate (if necessary). 2574 WorkingMI->getOperand(3).setImm( 2575 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7)); 2576 break; 2577 case X86::VCMPSDZrri: 2578 case X86::VCMPSSZrri: 2579 case X86::VCMPPDZrri: 2580 case X86::VCMPPSZrri: 2581 case X86::VCMPSHZrri: 2582 case X86::VCMPPHZrri: 2583 case X86::VCMPPHZ128rri: 2584 case X86::VCMPPHZ256rri: 2585 case X86::VCMPPDZ128rri: 2586 case X86::VCMPPSZ128rri: 2587 case X86::VCMPPDZ256rri: 2588 case X86::VCMPPSZ256rri: 2589 case X86::VCMPPDZrrik: 2590 case X86::VCMPPSZrrik: 2591 case X86::VCMPPDZ128rrik: 2592 case X86::VCMPPSZ128rrik: 2593 case X86::VCMPPDZ256rrik: 2594 case X86::VCMPPSZ256rrik: 2595 WorkingMI = CloneIfNew(MI); 2596 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1) 2597 .setImm(X86::getSwappedVCMPImm( 2598 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f)); 2599 break; 2600 case X86::VPERM2F128rr: 2601 case X86::VPERM2I128rr: 2602 // Flip permute source immediate. 2603 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. 2604 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. 
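    // E.g. imm 0x20 (lo = Op0.lo, hi = Op1.lo) becomes 0x02 after the XOR
    // below and selects the same two halves once the operands are swapped.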
2605 WorkingMI = CloneIfNew(MI); 2606 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22); 2607 break; 2608 case X86::MOVHLPSrr: 2609 case X86::UNPCKHPDrr: 2610 case X86::VMOVHLPSrr: 2611 case X86::VUNPCKHPDrr: 2612 case X86::VMOVHLPSZrr: 2613 case X86::VUNPCKHPDZ128rr: 2614 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!"); 2615 2616 switch (Opc) { 2617 default: 2618 llvm_unreachable("Unreachable!"); 2619 case X86::MOVHLPSrr: 2620 Opc = X86::UNPCKHPDrr; 2621 break; 2622 case X86::UNPCKHPDrr: 2623 Opc = X86::MOVHLPSrr; 2624 break; 2625 case X86::VMOVHLPSrr: 2626 Opc = X86::VUNPCKHPDrr; 2627 break; 2628 case X86::VUNPCKHPDrr: 2629 Opc = X86::VMOVHLPSrr; 2630 break; 2631 case X86::VMOVHLPSZrr: 2632 Opc = X86::VUNPCKHPDZ128rr; 2633 break; 2634 case X86::VUNPCKHPDZ128rr: 2635 Opc = X86::VMOVHLPSZrr; 2636 break; 2637 } 2638 WorkingMI = CloneIfNew(MI); 2639 WorkingMI->setDesc(get(Opc)); 2640 break; 2641 CASE_ND(CMOV16rr) 2642 CASE_ND(CMOV32rr) 2643 CASE_ND(CMOV64rr) { 2644 WorkingMI = CloneIfNew(MI); 2645 unsigned OpNo = MI.getDesc().getNumOperands() - 1; 2646 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm()); 2647 WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); 2648 break; 2649 } 2650 case X86::VPTERNLOGDZrri: 2651 case X86::VPTERNLOGDZrmi: 2652 case X86::VPTERNLOGDZ128rri: 2653 case X86::VPTERNLOGDZ128rmi: 2654 case X86::VPTERNLOGDZ256rri: 2655 case X86::VPTERNLOGDZ256rmi: 2656 case X86::VPTERNLOGQZrri: 2657 case X86::VPTERNLOGQZrmi: 2658 case X86::VPTERNLOGQZ128rri: 2659 case X86::VPTERNLOGQZ128rmi: 2660 case X86::VPTERNLOGQZ256rri: 2661 case X86::VPTERNLOGQZ256rmi: 2662 case X86::VPTERNLOGDZrrik: 2663 case X86::VPTERNLOGDZ128rrik: 2664 case X86::VPTERNLOGDZ256rrik: 2665 case X86::VPTERNLOGQZrrik: 2666 case X86::VPTERNLOGQZ128rrik: 2667 case X86::VPTERNLOGQZ256rrik: 2668 case X86::VPTERNLOGDZrrikz: 2669 case X86::VPTERNLOGDZrmikz: 2670 case X86::VPTERNLOGDZ128rrikz: 2671 case X86::VPTERNLOGDZ128rmikz: 2672 case X86::VPTERNLOGDZ256rrikz: 2673 case X86::VPTERNLOGDZ256rmikz: 2674 case X86::VPTERNLOGQZrrikz: 2675 case X86::VPTERNLOGQZrmikz: 2676 case X86::VPTERNLOGQZ128rrikz: 2677 case X86::VPTERNLOGQZ128rmikz: 2678 case X86::VPTERNLOGQZ256rrikz: 2679 case X86::VPTERNLOGQZ256rmikz: 2680 case X86::VPTERNLOGDZ128rmbi: 2681 case X86::VPTERNLOGDZ256rmbi: 2682 case X86::VPTERNLOGDZrmbi: 2683 case X86::VPTERNLOGQZ128rmbi: 2684 case X86::VPTERNLOGQZ256rmbi: 2685 case X86::VPTERNLOGQZrmbi: 2686 case X86::VPTERNLOGDZ128rmbikz: 2687 case X86::VPTERNLOGDZ256rmbikz: 2688 case X86::VPTERNLOGDZrmbikz: 2689 case X86::VPTERNLOGQZ128rmbikz: 2690 case X86::VPTERNLOGQZ256rmbikz: 2691 case X86::VPTERNLOGQZrmbikz: { 2692 WorkingMI = CloneIfNew(MI); 2693 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2); 2694 break; 2695 } 2696 default: 2697 if (isCommutableVPERMV3Instruction(Opc)) { 2698 WorkingMI = CloneIfNew(MI); 2699 WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc))); 2700 break; 2701 } 2702 2703 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) { 2704 WorkingMI = CloneIfNew(MI); 2705 WorkingMI->setDesc( 2706 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group))); 2707 break; 2708 } 2709 } 2710 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); 2711 } 2712 2713 bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, 2714 unsigned &SrcOpIdx1, 2715 unsigned &SrcOpIdx2, 2716 bool IsIntrinsic) const { 2717 uint64_t TSFlags = MI.getDesc().TSFlags; 2718 2719 
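  // The defaults below describe an unmasked three-source op (e.g. an FMA3
  // register form): dst at index 0 and commutable vector sources at 1..3.
  // K-masked variants insert the mask at index 2 and are adjusted below.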
unsigned FirstCommutableVecOp = 1; 2720 unsigned LastCommutableVecOp = 3; 2721 unsigned KMaskOp = -1U; 2722 if (X86II::isKMasked(TSFlags)) { 2723 // For k-zero-masked operations it is Ok to commute the first vector 2724 // operand. Unless this is an intrinsic instruction. 2725 // For regular k-masked operations a conservative choice is done as the 2726 // elements of the first vector operand, for which the corresponding bit 2727 // in the k-mask operand is set to 0, are copied to the result of the 2728 // instruction. 2729 // TODO/FIXME: The commute still may be legal if it is known that the 2730 // k-mask operand is set to either all ones or all zeroes. 2731 // It is also Ok to commute the 1st operand if all users of MI use only 2732 // the elements enabled by the k-mask operand. For example, 2733 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] 2734 // : v1[i]; 2735 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 -> 2736 // // Ok, to commute v1 in FMADD213PSZrk. 2737 2738 // The k-mask operand has index = 2 for masked and zero-masked operations. 2739 KMaskOp = 2; 2740 2741 // The operand with index = 1 is used as a source for those elements for 2742 // which the corresponding bit in the k-mask is set to 0. 2743 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic) 2744 FirstCommutableVecOp = 3; 2745 2746 LastCommutableVecOp++; 2747 } else if (IsIntrinsic) { 2748 // Commuting the first operand of an intrinsic instruction isn't possible 2749 // unless we can prove that only the lowest element of the result is used. 2750 FirstCommutableVecOp = 2; 2751 } 2752 2753 if (isMem(MI, LastCommutableVecOp)) 2754 LastCommutableVecOp--; 2755 2756 // Only the first RegOpsNum operands are commutable. 2757 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means 2758 // that the operand is not specified/fixed. 2759 if (SrcOpIdx1 != CommuteAnyOperandIndex && 2760 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || 2761 SrcOpIdx1 == KMaskOp)) 2762 return false; 2763 if (SrcOpIdx2 != CommuteAnyOperandIndex && 2764 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || 2765 SrcOpIdx2 == KMaskOp)) 2766 return false; 2767 2768 // Look for two different register operands assumed to be commutable 2769 // regardless of the FMA opcode. The FMA opcode is adjusted later. 2770 if (SrcOpIdx1 == CommuteAnyOperandIndex || 2771 SrcOpIdx2 == CommuteAnyOperandIndex) { 2772 unsigned CommutableOpIdx2 = SrcOpIdx2; 2773 2774 // At least one of operands to be commuted is not specified and 2775 // this method is free to choose appropriate commutable operands. 2776 if (SrcOpIdx1 == SrcOpIdx2) 2777 // Both of operands are not fixed. By default set one of commutable 2778 // operands to the last register operand of the instruction. 2779 CommutableOpIdx2 = LastCommutableVecOp; 2780 else if (SrcOpIdx2 == CommuteAnyOperandIndex) 2781 // Only one of operands is not fixed. 2782 CommutableOpIdx2 = SrcOpIdx1; 2783 2784 // CommutableOpIdx2 is well defined now. Let's choose another commutable 2785 // operand and assign its index to CommutableOpIdx1. 2786 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); 2787 2788 unsigned CommutableOpIdx1; 2789 for (CommutableOpIdx1 = LastCommutableVecOp; 2790 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { 2791 // Just ignore and skip the k-mask operand. 2792 if (CommutableOpIdx1 == KMaskOp) 2793 continue; 2794 2795 // The commuted operands must have different registers. 
2796 // Otherwise, the commute transformation does not change anything and 2797 // is useless then. 2798 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg()) 2799 break; 2800 } 2801 2802 // No appropriate commutable operands were found. 2803 if (CommutableOpIdx1 < FirstCommutableVecOp) 2804 return false; 2805 2806 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 2807 // to return those values. 2808 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, 2809 CommutableOpIdx2)) 2810 return false; 2811 } 2812 2813 return true; 2814 } 2815 2816 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, 2817 unsigned &SrcOpIdx1, 2818 unsigned &SrcOpIdx2) const { 2819 const MCInstrDesc &Desc = MI.getDesc(); 2820 if (!Desc.isCommutable()) 2821 return false; 2822 2823 switch (MI.getOpcode()) { 2824 case X86::CMPSDrri: 2825 case X86::CMPSSrri: 2826 case X86::CMPPDrri: 2827 case X86::CMPPSrri: 2828 case X86::VCMPSDrri: 2829 case X86::VCMPSSrri: 2830 case X86::VCMPPDrri: 2831 case X86::VCMPPSrri: 2832 case X86::VCMPPDYrri: 2833 case X86::VCMPPSYrri: 2834 case X86::VCMPSDZrri: 2835 case X86::VCMPSSZrri: 2836 case X86::VCMPPDZrri: 2837 case X86::VCMPPSZrri: 2838 case X86::VCMPSHZrri: 2839 case X86::VCMPPHZrri: 2840 case X86::VCMPPHZ128rri: 2841 case X86::VCMPPHZ256rri: 2842 case X86::VCMPPDZ128rri: 2843 case X86::VCMPPSZ128rri: 2844 case X86::VCMPPDZ256rri: 2845 case X86::VCMPPSZ256rri: 2846 case X86::VCMPPDZrrik: 2847 case X86::VCMPPSZrrik: 2848 case X86::VCMPPDZ128rrik: 2849 case X86::VCMPPSZ128rrik: 2850 case X86::VCMPPDZ256rrik: 2851 case X86::VCMPPSZ256rrik: { 2852 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; 2853 2854 // Float comparison can be safely commuted for 2855 // Ordered/Unordered/Equal/NotEqual tests 2856 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; 2857 switch (Imm) { 2858 default: 2859 // EVEX versions can be commuted. 2860 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) 2861 break; 2862 return false; 2863 case 0x00: // EQUAL 2864 case 0x03: // UNORDERED 2865 case 0x04: // NOT EQUAL 2866 case 0x07: // ORDERED 2867 break; 2868 } 2869 2870 // The indices of the commutable operands are 1 and 2 (or 2 and 3 2871 // when masked). 2872 // Assign them to the returned operand indices here. 2873 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, 2874 2 + OpOffset); 2875 } 2876 case X86::MOVSSrr: 2877 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can 2878 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since 2879 // AVX implies sse4.1. 2880 if (Subtarget.hasSSE41()) 2881 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); 2882 return false; 2883 case X86::SHUFPDrri: 2884 // We can commute this to MOVSD. 
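    // With imm 0x02 the result is { dst[0], src[1] }; swapping the two sources
    // gives { src[0], dst[1] }, which is exactly what MOVSDrr produces.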
2885 if (MI.getOperand(3).getImm() == 0x02) 2886 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); 2887 return false; 2888 case X86::MOVHLPSrr: 2889 case X86::UNPCKHPDrr: 2890 case X86::VMOVHLPSrr: 2891 case X86::VUNPCKHPDrr: 2892 case X86::VMOVHLPSZrr: 2893 case X86::VUNPCKHPDZ128rr: 2894 if (Subtarget.hasSSE2()) 2895 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); 2896 return false; 2897 case X86::VPTERNLOGDZrri: 2898 case X86::VPTERNLOGDZrmi: 2899 case X86::VPTERNLOGDZ128rri: 2900 case X86::VPTERNLOGDZ128rmi: 2901 case X86::VPTERNLOGDZ256rri: 2902 case X86::VPTERNLOGDZ256rmi: 2903 case X86::VPTERNLOGQZrri: 2904 case X86::VPTERNLOGQZrmi: 2905 case X86::VPTERNLOGQZ128rri: 2906 case X86::VPTERNLOGQZ128rmi: 2907 case X86::VPTERNLOGQZ256rri: 2908 case X86::VPTERNLOGQZ256rmi: 2909 case X86::VPTERNLOGDZrrik: 2910 case X86::VPTERNLOGDZ128rrik: 2911 case X86::VPTERNLOGDZ256rrik: 2912 case X86::VPTERNLOGQZrrik: 2913 case X86::VPTERNLOGQZ128rrik: 2914 case X86::VPTERNLOGQZ256rrik: 2915 case X86::VPTERNLOGDZrrikz: 2916 case X86::VPTERNLOGDZrmikz: 2917 case X86::VPTERNLOGDZ128rrikz: 2918 case X86::VPTERNLOGDZ128rmikz: 2919 case X86::VPTERNLOGDZ256rrikz: 2920 case X86::VPTERNLOGDZ256rmikz: 2921 case X86::VPTERNLOGQZrrikz: 2922 case X86::VPTERNLOGQZrmikz: 2923 case X86::VPTERNLOGQZ128rrikz: 2924 case X86::VPTERNLOGQZ128rmikz: 2925 case X86::VPTERNLOGQZ256rrikz: 2926 case X86::VPTERNLOGQZ256rmikz: 2927 case X86::VPTERNLOGDZ128rmbi: 2928 case X86::VPTERNLOGDZ256rmbi: 2929 case X86::VPTERNLOGDZrmbi: 2930 case X86::VPTERNLOGQZ128rmbi: 2931 case X86::VPTERNLOGQZ256rmbi: 2932 case X86::VPTERNLOGQZrmbi: 2933 case X86::VPTERNLOGDZ128rmbikz: 2934 case X86::VPTERNLOGDZ256rmbikz: 2935 case X86::VPTERNLOGDZrmbikz: 2936 case X86::VPTERNLOGQZ128rmbikz: 2937 case X86::VPTERNLOGQZ256rmbikz: 2938 case X86::VPTERNLOGQZrmbikz: 2939 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); 2940 case X86::VPDPWSSDYrr: 2941 case X86::VPDPWSSDrr: 2942 case X86::VPDPWSSDSYrr: 2943 case X86::VPDPWSSDSrr: 2944 case X86::VPDPWUUDrr: 2945 case X86::VPDPWUUDYrr: 2946 case X86::VPDPWUUDSrr: 2947 case X86::VPDPWUUDSYrr: 2948 case X86::VPDPBSSDSrr: 2949 case X86::VPDPBSSDSYrr: 2950 case X86::VPDPBSSDrr: 2951 case X86::VPDPBSSDYrr: 2952 case X86::VPDPBUUDSrr: 2953 case X86::VPDPBUUDSYrr: 2954 case X86::VPDPBUUDrr: 2955 case X86::VPDPBUUDYrr: 2956 case X86::VPDPWSSDZ128r: 2957 case X86::VPDPWSSDZ128rk: 2958 case X86::VPDPWSSDZ128rkz: 2959 case X86::VPDPWSSDZ256r: 2960 case X86::VPDPWSSDZ256rk: 2961 case X86::VPDPWSSDZ256rkz: 2962 case X86::VPDPWSSDZr: 2963 case X86::VPDPWSSDZrk: 2964 case X86::VPDPWSSDZrkz: 2965 case X86::VPDPWSSDSZ128r: 2966 case X86::VPDPWSSDSZ128rk: 2967 case X86::VPDPWSSDSZ128rkz: 2968 case X86::VPDPWSSDSZ256r: 2969 case X86::VPDPWSSDSZ256rk: 2970 case X86::VPDPWSSDSZ256rkz: 2971 case X86::VPDPWSSDSZr: 2972 case X86::VPDPWSSDSZrk: 2973 case X86::VPDPWSSDSZrkz: 2974 case X86::VPMADD52HUQrr: 2975 case X86::VPMADD52HUQYrr: 2976 case X86::VPMADD52HUQZ128r: 2977 case X86::VPMADD52HUQZ128rk: 2978 case X86::VPMADD52HUQZ128rkz: 2979 case X86::VPMADD52HUQZ256r: 2980 case X86::VPMADD52HUQZ256rk: 2981 case X86::VPMADD52HUQZ256rkz: 2982 case X86::VPMADD52HUQZr: 2983 case X86::VPMADD52HUQZrk: 2984 case X86::VPMADD52HUQZrkz: 2985 case X86::VPMADD52LUQrr: 2986 case X86::VPMADD52LUQYrr: 2987 case X86::VPMADD52LUQZ128r: 2988 case X86::VPMADD52LUQZ128rk: 2989 case X86::VPMADD52LUQZ128rkz: 2990 case X86::VPMADD52LUQZ256r: 2991 case X86::VPMADD52LUQZ256rk: 2992 case 
X86::VPMADD52LUQZ256rkz: 2993 case X86::VPMADD52LUQZr: 2994 case X86::VPMADD52LUQZrk: 2995 case X86::VPMADD52LUQZrkz: 2996 case X86::VFMADDCPHZr: 2997 case X86::VFMADDCPHZrk: 2998 case X86::VFMADDCPHZrkz: 2999 case X86::VFMADDCPHZ128r: 3000 case X86::VFMADDCPHZ128rk: 3001 case X86::VFMADDCPHZ128rkz: 3002 case X86::VFMADDCPHZ256r: 3003 case X86::VFMADDCPHZ256rk: 3004 case X86::VFMADDCPHZ256rkz: 3005 case X86::VFMADDCSHZr: 3006 case X86::VFMADDCSHZrk: 3007 case X86::VFMADDCSHZrkz: { 3008 unsigned CommutableOpIdx1 = 2; 3009 unsigned CommutableOpIdx2 = 3; 3010 if (X86II::isKMasked(Desc.TSFlags)) { 3011 // Skip the mask register. 3012 ++CommutableOpIdx1; 3013 ++CommutableOpIdx2; 3014 } 3015 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, 3016 CommutableOpIdx2)) 3017 return false; 3018 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg()) 3019 // No idea. 3020 return false; 3021 return true; 3022 } 3023 3024 default: 3025 const X86InstrFMA3Group *FMA3Group = 3026 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags); 3027 if (FMA3Group) 3028 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, 3029 FMA3Group->isIntrinsic()); 3030 3031 // Handled masked instructions since we need to skip over the mask input 3032 // and the preserved input. 3033 if (X86II::isKMasked(Desc.TSFlags)) { 3034 // First assume that the first input is the mask operand and skip past it. 3035 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1; 3036 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2; 3037 // Check if the first input is tied. If there isn't one then we only 3038 // need to skip the mask operand which we did above. 3039 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(), 3040 MCOI::TIED_TO) != -1)) { 3041 // If this is zero masking instruction with a tied operand, we need to 3042 // move the first index back to the first input since this must 3043 // be a 3 input instruction and we want the first two non-mask inputs. 3044 // Otherwise this is a 2 input instruction with a preserved input and 3045 // mask, so we need to move the indices to skip one more input. 3046 if (X86II::isKMergeMasked(Desc.TSFlags)) { 3047 ++CommutableOpIdx1; 3048 ++CommutableOpIdx2; 3049 } else { 3050 --CommutableOpIdx1; 3051 } 3052 } 3053 3054 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, 3055 CommutableOpIdx2)) 3056 return false; 3057 3058 if (!MI.getOperand(SrcOpIdx1).isReg() || 3059 !MI.getOperand(SrcOpIdx2).isReg()) 3060 // No idea. 3061 return false; 3062 return true; 3063 } 3064 3065 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); 3066 } 3067 return false; 3068 } 3069 3070 static bool isConvertibleLEA(MachineInstr *MI) { 3071 unsigned Opcode = MI->getOpcode(); 3072 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r && 3073 Opcode != X86::LEA64_32r) 3074 return false; 3075 3076 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt); 3077 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp); 3078 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg); 3079 3080 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 || 3081 Scale.getImm() > 1) 3082 return false; 3083 3084 return true; 3085 } 3086 3087 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const { 3088 // Currently we're interested in following sequence only. 
3089 // r3 = lea r1, r2 3090 // r5 = add r3, r4 3091 // Both r3 and r4 are killed in add, we hope the add instruction has the 3092 // operand order 3093 // r5 = add r4, r3 3094 // So later in X86FixupLEAs the lea instruction can be rewritten as add. 3095 unsigned Opcode = MI.getOpcode(); 3096 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr) 3097 return false; 3098 3099 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3100 Register Reg1 = MI.getOperand(1).getReg(); 3101 Register Reg2 = MI.getOperand(2).getReg(); 3102 3103 // Check if Reg1 comes from LEA in the same MBB. 3104 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) { 3105 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) { 3106 Commute = true; 3107 return true; 3108 } 3109 } 3110 3111 // Check if Reg2 comes from LEA in the same MBB. 3112 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) { 3113 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) { 3114 Commute = false; 3115 return true; 3116 } 3117 } 3118 3119 return false; 3120 } 3121 3122 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) { 3123 unsigned Opcode = MCID.getOpcode(); 3124 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) || 3125 X86::isCFCMOVCC(Opcode) || X86::isCCMPCC(Opcode) || 3126 X86::isCTESTCC(Opcode))) 3127 return -1; 3128 // Assume that condition code is always the last use operand. 3129 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs(); 3130 return NumUses - 1; 3131 } 3132 3133 X86::CondCode X86::getCondFromMI(const MachineInstr &MI) { 3134 const MCInstrDesc &MCID = MI.getDesc(); 3135 int CondNo = getCondSrcNoFromDesc(MCID); 3136 if (CondNo < 0) 3137 return X86::COND_INVALID; 3138 CondNo += MCID.getNumDefs(); 3139 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm()); 3140 } 3141 3142 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { 3143 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI) 3144 : X86::COND_INVALID; 3145 } 3146 3147 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { 3148 return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI) 3149 : X86::COND_INVALID; 3150 } 3151 3152 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { 3153 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI) 3154 : X86::COND_INVALID; 3155 } 3156 3157 X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) { 3158 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI) 3159 : X86::COND_INVALID; 3160 } 3161 3162 X86::CondCode X86::getCondFromCCMP(const MachineInstr &MI) { 3163 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode()) 3164 ? X86::getCondFromMI(MI) 3165 : X86::COND_INVALID; 3166 } 3167 3168 int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) { 3169 // CCMP/CTEST has two conditional operands: 3170 // - SCC: source conditonal code (same as CMOV) 3171 // - DCF: destination conditional flags, which has 4 valid bits 3172 // 3173 // +----+----+----+----+ 3174 // | OF | SF | ZF | CF | 3175 // +----+----+----+----+ 3176 // 3177 // If SCC(source conditional code) evaluates to false, CCMP/CTEST will updates 3178 // the conditional flags by as follows: 3179 // 3180 // OF = DCF.OF 3181 // SF = DCF.SF 3182 // ZF = DCF.ZF 3183 // CF = DCF.CF 3184 // PF = DCF.CF 3185 // AF = 0 (Auxiliary Carry Flag) 3186 // 3187 // Otherwise, the CMP or TEST is executed and it updates the 3188 // CSPAZO flags normally. 
3189 // 3190 // NOTE: 3191 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value. 3192 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value. 3193 3194 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF }; 3195 3196 switch (CC) { 3197 default: 3198 llvm_unreachable("Illegal condition code!"); 3199 case X86::COND_NO: 3200 case X86::COND_NE: 3201 case X86::COND_GE: 3202 case X86::COND_G: 3203 case X86::COND_AE: 3204 case X86::COND_A: 3205 case X86::COND_NS: 3206 case X86::COND_NP: 3207 return 0; 3208 case X86::COND_O: 3209 return OF; 3210 case X86::COND_B: 3211 case X86::COND_BE: 3212 return CF; 3213 break; 3214 case X86::COND_E: 3215 case X86::COND_LE: 3216 return ZF; 3217 case X86::COND_S: 3218 case X86::COND_L: 3219 return SF; 3220 case X86::COND_P: 3221 return PF; 3222 } 3223 } 3224 3225 #define GET_X86_NF_TRANSFORM_TABLE 3226 #define GET_X86_ND2NONND_TABLE 3227 #include "X86GenInstrMapping.inc" 3228 3229 static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table, 3230 unsigned Opc) { 3231 const auto I = llvm::lower_bound(Table, Opc); 3232 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc; 3233 } 3234 unsigned X86::getNFVariant(unsigned Opc) { 3235 return getNewOpcFromTable(X86NFTransformTable, Opc); 3236 } 3237 3238 unsigned X86::getNonNDVariant(unsigned Opc) { 3239 return getNewOpcFromTable(X86ND2NonNDTable, Opc); 3240 } 3241 3242 /// Return the inverse of the specified condition, 3243 /// e.g. turning COND_E to COND_NE. 3244 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { 3245 switch (CC) { 3246 default: 3247 llvm_unreachable("Illegal condition code!"); 3248 case X86::COND_E: 3249 return X86::COND_NE; 3250 case X86::COND_NE: 3251 return X86::COND_E; 3252 case X86::COND_L: 3253 return X86::COND_GE; 3254 case X86::COND_LE: 3255 return X86::COND_G; 3256 case X86::COND_G: 3257 return X86::COND_LE; 3258 case X86::COND_GE: 3259 return X86::COND_L; 3260 case X86::COND_B: 3261 return X86::COND_AE; 3262 case X86::COND_BE: 3263 return X86::COND_A; 3264 case X86::COND_A: 3265 return X86::COND_BE; 3266 case X86::COND_AE: 3267 return X86::COND_B; 3268 case X86::COND_S: 3269 return X86::COND_NS; 3270 case X86::COND_NS: 3271 return X86::COND_S; 3272 case X86::COND_P: 3273 return X86::COND_NP; 3274 case X86::COND_NP: 3275 return X86::COND_P; 3276 case X86::COND_O: 3277 return X86::COND_NO; 3278 case X86::COND_NO: 3279 return X86::COND_O; 3280 case X86::COND_NE_OR_P: 3281 return X86::COND_E_AND_NP; 3282 case X86::COND_E_AND_NP: 3283 return X86::COND_NE_OR_P; 3284 } 3285 } 3286 3287 /// Assuming the flags are set by MI(a,b), return the condition code if we 3288 /// modify the instructions such that flags are set by MI(b,a). 
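/// For example, if a user tests COND_L after CMP a, b (signed a < b), the
/// equivalent test once the instruction becomes CMP b, a is COND_G.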
3289 static X86::CondCode getSwappedCondition(X86::CondCode CC) { 3290 switch (CC) { 3291 default: 3292 return X86::COND_INVALID; 3293 case X86::COND_E: 3294 return X86::COND_E; 3295 case X86::COND_NE: 3296 return X86::COND_NE; 3297 case X86::COND_L: 3298 return X86::COND_G; 3299 case X86::COND_LE: 3300 return X86::COND_GE; 3301 case X86::COND_G: 3302 return X86::COND_L; 3303 case X86::COND_GE: 3304 return X86::COND_LE; 3305 case X86::COND_B: 3306 return X86::COND_A; 3307 case X86::COND_BE: 3308 return X86::COND_AE; 3309 case X86::COND_A: 3310 return X86::COND_B; 3311 case X86::COND_AE: 3312 return X86::COND_BE; 3313 } 3314 } 3315 3316 std::pair<X86::CondCode, bool> 3317 X86::getX86ConditionCode(CmpInst::Predicate Predicate) { 3318 X86::CondCode CC = X86::COND_INVALID; 3319 bool NeedSwap = false; 3320 switch (Predicate) { 3321 default: 3322 break; 3323 // Floating-point Predicates 3324 case CmpInst::FCMP_UEQ: 3325 CC = X86::COND_E; 3326 break; 3327 case CmpInst::FCMP_OLT: 3328 NeedSwap = true; 3329 [[fallthrough]]; 3330 case CmpInst::FCMP_OGT: 3331 CC = X86::COND_A; 3332 break; 3333 case CmpInst::FCMP_OLE: 3334 NeedSwap = true; 3335 [[fallthrough]]; 3336 case CmpInst::FCMP_OGE: 3337 CC = X86::COND_AE; 3338 break; 3339 case CmpInst::FCMP_UGT: 3340 NeedSwap = true; 3341 [[fallthrough]]; 3342 case CmpInst::FCMP_ULT: 3343 CC = X86::COND_B; 3344 break; 3345 case CmpInst::FCMP_UGE: 3346 NeedSwap = true; 3347 [[fallthrough]]; 3348 case CmpInst::FCMP_ULE: 3349 CC = X86::COND_BE; 3350 break; 3351 case CmpInst::FCMP_ONE: 3352 CC = X86::COND_NE; 3353 break; 3354 case CmpInst::FCMP_UNO: 3355 CC = X86::COND_P; 3356 break; 3357 case CmpInst::FCMP_ORD: 3358 CC = X86::COND_NP; 3359 break; 3360 case CmpInst::FCMP_OEQ: 3361 [[fallthrough]]; 3362 case CmpInst::FCMP_UNE: 3363 CC = X86::COND_INVALID; 3364 break; 3365 3366 // Integer Predicates 3367 case CmpInst::ICMP_EQ: 3368 CC = X86::COND_E; 3369 break; 3370 case CmpInst::ICMP_NE: 3371 CC = X86::COND_NE; 3372 break; 3373 case CmpInst::ICMP_UGT: 3374 CC = X86::COND_A; 3375 break; 3376 case CmpInst::ICMP_UGE: 3377 CC = X86::COND_AE; 3378 break; 3379 case CmpInst::ICMP_ULT: 3380 CC = X86::COND_B; 3381 break; 3382 case CmpInst::ICMP_ULE: 3383 CC = X86::COND_BE; 3384 break; 3385 case CmpInst::ICMP_SGT: 3386 CC = X86::COND_G; 3387 break; 3388 case CmpInst::ICMP_SGE: 3389 CC = X86::COND_GE; 3390 break; 3391 case CmpInst::ICMP_SLT: 3392 CC = X86::COND_L; 3393 break; 3394 case CmpInst::ICMP_SLE: 3395 CC = X86::COND_LE; 3396 break; 3397 } 3398 3399 return std::make_pair(CC, NeedSwap); 3400 } 3401 3402 /// Return a cmov opcode for the given register size in bytes, and operand type. 3403 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand, 3404 bool HasNDD) { 3405 switch (RegBytes) { 3406 default: 3407 llvm_unreachable("Illegal register size!"); 3408 #define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC) 3409 case 2: 3410 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm) 3411 : GET_ND_IF_ENABLED(X86::CMOV16rr); 3412 case 4: 3413 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm) 3414 : GET_ND_IF_ENABLED(X86::CMOV32rr); 3415 case 8: 3416 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm) 3417 : GET_ND_IF_ENABLED(X86::CMOV64rr); 3418 } 3419 } 3420 3421 /// Get the VPCMP immediate for the given condition. 
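/// (AVX-512 VPCMP predicate encoding, for reference: 0=EQ, 1=LT, 2=LE,
/// 3=FALSE, 4=NE, 5=NLT, 6=NLE, 7=TRUE; the unsigned forms use the same
/// immediates.)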
3422 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) { 3423 switch (CC) { 3424 default: 3425 llvm_unreachable("Unexpected SETCC condition"); 3426 case ISD::SETNE: 3427 return 4; 3428 case ISD::SETEQ: 3429 return 0; 3430 case ISD::SETULT: 3431 case ISD::SETLT: 3432 return 1; 3433 case ISD::SETUGT: 3434 case ISD::SETGT: 3435 return 6; 3436 case ISD::SETUGE: 3437 case ISD::SETGE: 3438 return 5; 3439 case ISD::SETULE: 3440 case ISD::SETLE: 3441 return 2; 3442 } 3443 } 3444 3445 /// Get the VPCMP immediate if the operands are swapped. 3446 unsigned X86::getSwappedVPCMPImm(unsigned Imm) { 3447 switch (Imm) { 3448 default: 3449 llvm_unreachable("Unreachable!"); 3450 case 0x01: 3451 Imm = 0x06; 3452 break; // LT -> NLE 3453 case 0x02: 3454 Imm = 0x05; 3455 break; // LE -> NLT 3456 case 0x05: 3457 Imm = 0x02; 3458 break; // NLT -> LE 3459 case 0x06: 3460 Imm = 0x01; 3461 break; // NLE -> LT 3462 case 0x00: // EQ 3463 case 0x03: // FALSE 3464 case 0x04: // NE 3465 case 0x07: // TRUE 3466 break; 3467 } 3468 3469 return Imm; 3470 } 3471 3472 /// Get the VPCOM immediate if the operands are swapped. 3473 unsigned X86::getSwappedVPCOMImm(unsigned Imm) { 3474 switch (Imm) { 3475 default: 3476 llvm_unreachable("Unreachable!"); 3477 case 0x00: 3478 Imm = 0x02; 3479 break; // LT -> GT 3480 case 0x01: 3481 Imm = 0x03; 3482 break; // LE -> GE 3483 case 0x02: 3484 Imm = 0x00; 3485 break; // GT -> LT 3486 case 0x03: 3487 Imm = 0x01; 3488 break; // GE -> LE 3489 case 0x04: // EQ 3490 case 0x05: // NE 3491 case 0x06: // FALSE 3492 case 0x07: // TRUE 3493 break; 3494 } 3495 3496 return Imm; 3497 } 3498 3499 /// Get the VCMP immediate if the operands are swapped. 3500 unsigned X86::getSwappedVCMPImm(unsigned Imm) { 3501 // Only need the lower 2 bits to distinquish. 3502 switch (Imm & 0x3) { 3503 default: 3504 llvm_unreachable("Unreachable!"); 3505 case 0x00: 3506 case 0x03: 3507 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted. 3508 break; 3509 case 0x01: 3510 case 0x02: 3511 // Need to toggle bits 3:0. Bit 4 stays the same. 3512 Imm ^= 0xf; 3513 break; 3514 } 3515 3516 return Imm; 3517 } 3518 3519 unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) { 3520 if (Info.RegClass == X86::VR128RegClassID || 3521 Info.RegClass == X86::VR128XRegClassID) 3522 return 128; 3523 if (Info.RegClass == X86::VR256RegClassID || 3524 Info.RegClass == X86::VR256XRegClassID) 3525 return 256; 3526 if (Info.RegClass == X86::VR512RegClassID) 3527 return 512; 3528 llvm_unreachable("Unknown register class!"); 3529 } 3530 3531 /// Return true if the Reg is X87 register. 3532 static bool isX87Reg(unsigned Reg) { 3533 return (Reg == X86::FPCW || Reg == X86::FPSW || 3534 (Reg >= X86::ST0 && Reg <= X86::ST7)); 3535 } 3536 3537 /// check if the instruction is X87 instruction 3538 bool X86::isX87Instruction(MachineInstr &MI) { 3539 // Call defs X87 register, so we special case it here because 3540 // otherwise calls are incorrectly flagged as x87 instructions 3541 // as a result. 
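  // (A call's implicit operands can mention x87 registers such as FPSW even
  // though the call itself is not an x87 instruction, so the register scan
  // below would otherwise flag it.)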
3542 if (MI.isCall()) 3543 return false; 3544 for (const MachineOperand &MO : MI.operands()) { 3545 if (!MO.isReg()) 3546 continue; 3547 if (isX87Reg(MO.getReg())) 3548 return true; 3549 } 3550 return false; 3551 } 3552 3553 int X86::getFirstAddrOperandIdx(const MachineInstr &MI) { 3554 auto IsMemOp = [](const MCOperandInfo &OpInfo) { 3555 return OpInfo.OperandType == MCOI::OPERAND_MEMORY; 3556 }; 3557 3558 const MCInstrDesc &Desc = MI.getDesc(); 3559 3560 // Directly invoke the MC-layer routine for real (i.e., non-pseudo) 3561 // instructions (fast case). 3562 if (!X86II::isPseudo(Desc.TSFlags)) { 3563 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags); 3564 if (MemRefIdx >= 0) 3565 return MemRefIdx + X86II::getOperandBias(Desc); 3566 #ifdef EXPENSIVE_CHECKS 3567 assert(none_of(Desc.operands(), IsMemOp) && 3568 "Got false negative from X86II::getMemoryOperandNo()!"); 3569 #endif 3570 return -1; 3571 } 3572 3573 // Otherwise, handle pseudo instructions by examining the type of their 3574 // operands (slow case). An instruction cannot have a memory reference if it 3575 // has fewer than AddrNumOperands (= 5) explicit operands. 3576 unsigned NumOps = Desc.getNumOperands(); 3577 if (NumOps < X86::AddrNumOperands) { 3578 #ifdef EXPENSIVE_CHECKS 3579 assert(none_of(Desc.operands(), IsMemOp) && 3580 "Expected no operands to have OPERAND_MEMORY type!"); 3581 #endif 3582 return -1; 3583 } 3584 3585 // The first operand with type OPERAND_MEMORY indicates the start of a memory 3586 // reference. We expect the following AddrNumOperand-1 operands to also have 3587 // OPERAND_MEMORY type. 3588 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) { 3589 if (IsMemOp(Desc.operands()[I])) { 3590 #ifdef EXPENSIVE_CHECKS 3591 assert(std::all_of(Desc.operands().begin() + I, 3592 Desc.operands().begin() + I + X86::AddrNumOperands, 3593 IsMemOp) && 3594 "Expected all five operands in the memory reference to have " 3595 "OPERAND_MEMORY type!"); 3596 #endif 3597 return I; 3598 } 3599 } 3600 3601 return -1; 3602 } 3603 3604 const Constant *X86::getConstantFromPool(const MachineInstr &MI, 3605 unsigned OpNo) { 3606 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) && 3607 "Unexpected number of operands!"); 3608 3609 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg); 3610 if (!Index.isReg() || Index.getReg() != X86::NoRegister) 3611 return nullptr; 3612 3613 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp); 3614 if (!Disp.isCPI() || Disp.getOffset() != 0) 3615 return nullptr; 3616 3617 ArrayRef<MachineConstantPoolEntry> Constants = 3618 MI.getParent()->getParent()->getConstantPool()->getConstants(); 3619 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()]; 3620 3621 // Bail if this is a machine constant pool entry, we won't be able to dig out 3622 // anything useful. 
3623 if (ConstantEntry.isMachineConstantPoolEntry())
3624 return nullptr;
3625
3626 return ConstantEntry.Val.ConstVal;
3627 }
3628
3629 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
3630 switch (MI.getOpcode()) {
3631 case X86::TCRETURNdi:
3632 case X86::TCRETURNri:
3633 case X86::TCRETURNmi:
3634 case X86::TCRETURNdi64:
3635 case X86::TCRETURNri64:
3636 case X86::TCRETURNmi64:
3637 return true;
3638 default:
3639 return false;
3640 }
3641 }
3642
3643 bool X86InstrInfo::canMakeTailCallConditional(
3644 SmallVectorImpl<MachineOperand> &BranchCond,
3645 const MachineInstr &TailCall) const {
3646
3647 const MachineFunction *MF = TailCall.getMF();
3648
3649 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3650 // The kernel patches thunk calls at runtime; these should never be conditional.
3651 const MachineOperand &Target = TailCall.getOperand(0);
3652 if (Target.isSymbol()) {
3653 StringRef Symbol(Target.getSymbolName());
3654 // This is currently only relevant to the r11 kernel indirect thunk.
3655 if (Symbol == "__x86_indirect_thunk_r11")
3656 return false;
3657 }
3658 }
3659
3660 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3661 TailCall.getOpcode() != X86::TCRETURNdi64) {
3662 // Only direct calls can be done with a conditional branch.
3663 return false;
3664 }
3665
3666 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3667 // Conditional tail calls confuse the Win64 unwinder.
3668 return false;
3669 }
3670
3671 assert(BranchCond.size() == 1);
3672 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3673 // Can't make a conditional tail call with this condition.
3674 return false;
3675 }
3676
3677 const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
3678 if (X86FI->getTCReturnAddrDelta() != 0 ||
3679 TailCall.getOperand(1).getImm() != 0) {
3680 // A conditional tail call cannot do any stack adjustment.
3681 return false;
3682 }
3683
3684 return true;
3685 }
3686
3687 void X86InstrInfo::replaceBranchWithTailCall(
3688 MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3689 const MachineInstr &TailCall) const {
3690 assert(canMakeTailCallConditional(BranchCond, TailCall));
3691
3692 MachineBasicBlock::iterator I = MBB.end();
3693 while (I != MBB.begin()) {
3694 --I;
3695 if (I->isDebugInstr())
3696 continue;
3697 if (!I->isBranch())
3698 assert(0 && "Can't find the branch to replace!");
3699
3700 X86::CondCode CC = X86::getCondFromBranch(*I);
3701 assert(BranchCond.size() == 1);
3702 if (CC != BranchCond[0].getImm())
3703 continue;
3704
3705 break;
3706 }
3707
3708 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3709 : X86::TCRETURNdi64cc;
3710
3711 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3712 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3713 MIB.addImm(0); // Stack offset (not used).
3714 MIB->addOperand(BranchCond[0]); // Condition.
3715 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3716
3717 // Add implicit uses and defs of all live regs potentially clobbered by the
3718 // call. This way they still appear live across the call.
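  // LiveRegs starts from MBB's live-out set; stepForward() over the new
  // TCRETURN then reports each register it clobbers (paired with the
  // clobbering operand) in Clobbers, and those are echoed back as implicit
  // use+def operands below.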
3719 LivePhysRegs LiveRegs(getRegisterInfo()); 3720 LiveRegs.addLiveOuts(MBB); 3721 SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers; 3722 LiveRegs.stepForward(*MIB, Clobbers); 3723 for (const auto &C : Clobbers) { 3724 MIB.addReg(C.first, RegState::Implicit); 3725 MIB.addReg(C.first, RegState::Implicit | RegState::Define); 3726 } 3727 3728 I->eraseFromParent(); 3729 } 3730 3731 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may 3732 // not be a fallthrough MBB now due to layout changes). Return nullptr if the 3733 // fallthrough MBB cannot be identified. 3734 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB, 3735 MachineBasicBlock *TBB) { 3736 // Look for non-EHPad successors other than TBB. If we find exactly one, it 3737 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB 3738 // and fallthrough MBB. If we find more than one, we cannot identify the 3739 // fallthrough MBB and should return nullptr. 3740 MachineBasicBlock *FallthroughBB = nullptr; 3741 for (MachineBasicBlock *Succ : MBB->successors()) { 3742 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB)) 3743 continue; 3744 // Return a nullptr if we found more than one fallthrough successor. 3745 if (FallthroughBB && FallthroughBB != TBB) 3746 return nullptr; 3747 FallthroughBB = Succ; 3748 } 3749 return FallthroughBB; 3750 } 3751 3752 bool X86InstrInfo::analyzeBranchImpl( 3753 MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, 3754 SmallVectorImpl<MachineOperand> &Cond, 3755 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const { 3756 3757 // Start from the bottom of the block and work up, examining the 3758 // terminator instructions. 3759 MachineBasicBlock::iterator I = MBB.end(); 3760 MachineBasicBlock::iterator UnCondBrIter = MBB.end(); 3761 while (I != MBB.begin()) { 3762 --I; 3763 if (I->isDebugInstr()) 3764 continue; 3765 3766 // Working from the bottom, when we see a non-terminator instruction, we're 3767 // done. 3768 if (!isUnpredicatedTerminator(*I)) 3769 break; 3770 3771 // A terminator that isn't a branch can't easily be handled by this 3772 // analysis. 3773 if (!I->isBranch()) 3774 return true; 3775 3776 // Handle unconditional branches. 3777 if (I->getOpcode() == X86::JMP_1) { 3778 UnCondBrIter = I; 3779 3780 if (!AllowModify) { 3781 TBB = I->getOperand(0).getMBB(); 3782 continue; 3783 } 3784 3785 // If the block has any instructions after a JMP, delete them. 3786 MBB.erase(std::next(I), MBB.end()); 3787 3788 Cond.clear(); 3789 FBB = nullptr; 3790 3791 // Delete the JMP if it's equivalent to a fall-through. 3792 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { 3793 TBB = nullptr; 3794 I->eraseFromParent(); 3795 I = MBB.end(); 3796 UnCondBrIter = MBB.end(); 3797 continue; 3798 } 3799 3800 // TBB is used to indicate the unconditional destination. 3801 TBB = I->getOperand(0).getMBB(); 3802 continue; 3803 } 3804 3805 // Handle conditional branches. 3806 X86::CondCode BranchCode = X86::getCondFromBranch(*I); 3807 if (BranchCode == X86::COND_INVALID) 3808 return true; // Can't handle indirect branch. 3809 3810 // In practice we should never have an undef eflags operand, if we do 3811 // abort here as we are not prepared to preserve the flag. 3812 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef()) 3813 return true; 3814 3815 // Working from the bottom, handle the first conditional branch. 
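    // The first conditional branch seen while scanning bottom-up establishes
    // TBB; an unconditional JMP already recorded above it (if any) becomes the
    // false destination FBB.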
3816 if (Cond.empty()) { 3817 FBB = TBB; 3818 TBB = I->getOperand(0).getMBB(); 3819 Cond.push_back(MachineOperand::CreateImm(BranchCode)); 3820 CondBranches.push_back(&*I); 3821 continue; 3822 } 3823 3824 // Handle subsequent conditional branches. Only handle the case where all 3825 // conditional branches branch to the same destination and their condition 3826 // opcodes fit one of the special multi-branch idioms. 3827 assert(Cond.size() == 1); 3828 assert(TBB); 3829 3830 // If the conditions are the same, we can leave them alone. 3831 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); 3832 auto NewTBB = I->getOperand(0).getMBB(); 3833 if (OldBranchCode == BranchCode && TBB == NewTBB) 3834 continue; 3835 3836 // If they differ, see if they fit one of the known patterns. Theoretically, 3837 // we could handle more patterns here, but we shouldn't expect to see them 3838 // if instruction selection has done a reasonable job. 3839 if (TBB == NewTBB && 3840 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || 3841 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { 3842 BranchCode = X86::COND_NE_OR_P; 3843 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || 3844 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { 3845 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB))) 3846 return true; 3847 3848 // X86::COND_E_AND_NP usually has two different branch destinations. 3849 // 3850 // JP B1 3851 // JE B2 3852 // JMP B1 3853 // B1: 3854 // B2: 3855 // 3856 // Here this condition branches to B2 only if NP && E. It has another 3857 // equivalent form: 3858 // 3859 // JNE B1 3860 // JNP B2 3861 // JMP B1 3862 // B1: 3863 // B2: 3864 // 3865 // Similarly it branches to B2 only if E && NP. That is why this condition 3866 // is named with COND_E_AND_NP. 3867 BranchCode = X86::COND_E_AND_NP; 3868 } else 3869 return true; 3870 3871 // Update the MachineOperand. 
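    // COND_NE_OR_P and COND_E_AND_NP are pseudo condition codes; insertBranch()
    // below re-expands them into a pair of JCC_1 instructions.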
3872 Cond[0].setImm(BranchCode); 3873 CondBranches.push_back(&*I); 3874 } 3875 3876 return false; 3877 } 3878 3879 bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 3880 MachineBasicBlock *&TBB, 3881 MachineBasicBlock *&FBB, 3882 SmallVectorImpl<MachineOperand> &Cond, 3883 bool AllowModify) const { 3884 SmallVector<MachineInstr *, 4> CondBranches; 3885 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); 3886 } 3887 3888 static int getJumpTableIndexFromAddr(const MachineInstr &MI) { 3889 const MCInstrDesc &Desc = MI.getDesc(); 3890 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); 3891 assert(MemRefBegin >= 0 && "instr should have memory operand"); 3892 MemRefBegin += X86II::getOperandBias(Desc); 3893 3894 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp); 3895 if (!MO.isJTI()) 3896 return -1; 3897 3898 return MO.getIndex(); 3899 } 3900 3901 static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, 3902 Register Reg) { 3903 if (!Reg.isVirtual()) 3904 return -1; 3905 MachineInstr *MI = MRI.getUniqueVRegDef(Reg); 3906 if (MI == nullptr) 3907 return -1; 3908 unsigned Opcode = MI->getOpcode(); 3909 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r) 3910 return -1; 3911 return getJumpTableIndexFromAddr(*MI); 3912 } 3913 3914 int X86InstrInfo::getJumpTableIndex(const MachineInstr &MI) const { 3915 unsigned Opcode = MI.getOpcode(); 3916 // Switch-jump pattern for non-PIC code looks like: 3917 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg 3918 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) { 3919 return getJumpTableIndexFromAddr(MI); 3920 } 3921 // The pattern for PIC code looks like: 3922 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X 3923 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg 3924 // %2 = ADD64rr %1, %0 3925 // JMP64r %2 3926 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) { 3927 Register Reg = MI.getOperand(0).getReg(); 3928 if (!Reg.isVirtual()) 3929 return -1; 3930 const MachineFunction &MF = *MI.getParent()->getParent(); 3931 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3932 MachineInstr *Add = MRI.getUniqueVRegDef(Reg); 3933 if (Add == nullptr) 3934 return -1; 3935 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr) 3936 return -1; 3937 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg()); 3938 if (JTI1 >= 0) 3939 return JTI1; 3940 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg()); 3941 if (JTI2 >= 0) 3942 return JTI2; 3943 } 3944 return -1; 3945 } 3946 3947 bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, 3948 MachineBranchPredicate &MBP, 3949 bool AllowModify) const { 3950 using namespace std::placeholders; 3951 3952 SmallVector<MachineOperand, 4> Cond; 3953 SmallVector<MachineInstr *, 4> CondBranches; 3954 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches, 3955 AllowModify)) 3956 return true; 3957 3958 if (Cond.size() != 1) 3959 return true; 3960 3961 assert(MBP.TrueDest && "expected!"); 3962 3963 if (!MBP.FalseDest) 3964 MBP.FalseDest = MBB.getNextNode(); 3965 3966 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3967 3968 MachineInstr *ConditionDef = nullptr; 3969 bool SingleUseCondition = true; 3970 3971 for (MachineInstr &MI : llvm::drop_begin(llvm::reverse(MBB))) { 3972 if (MI.modifiesRegister(X86::EFLAGS, TRI)) { 3973 ConditionDef = &MI; 3974 break; 3975 } 3976 3977 if (MI.readsRegister(X86::EFLAGS, TRI)) 3978 SingleUseCondition = false; 3979 } 3980 3981 if (!ConditionDef) 3982 return true; 3983 
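  // At this point ConditionDef is the closest EFLAGS def above the branch;
  // SingleUseCondition is still true only if nothing in between read EFLAGS.
  // It is further cleared below if EFLAGS is live into any successor.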
3984 if (SingleUseCondition) { 3985 for (auto *Succ : MBB.successors()) 3986 if (Succ->isLiveIn(X86::EFLAGS)) 3987 SingleUseCondition = false; 3988 } 3989 3990 MBP.ConditionDef = ConditionDef; 3991 MBP.SingleUseCondition = SingleUseCondition; 3992 3993 // Currently we only recognize the simple pattern: 3994 // 3995 // test %reg, %reg 3996 // je %label 3997 // 3998 const unsigned TestOpcode = 3999 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr; 4000 4001 if (ConditionDef->getOpcode() == TestOpcode && 4002 ConditionDef->getNumOperands() == 3 && 4003 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) && 4004 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) { 4005 MBP.LHS = ConditionDef->getOperand(0); 4006 MBP.RHS = MachineOperand::CreateImm(0); 4007 MBP.Predicate = Cond[0].getImm() == X86::COND_NE 4008 ? MachineBranchPredicate::PRED_NE 4009 : MachineBranchPredicate::PRED_EQ; 4010 return false; 4011 } 4012 4013 return true; 4014 } 4015 4016 unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, 4017 int *BytesRemoved) const { 4018 assert(!BytesRemoved && "code size not handled"); 4019 4020 MachineBasicBlock::iterator I = MBB.end(); 4021 unsigned Count = 0; 4022 4023 while (I != MBB.begin()) { 4024 --I; 4025 if (I->isDebugInstr()) 4026 continue; 4027 if (I->getOpcode() != X86::JMP_1 && 4028 X86::getCondFromBranch(*I) == X86::COND_INVALID) 4029 break; 4030 // Remove the branch. 4031 I->eraseFromParent(); 4032 I = MBB.end(); 4033 ++Count; 4034 } 4035 4036 return Count; 4037 } 4038 4039 unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, 4040 MachineBasicBlock *TBB, 4041 MachineBasicBlock *FBB, 4042 ArrayRef<MachineOperand> Cond, 4043 const DebugLoc &DL, int *BytesAdded) const { 4044 // Shouldn't be a fall through. 4045 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 4046 assert((Cond.size() == 1 || Cond.size() == 0) && 4047 "X86 branch conditions have one component!"); 4048 assert(!BytesAdded && "code size not handled"); 4049 4050 if (Cond.empty()) { 4051 // Unconditional branch? 4052 assert(!FBB && "Unconditional branch with multiple successors!"); 4053 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); 4054 return 1; 4055 } 4056 4057 // If FBB is null, it is implied to be a fall-through block. 4058 bool FallThru = FBB == nullptr; 4059 4060 // Conditional branch. 4061 unsigned Count = 0; 4062 X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); 4063 switch (CC) { 4064 case X86::COND_NE_OR_P: 4065 // Synthesize NE_OR_P with two branches. 4066 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE); 4067 ++Count; 4068 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P); 4069 ++Count; 4070 break; 4071 case X86::COND_E_AND_NP: 4072 // Use the next block of MBB as FBB if it is null. 4073 if (FBB == nullptr) { 4074 FBB = getFallThroughMBB(&MBB, TBB); 4075 assert(FBB && "MBB cannot be the last block in function when the false " 4076 "body is a fall-through."); 4077 } 4078 // Synthesize COND_E_AND_NP with two branches. 4079 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE); 4080 ++Count; 4081 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP); 4082 ++Count; 4083 break; 4084 default: { 4085 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC); 4086 ++Count; 4087 } 4088 } 4089 if (!FallThru) { 4090 // Two-way Conditional branch. Insert the second branch. 
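    // For example, a {TBB, FBB, COND_E} result from analyzeBranch() is
    // re-emitted here as "JCC_1 %TBB, COND_E" followed by the "JMP_1 %FBB"
    // built below.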
4091 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); 4092 ++Count; 4093 } 4094 return Count; 4095 } 4096 4097 bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 4098 ArrayRef<MachineOperand> Cond, 4099 Register DstReg, Register TrueReg, 4100 Register FalseReg, int &CondCycles, 4101 int &TrueCycles, int &FalseCycles) const { 4102 // Not all subtargets have cmov instructions. 4103 if (!Subtarget.canUseCMOV()) 4104 return false; 4105 if (Cond.size() != 1) 4106 return false; 4107 // We cannot do the composite conditions, at least not in SSA form. 4108 if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND) 4109 return false; 4110 4111 // Check register classes. 4112 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4113 const TargetRegisterClass *RC = 4114 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 4115 if (!RC) 4116 return false; 4117 4118 // We have cmov instructions for 16, 32, and 64 bit general purpose registers. 4119 if (X86::GR16RegClass.hasSubClassEq(RC) || 4120 X86::GR32RegClass.hasSubClassEq(RC) || 4121 X86::GR64RegClass.hasSubClassEq(RC)) { 4122 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy 4123 // Bridge. Probably Ivy Bridge as well. 4124 CondCycles = 2; 4125 TrueCycles = 2; 4126 FalseCycles = 2; 4127 return true; 4128 } 4129 4130 // Can't do vectors. 4131 return false; 4132 } 4133 4134 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, 4135 MachineBasicBlock::iterator I, 4136 const DebugLoc &DL, Register DstReg, 4137 ArrayRef<MachineOperand> Cond, Register TrueReg, 4138 Register FalseReg) const { 4139 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4140 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); 4141 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); 4142 assert(Cond.size() == 1 && "Invalid Cond array"); 4143 unsigned Opc = 4144 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8, 4145 false /*HasMemoryOperand*/, Subtarget.hasNDD()); 4146 BuildMI(MBB, I, DL, get(Opc), DstReg) 4147 .addReg(FalseReg) 4148 .addReg(TrueReg) 4149 .addImm(Cond[0].getImm()); 4150 } 4151 4152 /// Test if the given register is a physical h register. 4153 static bool isHReg(unsigned Reg) { 4154 return X86::GR8_ABCD_HRegClass.contains(Reg); 4155 } 4156 4157 // Try and copy between VR128/VR64 and GR64 registers. 4158 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, 4159 const X86Subtarget &Subtarget) { 4160 bool HasAVX = Subtarget.hasAVX(); 4161 bool HasAVX512 = Subtarget.hasAVX512(); 4162 bool HasEGPR = Subtarget.hasEGPR(); 4163 4164 // SrcReg(MaskReg) -> DestReg(GR64) 4165 // SrcReg(MaskReg) -> DestReg(GR32) 4166 4167 // All KMASK RegClasses hold the same k registers, can be tested against 4168 // anyone. 4169 if (X86::VK16RegClass.contains(SrcReg)) { 4170 if (X86::GR64RegClass.contains(DestReg)) { 4171 assert(Subtarget.hasBWI()); 4172 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk; 4173 } 4174 if (X86::GR32RegClass.contains(DestReg)) 4175 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk) 4176 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk); 4177 } 4178 4179 // SrcReg(GR64) -> DestReg(MaskReg) 4180 // SrcReg(GR32) -> DestReg(MaskReg) 4181 4182 // All KMASK RegClasses hold the same k registers, can be tested against 4183 // anyone. 4184 if (X86::VK16RegClass.contains(DestReg)) { 4185 if (X86::GR64RegClass.contains(SrcReg)) { 4186 assert(Subtarget.hasBWI()); 4187 return HasEGPR ? 
X86::KMOVQkr_EVEX : X86::KMOVQkr;
4188 }
4189 if (X86::GR32RegClass.contains(SrcReg))
4190 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4191 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4192 }
4193
4194 // SrcReg(VR128) -> DestReg(GR64)
4195 // SrcReg(VR64) -> DestReg(GR64)
4196 // SrcReg(GR64) -> DestReg(VR128)
4197 // SrcReg(GR64) -> DestReg(VR64)
4198
4199 if (X86::GR64RegClass.contains(DestReg)) {
4200 if (X86::VR128XRegClass.contains(SrcReg))
4201 // Copy from a VR128 register to a GR64 register.
4202 return HasAVX512 ? X86::VMOVPQIto64Zrr
4203 : HasAVX ? X86::VMOVPQIto64rr
4204 : X86::MOVPQIto64rr;
4205 if (X86::VR64RegClass.contains(SrcReg))
4206 // Copy from a VR64 register to a GR64 register.
4207 return X86::MMX_MOVD64from64rr;
4208 } else if (X86::GR64RegClass.contains(SrcReg)) {
4209 // Copy from a GR64 register to a VR128 register.
4210 if (X86::VR128XRegClass.contains(DestReg))
4211 return HasAVX512 ? X86::VMOV64toPQIZrr
4212 : HasAVX ? X86::VMOV64toPQIrr
4213 : X86::MOV64toPQIrr;
4214 // Copy from a GR64 register to a VR64 register.
4215 if (X86::VR64RegClass.contains(DestReg))
4216 return X86::MMX_MOVD64to64rr;
4217 }
4218
4219 // SrcReg(VR128) -> DestReg(GR32)
4220 // SrcReg(GR32) -> DestReg(VR128)
4221
4222 if (X86::GR32RegClass.contains(DestReg) &&
4223 X86::VR128XRegClass.contains(SrcReg))
4224 // Copy from a VR128 register to a GR32 register.
4225 return HasAVX512 ? X86::VMOVPDI2DIZrr
4226 : HasAVX ? X86::VMOVPDI2DIrr
4227 : X86::MOVPDI2DIrr;
4228
4229 if (X86::VR128XRegClass.contains(DestReg) &&
4230 X86::GR32RegClass.contains(SrcReg))
4231 // Copy from a GR32 register to a VR128 register.
4232 return HasAVX512 ? X86::VMOVDI2PDIZrr
4233 : HasAVX ? X86::VMOVDI2PDIrr
4234 : X86::MOVDI2PDIrr;
4235 return 0;
4236 }
4237
4238 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4239 MachineBasicBlock::iterator MI,
4240 const DebugLoc &DL, MCRegister DestReg,
4241 MCRegister SrcReg, bool KillSrc) const {
4242 // First deal with the normal symmetric copies.
4243 bool HasAVX = Subtarget.hasAVX();
4244 bool HasVLX = Subtarget.hasVLX();
4245 bool HasEGPR = Subtarget.hasEGPR();
4246 unsigned Opc = 0;
4247 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4248 Opc = X86::MOV64rr;
4249 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4250 Opc = X86::MOV32rr;
4251 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4252 Opc = X86::MOV16rr;
4253 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4254 // Copying to or from a physical H register on x86-64 requires a NOREX
4255 // move. Otherwise use a normal move.
4256 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4257 Opc = X86::MOV8rr_NOREX;
4258 // Both operands must be encodable without a REX prefix.
4259 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4260 "8-bit H register can not be copied outside GR8_NOREX");
4261 } else
4262 Opc = X86::MOV8rr;
4263 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4264 Opc = X86::MMX_MOVQ64rr;
4265 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4266 if (HasVLX)
4267 Opc = X86::VMOVAPSZ128rr;
4268 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4269 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4270 else {
4271 // If this is an extended register and we don't have VLX we need to use a
4272 // 512-bit move.
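      // Without VLX, XMM16-XMM31 are only reachable through 512-bit EVEX
      // instructions, so widen both operands to their containing ZMM
      // super-registers and copy those instead.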
4273 Opc = X86::VMOVAPSZrr; 4274 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4275 DestReg = 4276 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass); 4277 SrcReg = 4278 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); 4279 } 4280 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { 4281 if (HasVLX) 4282 Opc = X86::VMOVAPSZ256rr; 4283 else if (X86::VR256RegClass.contains(DestReg, SrcReg)) 4284 Opc = X86::VMOVAPSYrr; 4285 else { 4286 // If this an extended register and we don't have VLX we need to use a 4287 // 512-bit move. 4288 Opc = X86::VMOVAPSZrr; 4289 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4290 DestReg = 4291 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass); 4292 SrcReg = 4293 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); 4294 } 4295 } else if (X86::VR512RegClass.contains(DestReg, SrcReg)) 4296 Opc = X86::VMOVAPSZrr; 4297 // All KMASK RegClasses hold the same k registers, can be tested against 4298 // anyone. 4299 else if (X86::VK16RegClass.contains(DestReg, SrcReg)) 4300 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk) 4301 : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk); 4302 if (!Opc) 4303 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); 4304 4305 if (Opc) { 4306 BuildMI(MBB, MI, DL, get(Opc), DestReg) 4307 .addReg(SrcReg, getKillRegState(KillSrc)); 4308 return; 4309 } 4310 4311 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) { 4312 // FIXME: We use a fatal error here because historically LLVM has tried 4313 // lower some of these physreg copies and we want to ensure we get 4314 // reasonable bug reports if someone encounters a case no other testing 4315 // found. This path should be removed after the LLVM 7 release. 4316 report_fatal_error("Unable to copy EFLAGS physical register!"); 4317 } 4318 4319 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to " 4320 << RI.getName(DestReg) << '\n'); 4321 report_fatal_error("Cannot emit physreg copy instruction"); 4322 } 4323 4324 std::optional<DestSourcePair> 4325 X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 4326 if (MI.isMoveReg()) { 4327 // FIXME: Dirty hack for apparent invariant that doesn't hold when 4328 // subreg_to_reg is coalesced with ordinary copies, such that the bits that 4329 // were asserted as 0 are now undef. 4330 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg()) 4331 return std::nullopt; 4332 4333 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; 4334 } 4335 return std::nullopt; 4336 } 4337 4338 static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) { 4339 if (STI.hasFP16()) 4340 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; 4341 if (Load) 4342 return STI.hasAVX512() ? X86::VMOVSSZrm 4343 : STI.hasAVX() ? X86::VMOVSSrm 4344 : X86::MOVSSrm; 4345 else 4346 return STI.hasAVX512() ? X86::VMOVSSZmr 4347 : STI.hasAVX() ? 
X86::VMOVSSmr 4348 : X86::MOVSSmr; 4349 } 4350 4351 static unsigned getLoadStoreRegOpcode(Register Reg, 4352 const TargetRegisterClass *RC, 4353 bool IsStackAligned, 4354 const X86Subtarget &STI, bool Load) { 4355 bool HasAVX = STI.hasAVX(); 4356 bool HasAVX512 = STI.hasAVX512(); 4357 bool HasVLX = STI.hasVLX(); 4358 bool HasEGPR = STI.hasEGPR(); 4359 4360 assert(RC != nullptr && "Invalid target register class"); 4361 switch (STI.getRegisterInfo()->getSpillSize(*RC)) { 4362 default: 4363 llvm_unreachable("Unknown spill size"); 4364 case 1: 4365 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); 4366 if (STI.is64Bit()) 4367 // Copying to or from a physical H register on x86-64 requires a NOREX 4368 // move. Otherwise use a normal move. 4369 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) 4370 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; 4371 return Load ? X86::MOV8rm : X86::MOV8mr; 4372 case 2: 4373 if (X86::VK16RegClass.hasSubClassEq(RC)) 4374 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm) 4375 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk); 4376 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); 4377 return Load ? X86::MOV16rm : X86::MOV16mr; 4378 case 4: 4379 if (X86::GR32RegClass.hasSubClassEq(RC)) 4380 return Load ? X86::MOV32rm : X86::MOV32mr; 4381 if (X86::FR32XRegClass.hasSubClassEq(RC)) 4382 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt 4383 : HasAVX ? X86::VMOVSSrm_alt 4384 : X86::MOVSSrm_alt) 4385 : (HasAVX512 ? X86::VMOVSSZmr 4386 : HasAVX ? X86::VMOVSSmr 4387 : X86::MOVSSmr); 4388 if (X86::RFP32RegClass.hasSubClassEq(RC)) 4389 return Load ? X86::LD_Fp32m : X86::ST_Fp32m; 4390 if (X86::VK32RegClass.hasSubClassEq(RC)) { 4391 assert(STI.hasBWI() && "KMOVD requires BWI"); 4392 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm) 4393 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk); 4394 } 4395 // All of these mask pair classes have the same spill size, the same kind 4396 // of kmov instructions can be used with all of them. 4397 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) || 4398 X86::VK2PAIRRegClass.hasSubClassEq(RC) || 4399 X86::VK4PAIRRegClass.hasSubClassEq(RC) || 4400 X86::VK8PAIRRegClass.hasSubClassEq(RC) || 4401 X86::VK16PAIRRegClass.hasSubClassEq(RC)) 4402 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; 4403 if (X86::FR16RegClass.hasSubClassEq(RC) || 4404 X86::FR16XRegClass.hasSubClassEq(RC)) 4405 return getLoadStoreOpcodeForFP16(Load, STI); 4406 llvm_unreachable("Unknown 4-byte regclass"); 4407 case 8: 4408 if (X86::GR64RegClass.hasSubClassEq(RC)) 4409 return Load ? X86::MOV64rm : X86::MOV64mr; 4410 if (X86::FR64XRegClass.hasSubClassEq(RC)) 4411 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt 4412 : HasAVX ? X86::VMOVSDrm_alt 4413 : X86::MOVSDrm_alt) 4414 : (HasAVX512 ? X86::VMOVSDZmr 4415 : HasAVX ? X86::VMOVSDmr 4416 : X86::MOVSDmr); 4417 if (X86::VR64RegClass.hasSubClassEq(RC)) 4418 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; 4419 if (X86::RFP64RegClass.hasSubClassEq(RC)) 4420 return Load ? X86::LD_Fp64m : X86::ST_Fp64m; 4421 if (X86::VK64RegClass.hasSubClassEq(RC)) { 4422 assert(STI.hasBWI() && "KMOVQ requires BWI"); 4423 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm) 4424 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk); 4425 } 4426 llvm_unreachable("Unknown 8-byte regclass"); 4427 case 10: 4428 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); 4429 return Load ? 
X86::LD_Fp80m : X86::ST_FpP80m; 4430 case 16: { 4431 if (X86::VR128XRegClass.hasSubClassEq(RC)) { 4432 // If stack is realigned we can use aligned stores. 4433 if (IsStackAligned) 4434 return Load ? (HasVLX ? X86::VMOVAPSZ128rm 4435 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX 4436 : HasAVX ? X86::VMOVAPSrm 4437 : X86::MOVAPSrm) 4438 : (HasVLX ? X86::VMOVAPSZ128mr 4439 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX 4440 : HasAVX ? X86::VMOVAPSmr 4441 : X86::MOVAPSmr); 4442 else 4443 return Load ? (HasVLX ? X86::VMOVUPSZ128rm 4444 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX 4445 : HasAVX ? X86::VMOVUPSrm 4446 : X86::MOVUPSrm) 4447 : (HasVLX ? X86::VMOVUPSZ128mr 4448 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX 4449 : HasAVX ? X86::VMOVUPSmr 4450 : X86::MOVUPSmr); 4451 } 4452 llvm_unreachable("Unknown 16-byte regclass"); 4453 } 4454 case 32: 4455 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); 4456 // If stack is realigned we can use aligned stores. 4457 if (IsStackAligned) 4458 return Load ? (HasVLX ? X86::VMOVAPSZ256rm 4459 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX 4460 : X86::VMOVAPSYrm) 4461 : (HasVLX ? X86::VMOVAPSZ256mr 4462 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX 4463 : X86::VMOVAPSYmr); 4464 else 4465 return Load ? (HasVLX ? X86::VMOVUPSZ256rm 4466 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX 4467 : X86::VMOVUPSYrm) 4468 : (HasVLX ? X86::VMOVUPSZ256mr 4469 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX 4470 : X86::VMOVUPSYmr); 4471 case 64: 4472 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); 4473 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512"); 4474 if (IsStackAligned) 4475 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; 4476 else 4477 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; 4478 case 1024: 4479 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass"); 4480 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE"); 4481 #define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC) 4482 return Load ? 
GET_EGPR_IF_ENABLED(X86::TILELOADD) 4483 : GET_EGPR_IF_ENABLED(X86::TILESTORED); 4484 #undef GET_EGPR_IF_ENABLED 4485 } 4486 } 4487 4488 std::optional<ExtAddrMode> 4489 X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 4490 const TargetRegisterInfo *TRI) const { 4491 const MCInstrDesc &Desc = MemI.getDesc(); 4492 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); 4493 if (MemRefBegin < 0) 4494 return std::nullopt; 4495 4496 MemRefBegin += X86II::getOperandBias(Desc); 4497 4498 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg); 4499 if (!BaseOp.isReg()) // Can be an MO_FrameIndex 4500 return std::nullopt; 4501 4502 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp); 4503 // Displacement can be symbolic 4504 if (!DispMO.isImm()) 4505 return std::nullopt; 4506 4507 ExtAddrMode AM; 4508 AM.BaseReg = BaseOp.getReg(); 4509 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg(); 4510 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm(); 4511 AM.Displacement = DispMO.getImm(); 4512 return AM; 4513 } 4514 4515 bool X86InstrInfo::verifyInstruction(const MachineInstr &MI, 4516 StringRef &ErrInfo) const { 4517 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr); 4518 if (!AMOrNone) 4519 return true; 4520 4521 ExtAddrMode AM = *AMOrNone; 4522 assert(AM.Form == ExtAddrMode::Formula::Basic); 4523 if (AM.ScaledReg != X86::NoRegister) { 4524 switch (AM.Scale) { 4525 case 1: 4526 case 2: 4527 case 4: 4528 case 8: 4529 break; 4530 default: 4531 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8"; 4532 return false; 4533 } 4534 } 4535 if (!isInt<32>(AM.Displacement)) { 4536 ErrInfo = "Displacement in address must fit into 32-bit signed " 4537 "integer"; 4538 return false; 4539 } 4540 4541 return true; 4542 } 4543 4544 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, 4545 const Register Reg, 4546 int64_t &ImmVal) const { 4547 Register MovReg = Reg; 4548 const MachineInstr *MovMI = &MI; 4549 4550 // Follow use-def for SUBREG_TO_REG to find the real move immediate 4551 // instruction. It is quite common for x86-64. 4552 if (MI.isSubregToReg()) { 4553 // We use following pattern to setup 64b immediate. 4554 // %8:gr32 = MOV32r0 implicit-def dead $eflags 4555 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit 4556 if (!MI.getOperand(1).isImm()) 4557 return false; 4558 unsigned FillBits = MI.getOperand(1).getImm(); 4559 unsigned SubIdx = MI.getOperand(3).getImm(); 4560 MovReg = MI.getOperand(2).getReg(); 4561 if (SubIdx != X86::sub_32bit || FillBits != 0) 4562 return false; 4563 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4564 MovMI = MRI.getUniqueVRegDef(MovReg); 4565 if (!MovMI) 4566 return false; 4567 } 4568 4569 if (MovMI->getOpcode() == X86::MOV32r0 && 4570 MovMI->getOperand(0).getReg() == MovReg) { 4571 ImmVal = 0; 4572 return true; 4573 } 4574 4575 if (MovMI->getOpcode() != X86::MOV32ri && 4576 MovMI->getOpcode() != X86::MOV64ri && 4577 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri) 4578 return false; 4579 // Mov Src can be a global address. 
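  // (e.g. a MOV32ri whose operand is a global address materializes an address,
  // not an integer immediate, so there is no constant value to report.)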
4580 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg) 4581 return false; 4582 ImmVal = MovMI->getOperand(1).getImm(); 4583 return true; 4584 } 4585 4586 bool X86InstrInfo::preservesZeroValueInReg( 4587 const MachineInstr *MI, const Register NullValueReg, 4588 const TargetRegisterInfo *TRI) const { 4589 if (!MI->modifiesRegister(NullValueReg, TRI)) 4590 return true; 4591 switch (MI->getOpcode()) { 4592 // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax 4593 // X. 4594 case X86::SHR64ri: 4595 case X86::SHR32ri: 4596 case X86::SHL64ri: 4597 case X86::SHL32ri: 4598 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() && 4599 "expected for shift opcode!"); 4600 return MI->getOperand(0).getReg() == NullValueReg && 4601 MI->getOperand(1).getReg() == NullValueReg; 4602 // Zero extend of a sub-reg of NullValueReg into itself does not change the 4603 // null value. 4604 case X86::MOV32rr: 4605 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) { 4606 return TRI->isSubRegisterEq(NullValueReg, MO.getReg()); 4607 }); 4608 default: 4609 return false; 4610 } 4611 llvm_unreachable("Should be handled above!"); 4612 } 4613 4614 bool X86InstrInfo::getMemOperandsWithOffsetWidth( 4615 const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps, 4616 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, 4617 const TargetRegisterInfo *TRI) const { 4618 const MCInstrDesc &Desc = MemOp.getDesc(); 4619 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); 4620 if (MemRefBegin < 0) 4621 return false; 4622 4623 MemRefBegin += X86II::getOperandBias(Desc); 4624 4625 const MachineOperand *BaseOp = 4626 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); 4627 if (!BaseOp->isReg()) // Can be an MO_FrameIndex 4628 return false; 4629 4630 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) 4631 return false; 4632 4633 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != 4634 X86::NoRegister) 4635 return false; 4636 4637 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp); 4638 4639 // Displacement can be symbolic 4640 if (!DispMO.isImm()) 4641 return false; 4642 4643 Offset = DispMO.getImm(); 4644 4645 if (!BaseOp->isReg()) 4646 return false; 4647 4648 OffsetIsScalable = false; 4649 // FIXME: Relying on memoperands() may not be right thing to do here. Check 4650 // with X86 maintainers, and fix it accordingly. For now, it is ok, since 4651 // there is no use of `Width` for X86 back-end at the moment. 4652 Width = 4653 !MemOp.memoperands_empty() ? 
MemOp.memoperands().front()->getSize() : 0; 4654 BaseOps.push_back(BaseOp); 4655 return true; 4656 } 4657 4658 static unsigned getStoreRegOpcode(Register SrcReg, 4659 const TargetRegisterClass *RC, 4660 bool IsStackAligned, 4661 const X86Subtarget &STI) { 4662 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false); 4663 } 4664 4665 static unsigned getLoadRegOpcode(Register DestReg, 4666 const TargetRegisterClass *RC, 4667 bool IsStackAligned, const X86Subtarget &STI) { 4668 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true); 4669 } 4670 4671 static bool isAMXOpcode(unsigned Opc) { 4672 switch (Opc) { 4673 default: 4674 return false; 4675 case X86::TILELOADD: 4676 case X86::TILESTORED: 4677 case X86::TILELOADD_EVEX: 4678 case X86::TILESTORED_EVEX: 4679 return true; 4680 } 4681 } 4682 4683 void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, 4684 MachineBasicBlock::iterator MI, 4685 unsigned Opc, Register Reg, int FrameIdx, 4686 bool isKill) const { 4687 switch (Opc) { 4688 default: 4689 llvm_unreachable("Unexpected special opcode!"); 4690 case X86::TILESTORED: 4691 case X86::TILESTORED_EVEX: { 4692 // tilestored %tmm, (%sp, %idx) 4693 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); 4694 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); 4695 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); 4696 MachineInstr *NewMI = 4697 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) 4698 .addReg(Reg, getKillRegState(isKill)); 4699 MachineOperand &MO = NewMI->getOperand(X86::AddrIndexReg); 4700 MO.setReg(VirtReg); 4701 MO.setIsKill(true); 4702 break; 4703 } 4704 case X86::TILELOADD: 4705 case X86::TILELOADD_EVEX: { 4706 // tileloadd (%sp, %idx), %tmm 4707 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); 4708 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); 4709 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); 4710 MachineInstr *NewMI = addFrameReference( 4711 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx); 4712 MachineOperand &MO = NewMI->getOperand(1 + X86::AddrIndexReg); 4713 MO.setReg(VirtReg); 4714 MO.setIsKill(true); 4715 break; 4716 } 4717 } 4718 } 4719 4720 void X86InstrInfo::storeRegToStackSlot( 4721 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, 4722 bool isKill, int FrameIdx, const TargetRegisterClass *RC, 4723 const TargetRegisterInfo *TRI, Register VReg) const { 4724 const MachineFunction &MF = *MBB.getParent(); 4725 const MachineFrameInfo &MFI = MF.getFrameInfo(); 4726 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && 4727 "Stack slot too small for store"); 4728 4729 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); 4730 bool isAligned = 4731 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || 4732 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); 4733 4734 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); 4735 if (isAMXOpcode(Opc)) 4736 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill); 4737 else 4738 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) 4739 .addReg(SrcReg, getKillRegState(isKill)); 4740 } 4741 4742 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 4743 MachineBasicBlock::iterator MI, 4744 Register DestReg, int FrameIdx, 4745 const TargetRegisterClass *RC, 4746 const TargetRegisterInfo *TRI, 4747 Register VReg) const { 4748 const MachineFunction &MF = 
*MBB.getParent(); 4749 const MachineFrameInfo &MFI = MF.getFrameInfo(); 4750 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && 4751 "Load size exceeds stack slot"); 4752 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); 4753 bool isAligned = 4754 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || 4755 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); 4756 4757 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); 4758 if (isAMXOpcode(Opc)) 4759 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx); 4760 else 4761 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), 4762 FrameIdx); 4763 } 4764 4765 bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 4766 Register &SrcReg2, int64_t &CmpMask, 4767 int64_t &CmpValue) const { 4768 switch (MI.getOpcode()) { 4769 default: 4770 break; 4771 case X86::CMP64ri32: 4772 case X86::CMP32ri: 4773 case X86::CMP16ri: 4774 case X86::CMP8ri: 4775 SrcReg = MI.getOperand(0).getReg(); 4776 SrcReg2 = 0; 4777 if (MI.getOperand(1).isImm()) { 4778 CmpMask = ~0; 4779 CmpValue = MI.getOperand(1).getImm(); 4780 } else { 4781 CmpMask = CmpValue = 0; 4782 } 4783 return true; 4784 // A SUB can be used to perform comparison. 4785 CASE_ND(SUB64rm) 4786 CASE_ND(SUB32rm) 4787 CASE_ND(SUB16rm) 4788 CASE_ND(SUB8rm) 4789 SrcReg = MI.getOperand(1).getReg(); 4790 SrcReg2 = 0; 4791 CmpMask = 0; 4792 CmpValue = 0; 4793 return true; 4794 CASE_ND(SUB64rr) 4795 CASE_ND(SUB32rr) 4796 CASE_ND(SUB16rr) 4797 CASE_ND(SUB8rr) 4798 SrcReg = MI.getOperand(1).getReg(); 4799 SrcReg2 = MI.getOperand(2).getReg(); 4800 CmpMask = 0; 4801 CmpValue = 0; 4802 return true; 4803 CASE_ND(SUB64ri32) 4804 CASE_ND(SUB32ri) 4805 CASE_ND(SUB16ri) 4806 CASE_ND(SUB8ri) 4807 SrcReg = MI.getOperand(1).getReg(); 4808 SrcReg2 = 0; 4809 if (MI.getOperand(2).isImm()) { 4810 CmpMask = ~0; 4811 CmpValue = MI.getOperand(2).getImm(); 4812 } else { 4813 CmpMask = CmpValue = 0; 4814 } 4815 return true; 4816 case X86::CMP64rr: 4817 case X86::CMP32rr: 4818 case X86::CMP16rr: 4819 case X86::CMP8rr: 4820 SrcReg = MI.getOperand(0).getReg(); 4821 SrcReg2 = MI.getOperand(1).getReg(); 4822 CmpMask = 0; 4823 CmpValue = 0; 4824 return true; 4825 case X86::TEST8rr: 4826 case X86::TEST16rr: 4827 case X86::TEST32rr: 4828 case X86::TEST64rr: 4829 SrcReg = MI.getOperand(0).getReg(); 4830 if (MI.getOperand(1).getReg() != SrcReg) 4831 return false; 4832 // Compare against zero. 
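    // TEST r,r with identical operands computes r & r, so its flags are those
    // of comparing r against zero: report CmpMask = ~0, CmpValue = 0.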
4833 SrcReg2 = 0; 4834 CmpMask = ~0; 4835 CmpValue = 0; 4836 return true; 4837 } 4838 return false; 4839 } 4840 4841 bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, 4842 Register SrcReg, Register SrcReg2, 4843 int64_t ImmMask, int64_t ImmValue, 4844 const MachineInstr &OI, bool *IsSwapped, 4845 int64_t *ImmDelta) const { 4846 switch (OI.getOpcode()) { 4847 case X86::CMP64rr: 4848 case X86::CMP32rr: 4849 case X86::CMP16rr: 4850 case X86::CMP8rr: 4851 CASE_ND(SUB64rr) 4852 CASE_ND(SUB32rr) 4853 CASE_ND(SUB16rr) 4854 CASE_ND(SUB8rr) { 4855 Register OISrcReg; 4856 Register OISrcReg2; 4857 int64_t OIMask; 4858 int64_t OIValue; 4859 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) || 4860 OIMask != ImmMask || OIValue != ImmValue) 4861 return false; 4862 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) { 4863 *IsSwapped = false; 4864 return true; 4865 } 4866 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) { 4867 *IsSwapped = true; 4868 return true; 4869 } 4870 return false; 4871 } 4872 case X86::CMP64ri32: 4873 case X86::CMP32ri: 4874 case X86::CMP16ri: 4875 case X86::CMP8ri: 4876 CASE_ND(SUB64ri32) 4877 CASE_ND(SUB32ri) 4878 CASE_ND(SUB16ri) 4879 CASE_ND(SUB8ri) 4880 case X86::TEST64rr: 4881 case X86::TEST32rr: 4882 case X86::TEST16rr: 4883 case X86::TEST8rr: { 4884 if (ImmMask != 0) { 4885 Register OISrcReg; 4886 Register OISrcReg2; 4887 int64_t OIMask; 4888 int64_t OIValue; 4889 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) && 4890 SrcReg == OISrcReg && ImmMask == OIMask) { 4891 if (OIValue == ImmValue) { 4892 *ImmDelta = 0; 4893 return true; 4894 } else if (static_cast<uint64_t>(ImmValue) == 4895 static_cast<uint64_t>(OIValue) - 1) { 4896 *ImmDelta = -1; 4897 return true; 4898 } else if (static_cast<uint64_t>(ImmValue) == 4899 static_cast<uint64_t>(OIValue) + 1) { 4900 *ImmDelta = 1; 4901 return true; 4902 } else { 4903 return false; 4904 } 4905 } 4906 } 4907 return FlagI.isIdenticalTo(OI); 4908 } 4909 default: 4910 return false; 4911 } 4912 } 4913 4914 /// Check whether the definition can be converted 4915 /// to remove a comparison against zero. 4916 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, 4917 bool &ClearsOverflowFlag) { 4918 NoSignFlag = false; 4919 ClearsOverflowFlag = false; 4920 4921 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and 4922 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during 4923 // Initial Exec to Local Exec relaxation. In these cases, we must not depend 4924 // on the EFLAGS modification of ADD actually happening in the final binary. 4925 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) { 4926 unsigned Flags = MI.getOperand(5).getTargetFlags(); 4927 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF || 4928 Flags == X86II::MO_GOTNTPOFF) 4929 return false; 4930 } 4931 4932 switch (MI.getOpcode()) { 4933 default: 4934 return false; 4935 4936 // The shift instructions only modify ZF if their shift count is non-zero. 4937 // N.B.: The processor truncates the shift count depending on the encoding. 4938 CASE_ND(SAR8ri) 4939 CASE_ND(SAR16ri) 4940 CASE_ND(SAR32ri) 4941 CASE_ND(SAR64ri) 4942 CASE_ND(SHR8ri) 4943 CASE_ND(SHR16ri) 4944 CASE_ND(SHR32ri) 4945 CASE_ND(SHR64ri) 4946 return getTruncatedShiftCount(MI, 2) != 0; 4947 4948 // Some left shift instructions can be turned into LEA instructions but only 4949 // if their flags aren't used. Avoid transforming such instructions. 
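  // (Shift amounts of 1-3 are the ones an LEA scale of 2/4/8 can express;
  // that is what isTruncatedShiftCountForLEA() checks below.)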
4950 CASE_ND(SHL8ri) 4951 CASE_ND(SHL16ri) 4952 CASE_ND(SHL32ri) 4953 CASE_ND(SHL64ri) { 4954 unsigned ShAmt = getTruncatedShiftCount(MI, 2); 4955 if (isTruncatedShiftCountForLEA(ShAmt)) 4956 return false; 4957 return ShAmt != 0; 4958 } 4959 4960 CASE_ND(SHRD16rri8) 4961 CASE_ND(SHRD32rri8) 4962 CASE_ND(SHRD64rri8) 4963 CASE_ND(SHLD16rri8) 4964 CASE_ND(SHLD32rri8) 4965 CASE_ND(SHLD64rri8) 4966 return getTruncatedShiftCount(MI, 3) != 0; 4967 4968 CASE_ND(SUB64ri32) 4969 CASE_ND(SUB32ri) 4970 CASE_ND(SUB16ri) 4971 CASE_ND(SUB8ri) 4972 CASE_ND(SUB64rr) 4973 CASE_ND(SUB32rr) 4974 CASE_ND(SUB16rr) 4975 CASE_ND(SUB8rr) 4976 CASE_ND(SUB64rm) 4977 CASE_ND(SUB32rm) 4978 CASE_ND(SUB16rm) 4979 CASE_ND(SUB8rm) 4980 CASE_ND(DEC64r) 4981 CASE_ND(DEC32r) 4982 CASE_ND(DEC16r) 4983 CASE_ND(DEC8r) 4984 CASE_ND(ADD64ri32) 4985 CASE_ND(ADD32ri) 4986 CASE_ND(ADD16ri) 4987 CASE_ND(ADD8ri) 4988 CASE_ND(ADD64rr) 4989 CASE_ND(ADD32rr) 4990 CASE_ND(ADD16rr) 4991 CASE_ND(ADD8rr) 4992 CASE_ND(ADD64rm) 4993 CASE_ND(ADD32rm) 4994 CASE_ND(ADD16rm) 4995 CASE_ND(ADD8rm) 4996 CASE_ND(INC64r) 4997 CASE_ND(INC32r) 4998 CASE_ND(INC16r) 4999 CASE_ND(INC8r) 5000 CASE_ND(ADC64ri32) 5001 CASE_ND(ADC32ri) 5002 CASE_ND(ADC16ri) 5003 CASE_ND(ADC8ri) 5004 CASE_ND(ADC64rr) 5005 CASE_ND(ADC32rr) 5006 CASE_ND(ADC16rr) 5007 CASE_ND(ADC8rr) 5008 CASE_ND(ADC64rm) 5009 CASE_ND(ADC32rm) 5010 CASE_ND(ADC16rm) 5011 CASE_ND(ADC8rm) 5012 CASE_ND(SBB64ri32) 5013 CASE_ND(SBB32ri) 5014 CASE_ND(SBB16ri) 5015 CASE_ND(SBB8ri) 5016 CASE_ND(SBB64rr) 5017 CASE_ND(SBB32rr) 5018 CASE_ND(SBB16rr) 5019 CASE_ND(SBB8rr) 5020 CASE_ND(SBB64rm) 5021 CASE_ND(SBB32rm) 5022 CASE_ND(SBB16rm) 5023 CASE_ND(SBB8rm) 5024 CASE_ND(NEG8r) 5025 CASE_ND(NEG16r) 5026 CASE_ND(NEG32r) 5027 CASE_ND(NEG64r) 5028 case X86::LZCNT16rr: 5029 case X86::LZCNT16rm: 5030 case X86::LZCNT32rr: 5031 case X86::LZCNT32rm: 5032 case X86::LZCNT64rr: 5033 case X86::LZCNT64rm: 5034 case X86::POPCNT16rr: 5035 case X86::POPCNT16rm: 5036 case X86::POPCNT32rr: 5037 case X86::POPCNT32rm: 5038 case X86::POPCNT64rr: 5039 case X86::POPCNT64rm: 5040 case X86::TZCNT16rr: 5041 case X86::TZCNT16rm: 5042 case X86::TZCNT32rr: 5043 case X86::TZCNT32rm: 5044 case X86::TZCNT64rr: 5045 case X86::TZCNT64rm: 5046 return true; 5047 CASE_ND(AND64ri32) 5048 CASE_ND(AND32ri) 5049 CASE_ND(AND16ri) 5050 CASE_ND(AND8ri) 5051 CASE_ND(AND64rr) 5052 CASE_ND(AND32rr) 5053 CASE_ND(AND16rr) 5054 CASE_ND(AND8rr) 5055 CASE_ND(AND64rm) 5056 CASE_ND(AND32rm) 5057 CASE_ND(AND16rm) 5058 CASE_ND(AND8rm) 5059 CASE_ND(XOR64ri32) 5060 CASE_ND(XOR32ri) 5061 CASE_ND(XOR16ri) 5062 CASE_ND(XOR8ri) 5063 CASE_ND(XOR64rr) 5064 CASE_ND(XOR32rr) 5065 CASE_ND(XOR16rr) 5066 CASE_ND(XOR8rr) 5067 CASE_ND(XOR64rm) 5068 CASE_ND(XOR32rm) 5069 CASE_ND(XOR16rm) 5070 CASE_ND(XOR8rm) 5071 CASE_ND(OR64ri32) 5072 CASE_ND(OR32ri) 5073 CASE_ND(OR16ri) 5074 CASE_ND(OR8ri) 5075 CASE_ND(OR64rr) 5076 CASE_ND(OR32rr) 5077 CASE_ND(OR16rr) 5078 CASE_ND(OR8rr) 5079 CASE_ND(OR64rm) 5080 CASE_ND(OR32rm) 5081 CASE_ND(OR16rm) 5082 CASE_ND(OR8rm) 5083 case X86::ANDN32rr: 5084 case X86::ANDN32rm: 5085 case X86::ANDN64rr: 5086 case X86::ANDN64rm: 5087 case X86::BLSI32rr: 5088 case X86::BLSI32rm: 5089 case X86::BLSI64rr: 5090 case X86::BLSI64rm: 5091 case X86::BLSMSK32rr: 5092 case X86::BLSMSK32rm: 5093 case X86::BLSMSK64rr: 5094 case X86::BLSMSK64rm: 5095 case X86::BLSR32rr: 5096 case X86::BLSR32rm: 5097 case X86::BLSR64rr: 5098 case X86::BLSR64rm: 5099 case X86::BLCFILL32rr: 5100 case X86::BLCFILL32rm: 5101 case X86::BLCFILL64rr: 5102 case X86::BLCFILL64rm: 5103 case 
X86::BLCI32rr: 5104 case X86::BLCI32rm: 5105 case X86::BLCI64rr: 5106 case X86::BLCI64rm: 5107 case X86::BLCIC32rr: 5108 case X86::BLCIC32rm: 5109 case X86::BLCIC64rr: 5110 case X86::BLCIC64rm: 5111 case X86::BLCMSK32rr: 5112 case X86::BLCMSK32rm: 5113 case X86::BLCMSK64rr: 5114 case X86::BLCMSK64rm: 5115 case X86::BLCS32rr: 5116 case X86::BLCS32rm: 5117 case X86::BLCS64rr: 5118 case X86::BLCS64rm: 5119 case X86::BLSFILL32rr: 5120 case X86::BLSFILL32rm: 5121 case X86::BLSFILL64rr: 5122 case X86::BLSFILL64rm: 5123 case X86::BLSIC32rr: 5124 case X86::BLSIC32rm: 5125 case X86::BLSIC64rr: 5126 case X86::BLSIC64rm: 5127 case X86::BZHI32rr: 5128 case X86::BZHI32rm: 5129 case X86::BZHI64rr: 5130 case X86::BZHI64rm: 5131 case X86::T1MSKC32rr: 5132 case X86::T1MSKC32rm: 5133 case X86::T1MSKC64rr: 5134 case X86::T1MSKC64rm: 5135 case X86::TZMSK32rr: 5136 case X86::TZMSK32rm: 5137 case X86::TZMSK64rr: 5138 case X86::TZMSK64rm: 5139 // These instructions clear the overflow flag just like TEST. 5140 // FIXME: These are not the only instructions in this switch that clear the 5141 // overflow flag. 5142 ClearsOverflowFlag = true; 5143 return true; 5144 case X86::BEXTR32rr: 5145 case X86::BEXTR64rr: 5146 case X86::BEXTR32rm: 5147 case X86::BEXTR64rm: 5148 case X86::BEXTRI32ri: 5149 case X86::BEXTRI32mi: 5150 case X86::BEXTRI64ri: 5151 case X86::BEXTRI64mi: 5152 // BEXTR doesn't update the sign flag so we can't use it. It does clear 5153 // the overflow flag, but that's not useful without the sign flag. 5154 NoSignFlag = true; 5155 return true; 5156 } 5157 } 5158 5159 /// Check whether the use can be converted to remove a comparison against zero. 5160 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { 5161 switch (MI.getOpcode()) { 5162 default: 5163 return X86::COND_INVALID; 5164 CASE_ND(NEG8r) 5165 CASE_ND(NEG16r) 5166 CASE_ND(NEG32r) 5167 CASE_ND(NEG64r) 5168 return X86::COND_AE; 5169 case X86::LZCNT16rr: 5170 case X86::LZCNT32rr: 5171 case X86::LZCNT64rr: 5172 return X86::COND_B; 5173 case X86::POPCNT16rr: 5174 case X86::POPCNT32rr: 5175 case X86::POPCNT64rr: 5176 return X86::COND_E; 5177 case X86::TZCNT16rr: 5178 case X86::TZCNT32rr: 5179 case X86::TZCNT64rr: 5180 return X86::COND_B; 5181 case X86::BSF16rr: 5182 case X86::BSF32rr: 5183 case X86::BSF64rr: 5184 case X86::BSR16rr: 5185 case X86::BSR32rr: 5186 case X86::BSR64rr: 5187 return X86::COND_E; 5188 case X86::BLSI32rr: 5189 case X86::BLSI64rr: 5190 return X86::COND_AE; 5191 case X86::BLSR32rr: 5192 case X86::BLSR64rr: 5193 case X86::BLSMSK32rr: 5194 case X86::BLSMSK64rr: 5195 return X86::COND_B; 5196 // TODO: TBM instructions. 5197 } 5198 } 5199 5200 /// Check if there exists an earlier instruction that 5201 /// operates on the same source operands and sets flags in the same way as 5202 /// Compare; remove Compare if possible. 5203 bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, 5204 Register SrcReg2, int64_t CmpMask, 5205 int64_t CmpValue, 5206 const MachineRegisterInfo *MRI) const { 5207 // Check whether we can replace SUB with CMP. 
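  // SUB and CMP update EFLAGS identically; the only difference is that SUB
  // also writes its destination, so the rewrite is legal whenever that def has
  // no remaining (non-debug) uses.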
5208 switch (CmpInstr.getOpcode()) { 5209 default: 5210 break; 5211 CASE_ND(SUB64ri32) 5212 CASE_ND(SUB32ri) 5213 CASE_ND(SUB16ri) 5214 CASE_ND(SUB8ri) 5215 CASE_ND(SUB64rm) 5216 CASE_ND(SUB32rm) 5217 CASE_ND(SUB16rm) 5218 CASE_ND(SUB8rm) 5219 CASE_ND(SUB64rr) 5220 CASE_ND(SUB32rr) 5221 CASE_ND(SUB16rr) 5222 CASE_ND(SUB8rr) { 5223 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 5224 return false; 5225 // There is no use of the destination register, we can replace SUB with CMP. 5226 unsigned NewOpcode = 0; 5227 #define FROM_TO(A, B) \ 5228 CASE_ND(A) NewOpcode = X86::B; \ 5229 break; 5230 switch (CmpInstr.getOpcode()) { 5231 default: 5232 llvm_unreachable("Unreachable!"); 5233 FROM_TO(SUB64rm, CMP64rm) 5234 FROM_TO(SUB32rm, CMP32rm) 5235 FROM_TO(SUB16rm, CMP16rm) 5236 FROM_TO(SUB8rm, CMP8rm) 5237 FROM_TO(SUB64rr, CMP64rr) 5238 FROM_TO(SUB32rr, CMP32rr) 5239 FROM_TO(SUB16rr, CMP16rr) 5240 FROM_TO(SUB8rr, CMP8rr) 5241 FROM_TO(SUB64ri32, CMP64ri32) 5242 FROM_TO(SUB32ri, CMP32ri) 5243 FROM_TO(SUB16ri, CMP16ri) 5244 FROM_TO(SUB8ri, CMP8ri) 5245 } 5246 #undef FROM_TO 5247 CmpInstr.setDesc(get(NewOpcode)); 5248 CmpInstr.removeOperand(0); 5249 // Mutating this instruction invalidates any debug data associated with it. 5250 CmpInstr.dropDebugNumber(); 5251 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. 5252 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || 5253 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) 5254 return false; 5255 } 5256 } 5257 5258 // The following code tries to remove the comparison by re-using EFLAGS 5259 // from earlier instructions. 5260 5261 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0); 5262 5263 // Transformation currently requires SSA values. 5264 if (SrcReg2.isPhysical()) 5265 return false; 5266 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg); 5267 assert(SrcRegDef && "Must have a definition (SSA)"); 5268 5269 MachineInstr *MI = nullptr; 5270 MachineInstr *Sub = nullptr; 5271 MachineInstr *Movr0Inst = nullptr; 5272 bool NoSignFlag = false; 5273 bool ClearsOverflowFlag = false; 5274 bool ShouldUpdateCC = false; 5275 bool IsSwapped = false; 5276 X86::CondCode NewCC = X86::COND_INVALID; 5277 int64_t ImmDelta = 0; 5278 5279 // Search backward from CmpInstr for the next instruction defining EFLAGS. 5280 const TargetRegisterInfo *TRI = &getRegisterInfo(); 5281 MachineBasicBlock &CmpMBB = *CmpInstr.getParent(); 5282 MachineBasicBlock::reverse_iterator From = 5283 std::next(MachineBasicBlock::reverse_iterator(CmpInstr)); 5284 for (MachineBasicBlock *MBB = &CmpMBB;;) { 5285 for (MachineInstr &Inst : make_range(From, MBB->rend())) { 5286 // Try to use EFLAGS from the instruction defining %SrcReg. Example: 5287 // %eax = addl ... 5288 // ... // EFLAGS not changed 5289 // testl %eax, %eax // <-- can be removed 5290 if (&Inst == SrcRegDef) { 5291 if (IsCmpZero && 5292 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) { 5293 MI = &Inst; 5294 break; 5295 } 5296 5297 // Look back for the following pattern, in which case the 5298 // test16rr/test64rr instruction could be erased. 5299 // 5300 // Example for test16rr: 5301 // %reg = and32ri %in_reg, 5 5302 // ... // EFLAGS not changed. 5303 // %src_reg = copy %reg.sub_16bit:gr32 5304 // test16rr %src_reg, %src_reg, implicit-def $eflags 5305 // Example for test64rr: 5306 // %reg = and32ri %in_reg, 5 5307 // ... // EFLAGS not changed. 
5308 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index 5309 // test64rr %src_reg, %src_reg, implicit-def $eflags 5310 MachineInstr *AndInstr = nullptr; 5311 if (IsCmpZero && 5312 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI, 5313 NoSignFlag, ClearsOverflowFlag)) { 5314 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode())); 5315 MI = AndInstr; 5316 break; 5317 } 5318 // Cannot find other candidates before definition of SrcReg. 5319 return false; 5320 } 5321 5322 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) { 5323 // Try to use EFLAGS produced by an instruction reading %SrcReg. 5324 // Example: 5325 // %eax = ... 5326 // ... 5327 // popcntl %eax 5328 // ... // EFLAGS not changed 5329 // testl %eax, %eax // <-- can be removed 5330 if (IsCmpZero) { 5331 NewCC = isUseDefConvertible(Inst); 5332 if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() && 5333 Inst.getOperand(1).getReg() == SrcReg) { 5334 ShouldUpdateCC = true; 5335 MI = &Inst; 5336 break; 5337 } 5338 } 5339 5340 // Try to use EFLAGS from an instruction with similar flag results. 5341 // Example: 5342 // sub x, y or cmp x, y 5343 // ... // EFLAGS not changed 5344 // cmp x, y // <-- can be removed 5345 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue, 5346 Inst, &IsSwapped, &ImmDelta)) { 5347 Sub = &Inst; 5348 break; 5349 } 5350 5351 // MOV32r0 is implemented with xor which clobbers condition code. It is 5352 // safe to move up, if the definition to EFLAGS is dead and earlier 5353 // instructions do not read or write EFLAGS. 5354 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 && 5355 Inst.registerDefIsDead(X86::EFLAGS, TRI)) { 5356 Movr0Inst = &Inst; 5357 continue; 5358 } 5359 5360 // Cannot do anything for any other EFLAG changes. 5361 return false; 5362 } 5363 } 5364 5365 if (MI || Sub) 5366 break; 5367 5368 // Reached begin of basic block. Continue in predecessor if there is 5369 // exactly one. 5370 if (MBB->pred_size() != 1) 5371 return false; 5372 MBB = *MBB->pred_begin(); 5373 From = MBB->rbegin(); 5374 } 5375 5376 // Scan forward from the instruction after CmpInstr for uses of EFLAGS. 5377 // It is safe to remove CmpInstr if EFLAGS is redefined or killed. 5378 // If we are done with the basic block, we need to check whether EFLAGS is 5379 // live-out. 5380 bool FlagsMayLiveOut = true; 5381 SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate; 5382 MachineBasicBlock::iterator AfterCmpInstr = 5383 std::next(MachineBasicBlock::iterator(CmpInstr)); 5384 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) { 5385 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); 5386 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); 5387 // We should check the usage if this instruction uses and updates EFLAGS. 5388 if (!UseEFLAGS && ModifyEFLAGS) { 5389 // It is safe to remove CmpInstr if EFLAGS is updated again. 5390 FlagsMayLiveOut = false; 5391 break; 5392 } 5393 if (!UseEFLAGS && !ModifyEFLAGS) 5394 continue; 5395 5396 // EFLAGS is used by this instruction. 5397 X86::CondCode OldCC = X86::getCondFromMI(Instr); 5398 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID) 5399 return false; 5400 5401 X86::CondCode ReplacementCC = X86::COND_INVALID; 5402 if (MI) { 5403 switch (OldCC) { 5404 default: 5405 break; 5406 case X86::COND_A: 5407 case X86::COND_AE: 5408 case X86::COND_B: 5409 case X86::COND_BE: 5410 // CF is used, we can't perform this optimization. 
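      // (The convertible defs do not reproduce the CF a real compare against
      // zero would produce; INC/DEC, for instance, leave CF untouched.)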
5411 return false; 5412 case X86::COND_G: 5413 case X86::COND_GE: 5414 case X86::COND_L: 5415 case X86::COND_LE: 5416 // If SF is used, but the instruction doesn't update the SF, then we 5417 // can't do the optimization. 5418 if (NoSignFlag) 5419 return false; 5420 [[fallthrough]]; 5421 case X86::COND_O: 5422 case X86::COND_NO: 5423 // If OF is used, the instruction needs to clear it like CmpZero does. 5424 if (!ClearsOverflowFlag) 5425 return false; 5426 break; 5427 case X86::COND_S: 5428 case X86::COND_NS: 5429 // If SF is used, but the instruction doesn't update the SF, then we 5430 // can't do the optimization. 5431 if (NoSignFlag) 5432 return false; 5433 break; 5434 } 5435 5436 // If we're updating the condition code check if we have to reverse the 5437 // condition. 5438 if (ShouldUpdateCC) 5439 switch (OldCC) { 5440 default: 5441 return false; 5442 case X86::COND_E: 5443 ReplacementCC = NewCC; 5444 break; 5445 case X86::COND_NE: 5446 ReplacementCC = GetOppositeBranchCondition(NewCC); 5447 break; 5448 } 5449 } else if (IsSwapped) { 5450 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs 5451 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. 5452 // We swap the condition code and synthesize the new opcode. 5453 ReplacementCC = getSwappedCondition(OldCC); 5454 if (ReplacementCC == X86::COND_INVALID) 5455 return false; 5456 ShouldUpdateCC = true; 5457 } else if (ImmDelta != 0) { 5458 unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); 5459 // Shift amount for min/max constants to adjust for 8/16/32 instruction 5460 // sizes. 5461 switch (OldCC) { 5462 case X86::COND_L: // x <s (C + 1) --> x <=s C 5463 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) 5464 return false; 5465 ReplacementCC = X86::COND_LE; 5466 break; 5467 case X86::COND_B: // x <u (C + 1) --> x <=u C 5468 if (ImmDelta != 1 || CmpValue == 0) 5469 return false; 5470 ReplacementCC = X86::COND_BE; 5471 break; 5472 case X86::COND_GE: // x >=s (C + 1) --> x >s C 5473 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) 5474 return false; 5475 ReplacementCC = X86::COND_G; 5476 break; 5477 case X86::COND_AE: // x >=u (C + 1) --> x >u C 5478 if (ImmDelta != 1 || CmpValue == 0) 5479 return false; 5480 ReplacementCC = X86::COND_A; 5481 break; 5482 case X86::COND_G: // x >s (C - 1) --> x >=s C 5483 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) 5484 return false; 5485 ReplacementCC = X86::COND_GE; 5486 break; 5487 case X86::COND_A: // x >u (C - 1) --> x >=u C 5488 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) 5489 return false; 5490 ReplacementCC = X86::COND_AE; 5491 break; 5492 case X86::COND_LE: // x <=s (C - 1) --> x <s C 5493 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) 5494 return false; 5495 ReplacementCC = X86::COND_L; 5496 break; 5497 case X86::COND_BE: // x <=u (C - 1) --> x <u C 5498 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) 5499 return false; 5500 ReplacementCC = X86::COND_B; 5501 break; 5502 default: 5503 return false; 5504 } 5505 ShouldUpdateCC = true; 5506 } 5507 5508 if (ShouldUpdateCC && ReplacementCC != OldCC) { 5509 // Push the MachineInstr to OpsToUpdate. 5510 // If it is safe to remove CmpInstr, the condition code of these 5511 // instructions will be modified. 
      OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
    }
    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
      FlagsMayLiveOut = false;
      break;
    }
  }

  // If we have to update users but EFLAGS is live-out, abort, since we cannot
  // easily find all of the users.
  if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
    for (MachineBasicBlock *Successor : CmpMBB.successors())
      if (Successor->isLiveIn(X86::EFLAGS))
        return false;
  }

  // The instruction to be updated is either Sub or MI.
  assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
  Sub = MI != nullptr ? MI : Sub;
  MachineBasicBlock *SubBB = Sub->getParent();
  // Move Movr0Inst to the appropriate place before Sub.
  if (Movr0Inst) {
    // Only move within the same block so we don't accidentally move to a
    // block with higher execution frequency.
    if (&CmpMBB != SubBB)
      return false;
    // Look backwards until we find a def that doesn't use the current EFLAGS.
    MachineBasicBlock::reverse_iterator InsertI = Sub,
                                        InsertE = Sub->getParent()->rend();
    for (; InsertI != InsertE; ++InsertI) {
      MachineInstr *Instr = &*InsertI;
      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
        Movr0Inst->getParent()->remove(Movr0Inst);
        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
                                   Movr0Inst);
        break;
      }
    }
    if (InsertI == InsertE)
      return false;
  }

  // Make sure the Sub instruction defines EFLAGS and mark the def live.
  MachineOperand *FlagDef =
      Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
  assert(FlagDef && "Unable to locate a def EFLAGS operand");
  FlagDef->setIsDead(false);

  CmpInstr.eraseFromParent();

  // Modify the condition code of instructions in OpsToUpdate.
  for (auto &Op : OpsToUpdate) {
    Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
        .setImm(Op.second);
  }
  // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
  for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
       MBB = *MBB->pred_begin()) {
    assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
    if (!MBB->isLiveIn(X86::EFLAGS))
      MBB->addLiveIn(X86::EFLAGS);
  }
  return true;
}

/// Try to remove the load by folding it into a register operand at the use.
/// The load is folded only if it defines a virtual register, that virtual
/// register has a single use within the same basic block, and the
/// instructions in-between do not load or store and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                              const MachineRegisterInfo *MRI,
                                              Register &FoldAsLoadDefReg,
                                              MachineInstr *&DefMI) const {
  // Check whether we can move DefMI here.
  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
  assert(DefMI);
  bool SawStore = false;
  if (!DefMI->isSafeToMove(nullptr, SawStore))
    return nullptr;

  // Collect information about virtual register operands of MI.
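  // Added note for illustration (not part of the upstream comment):
  // SrcOperandIds collects every operand index of MI that reads
  // FoldAsLoadDefReg, e.g. for
  //   %1 = MOV32rm %stack.0, 1, $noreg, 0, $noreg
  //   %2 = ADD32rr %1, %1, implicit-def dead $eflags
  // it ends up holding {1, 2}, and those indices are then passed to
  // foldMemoryOperand to rewrite the uses against the memory operand.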
5595 SmallVector<unsigned, 1> SrcOperandIds; 5596 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 5597 MachineOperand &MO = MI.getOperand(i); 5598 if (!MO.isReg()) 5599 continue; 5600 Register Reg = MO.getReg(); 5601 if (Reg != FoldAsLoadDefReg) 5602 continue; 5603 // Do not fold if we have a subreg use or a def. 5604 if (MO.getSubReg() || MO.isDef()) 5605 return nullptr; 5606 SrcOperandIds.push_back(i); 5607 } 5608 if (SrcOperandIds.empty()) 5609 return nullptr; 5610 5611 // Check whether we can fold the def into SrcOperandId. 5612 if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { 5613 FoldAsLoadDefReg = 0; 5614 return FoldMI; 5615 } 5616 5617 return nullptr; 5618 } 5619 5620 /// \returns true if the instruction can be changed to COPY when imm is 0. 5621 static bool canConvert2Copy(unsigned Opc) { 5622 switch (Opc) { 5623 default: 5624 return false; 5625 CASE_ND(ADD64ri32) 5626 CASE_ND(SUB64ri32) 5627 CASE_ND(OR64ri32) 5628 CASE_ND(XOR64ri32) 5629 CASE_ND(ADD32ri) 5630 CASE_ND(SUB32ri) 5631 CASE_ND(OR32ri) 5632 CASE_ND(XOR32ri) 5633 return true; 5634 } 5635 } 5636 5637 /// Convert an ALUrr opcode to corresponding ALUri opcode. Such as 5638 /// ADD32rr ==> ADD32ri 5639 static unsigned convertALUrr2ALUri(unsigned Opc) { 5640 switch (Opc) { 5641 default: 5642 return 0; 5643 #define FROM_TO(FROM, TO) \ 5644 case X86::FROM: \ 5645 return X86::TO; \ 5646 case X86::FROM##_ND: \ 5647 return X86::TO##_ND; 5648 FROM_TO(ADD64rr, ADD64ri32) 5649 FROM_TO(ADC64rr, ADC64ri32) 5650 FROM_TO(SUB64rr, SUB64ri32) 5651 FROM_TO(SBB64rr, SBB64ri32) 5652 FROM_TO(AND64rr, AND64ri32) 5653 FROM_TO(OR64rr, OR64ri32) 5654 FROM_TO(XOR64rr, XOR64ri32) 5655 FROM_TO(SHR64rCL, SHR64ri) 5656 FROM_TO(SHL64rCL, SHL64ri) 5657 FROM_TO(SAR64rCL, SAR64ri) 5658 FROM_TO(ROL64rCL, ROL64ri) 5659 FROM_TO(ROR64rCL, ROR64ri) 5660 FROM_TO(RCL64rCL, RCL64ri) 5661 FROM_TO(RCR64rCL, RCR64ri) 5662 FROM_TO(ADD32rr, ADD32ri) 5663 FROM_TO(ADC32rr, ADC32ri) 5664 FROM_TO(SUB32rr, SUB32ri) 5665 FROM_TO(SBB32rr, SBB32ri) 5666 FROM_TO(AND32rr, AND32ri) 5667 FROM_TO(OR32rr, OR32ri) 5668 FROM_TO(XOR32rr, XOR32ri) 5669 FROM_TO(SHR32rCL, SHR32ri) 5670 FROM_TO(SHL32rCL, SHL32ri) 5671 FROM_TO(SAR32rCL, SAR32ri) 5672 FROM_TO(ROL32rCL, ROL32ri) 5673 FROM_TO(ROR32rCL, ROR32ri) 5674 FROM_TO(RCL32rCL, RCL32ri) 5675 FROM_TO(RCR32rCL, RCR32ri) 5676 #undef FROM_TO 5677 #define FROM_TO(FROM, TO) \ 5678 case X86::FROM: \ 5679 return X86::TO; 5680 FROM_TO(TEST64rr, TEST64ri32) 5681 FROM_TO(CTEST64rr, CTEST64ri32) 5682 FROM_TO(CMP64rr, CMP64ri32) 5683 FROM_TO(CCMP64rr, CCMP64ri32) 5684 FROM_TO(TEST32rr, TEST32ri) 5685 FROM_TO(CTEST32rr, CTEST32ri) 5686 FROM_TO(CMP32rr, CMP32ri) 5687 FROM_TO(CCMP32rr, CCMP32ri) 5688 #undef FROM_TO 5689 } 5690 } 5691 5692 /// Reg is assigned ImmVal in DefMI, and is used in UseMI. 5693 /// If MakeChange is true, this function tries to replace Reg by ImmVal in 5694 /// UseMI. If MakeChange is false, just check if folding is possible. 5695 // 5696 /// \returns true if folding is successful or possible. 5697 bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI, 5698 Register Reg, int64_t ImmVal, 5699 MachineRegisterInfo *MRI, 5700 bool MakeChange) const { 5701 bool Modified = false; 5702 5703 // 64 bit operations accept sign extended 32 bit immediates. 5704 // 32 bit operations accept all 32 bit immediates, so we don't need to check 5705 // them. 
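  // For example (illustrative, not part of the upstream comment): ADD64ri32
  // and the other 64-bit ALU forms only encode a sign-extended 32-bit
  // immediate, so a constant such as 0x100000000 produced by MOV64ri cannot
  // be folded into them; the isInt<32> check below rejects that case for
  // GR64 registers.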
5706 const TargetRegisterClass *RC = nullptr; 5707 if (Reg.isVirtual()) 5708 RC = MRI->getRegClass(Reg); 5709 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) || 5710 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) { 5711 if (!isInt<32>(ImmVal)) 5712 return false; 5713 } 5714 5715 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg()) 5716 return false; 5717 // Immediate has larger code size than register. So avoid folding the 5718 // immediate if it has more than 1 use and we are optimizing for size. 5719 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() && 5720 !MRI->hasOneNonDBGUse(Reg)) 5721 return false; 5722 5723 unsigned Opc = UseMI.getOpcode(); 5724 unsigned NewOpc; 5725 if (Opc == TargetOpcode::COPY) { 5726 Register ToReg = UseMI.getOperand(0).getReg(); 5727 const TargetRegisterClass *RC = nullptr; 5728 if (ToReg.isVirtual()) 5729 RC = MRI->getRegClass(ToReg); 5730 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) || 5731 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg)); 5732 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) || 5733 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg)); 5734 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) || 5735 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg)); 5736 5737 if (ImmVal == 0) { 5738 // We have MOV32r0 only. 5739 if (!GR32Reg) 5740 return false; 5741 } 5742 5743 if (GR64Reg) { 5744 if (isUInt<32>(ImmVal)) 5745 NewOpc = X86::MOV32ri64; 5746 else 5747 NewOpc = X86::MOV64ri; 5748 } else if (GR32Reg) { 5749 NewOpc = X86::MOV32ri; 5750 if (ImmVal == 0) { 5751 // MOV32r0 clobbers EFLAGS. 5752 const TargetRegisterInfo *TRI = &getRegisterInfo(); 5753 if (UseMI.getParent()->computeRegisterLiveness( 5754 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead) 5755 return false; 5756 5757 // MOV32r0 is different than other cases because it doesn't encode the 5758 // immediate in the instruction. So we directly modify it here. 5759 if (!MakeChange) 5760 return true; 5761 UseMI.setDesc(get(X86::MOV32r0)); 5762 UseMI.removeOperand( 5763 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr)); 5764 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true, 5765 /*isImp=*/true, 5766 /*isKill=*/false, 5767 /*isDead=*/true)); 5768 Modified = true; 5769 } 5770 } else if (GR8Reg) 5771 NewOpc = X86::MOV8ri; 5772 else 5773 return false; 5774 } else 5775 NewOpc = convertALUrr2ALUri(Opc); 5776 5777 if (!NewOpc) 5778 return false; 5779 5780 // For SUB instructions the immediate can only be the second source operand. 5781 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri || 5782 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri || 5783 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND || 5784 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) && 5785 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2) 5786 return false; 5787 // For CMP instructions the immediate can only be at index 1. 
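  // Illustration (added comment): CMP64rr sets flags from
  // "operand 0 - operand 1", and the immediate form CMP64ri32 keeps the
  // register on the left, so only a register appearing at operand index 1
  // can be replaced by the immediate; the check below rejects the other
  // position.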
5788 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) || 5789 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) && 5790 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1) 5791 return false; 5792 5793 using namespace X86; 5794 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) || 5795 isRCL(Opc) || isRCR(Opc)) { 5796 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr); 5797 if (RegIdx < 2) 5798 return false; 5799 if (!isInt<8>(ImmVal)) 5800 return false; 5801 assert(Reg == X86::CL); 5802 5803 if (!MakeChange) 5804 return true; 5805 UseMI.setDesc(get(NewOpc)); 5806 UseMI.removeOperand(RegIdx); 5807 UseMI.addOperand(MachineOperand::CreateImm(ImmVal)); 5808 // Reg is physical register $cl, so we don't know if DefMI is dead through 5809 // MRI. Let the caller handle it, or pass dead-mi-elimination can delete 5810 // the dead physical register define instruction. 5811 return true; 5812 } 5813 5814 if (!MakeChange) 5815 return true; 5816 5817 if (!Modified) { 5818 // Modify the instruction. 5819 if (ImmVal == 0 && canConvert2Copy(NewOpc) && 5820 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) { 5821 // %100 = add %101, 0 5822 // ==> 5823 // %100 = COPY %101 5824 UseMI.setDesc(get(TargetOpcode::COPY)); 5825 UseMI.removeOperand( 5826 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr)); 5827 UseMI.removeOperand( 5828 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr)); 5829 UseMI.untieRegOperand(0); 5830 UseMI.clearFlag(MachineInstr::MIFlag::NoSWrap); 5831 UseMI.clearFlag(MachineInstr::MIFlag::NoUWrap); 5832 } else { 5833 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; 5834 unsigned ImmOpNum = 2; 5835 if (!UseMI.getOperand(0).isDef()) { 5836 Op1 = 0; // TEST, CMP, CTEST, CCMP 5837 ImmOpNum = 1; 5838 } 5839 if (Opc == TargetOpcode::COPY) 5840 ImmOpNum = 1; 5841 if (findCommutedOpIndices(UseMI, Op1, Op2) && 5842 UseMI.getOperand(Op1).getReg() == Reg) 5843 commuteInstruction(UseMI); 5844 5845 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg); 5846 UseMI.setDesc(get(NewOpc)); 5847 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal); 5848 } 5849 } 5850 5851 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg)) 5852 DefMI->eraseFromBundle(); 5853 5854 return true; 5855 } 5856 5857 /// foldImmediate - 'Reg' is known to be defined by a move immediate 5858 /// instruction, try to fold the immediate into the use instruction. 5859 bool X86InstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 5860 Register Reg, MachineRegisterInfo *MRI) const { 5861 int64_t ImmVal; 5862 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal)) 5863 return false; 5864 5865 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true); 5866 } 5867 5868 /// Expand a single-def pseudo instruction to a two-addr 5869 /// instruction with two undef reads of the register being defined. 5870 /// This is used for mapping: 5871 /// %xmm4 = V_SET0 5872 /// to: 5873 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4 5874 /// 5875 static bool Expand2AddrUndef(MachineInstrBuilder &MIB, 5876 const MCInstrDesc &Desc) { 5877 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); 5878 Register Reg = MIB.getReg(0); 5879 MIB->setDesc(Desc); 5880 5881 // MachineInstr::addOperand() will insert explicit operands before any 5882 // implicit operands. 5883 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); 5884 // But we don't trust that. 
5885 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand"); 5886 return true; 5887 } 5888 5889 /// Expand a single-def pseudo instruction to a two-addr 5890 /// instruction with two %k0 reads. 5891 /// This is used for mapping: 5892 /// %k4 = K_SET1 5893 /// to: 5894 /// %k4 = KXNORrr %k0, %k0 5895 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, 5896 Register Reg) { 5897 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); 5898 MIB->setDesc(Desc); 5899 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); 5900 return true; 5901 } 5902 5903 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, 5904 bool MinusOne) { 5905 MachineBasicBlock &MBB = *MIB->getParent(); 5906 const DebugLoc &DL = MIB->getDebugLoc(); 5907 Register Reg = MIB.getReg(0); 5908 5909 // Insert the XOR. 5910 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) 5911 .addReg(Reg, RegState::Undef) 5912 .addReg(Reg, RegState::Undef); 5913 5914 // Turn the pseudo into an INC or DEC. 5915 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); 5916 MIB.addReg(Reg); 5917 5918 return true; 5919 } 5920 5921 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, 5922 const TargetInstrInfo &TII, 5923 const X86Subtarget &Subtarget) { 5924 MachineBasicBlock &MBB = *MIB->getParent(); 5925 const DebugLoc &DL = MIB->getDebugLoc(); 5926 int64_t Imm = MIB->getOperand(1).getImm(); 5927 assert(Imm != 0 && "Using push/pop for 0 is not efficient."); 5928 MachineBasicBlock::iterator I = MIB.getInstr(); 5929 5930 int StackAdjustment; 5931 5932 if (Subtarget.is64Bit()) { 5933 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || 5934 MIB->getOpcode() == X86::MOV32ImmSExti8); 5935 5936 // Can't use push/pop lowering if the function might write to the red zone. 5937 X86MachineFunctionInfo *X86FI = 5938 MBB.getParent()->getInfo<X86MachineFunctionInfo>(); 5939 if (X86FI->getUsesRedZone()) { 5940 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8 5941 ? X86::MOV32ri 5942 : X86::MOV64ri)); 5943 return true; 5944 } 5945 5946 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and 5947 // widen the register if necessary. 5948 StackAdjustment = 8; 5949 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm); 5950 MIB->setDesc(TII.get(X86::POP64r)); 5951 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64)); 5952 } else { 5953 assert(MIB->getOpcode() == X86::MOV32ImmSExti8); 5954 StackAdjustment = 4; 5955 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm); 5956 MIB->setDesc(TII.get(X86::POP32r)); 5957 } 5958 MIB->removeOperand(1); 5959 MIB->addImplicitDefUseOperands(*MBB.getParent()); 5960 5961 // Build CFI if necessary. 5962 MachineFunction &MF = *MBB.getParent(); 5963 const X86FrameLowering *TFL = Subtarget.getFrameLowering(); 5964 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); 5965 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves(); 5966 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; 5967 if (EmitCFI) { 5968 TFL->BuildCFI( 5969 MBB, I, DL, 5970 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); 5971 TFL->BuildCFI( 5972 MBB, std::next(I), DL, 5973 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); 5974 } 5975 5976 return true; 5977 } 5978 5979 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different 5980 // code sequence is needed for other targets. 
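// For illustration (added comment), the expansion below produces roughly:
//   %reg = MOV64rm $rip, 1, $noreg, @__stack_chk_guard (gotpcrel), $noreg
//   %reg = MOV64rm killed %reg, 1, $noreg, 0, $noreg
// i.e. first load the guard's address from the GOT, then load the guard
// value itself. The actual global comes from the pseudo's memory operand;
// __stack_chk_guard is just the usual MachO symbol name.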
5981 static void expandLoadStackGuard(MachineInstrBuilder &MIB, 5982 const TargetInstrInfo &TII) { 5983 MachineBasicBlock &MBB = *MIB->getParent(); 5984 const DebugLoc &DL = MIB->getDebugLoc(); 5985 Register Reg = MIB.getReg(0); 5986 const GlobalValue *GV = 5987 cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); 5988 auto Flags = MachineMemOperand::MOLoad | 5989 MachineMemOperand::MODereferenceable | 5990 MachineMemOperand::MOInvariant; 5991 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( 5992 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8)); 5993 MachineBasicBlock::iterator I = MIB.getInstr(); 5994 5995 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg) 5996 .addReg(X86::RIP) 5997 .addImm(1) 5998 .addReg(0) 5999 .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL) 6000 .addReg(0) 6001 .addMemOperand(MMO); 6002 MIB->setDebugLoc(DL); 6003 MIB->setDesc(TII.get(X86::MOV64rm)); 6004 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); 6005 } 6006 6007 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { 6008 MachineBasicBlock &MBB = *MIB->getParent(); 6009 MachineFunction &MF = *MBB.getParent(); 6010 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); 6011 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 6012 unsigned XorOp = 6013 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr; 6014 MIB->setDesc(TII.get(XorOp)); 6015 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef); 6016 return true; 6017 } 6018 6019 // This is used to handle spills for 128/256-bit registers when we have AVX512, 6020 // but not VLX. If it uses an extended register we need to use an instruction 6021 // that loads the lower 128/256-bit, but is available with only AVX512F. 6022 static bool expandNOVLXLoad(MachineInstrBuilder &MIB, 6023 const TargetRegisterInfo *TRI, 6024 const MCInstrDesc &LoadDesc, 6025 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { 6026 Register DestReg = MIB.getReg(0); 6027 // Check if DestReg is XMM16-31 or YMM16-31. 6028 if (TRI->getEncodingValue(DestReg) < 16) { 6029 // We can use a normal VEX encoded load. 6030 MIB->setDesc(LoadDesc); 6031 } else { 6032 // Use a 128/256-bit VBROADCAST instruction. 6033 MIB->setDesc(BroadcastDesc); 6034 // Change the destination to a 512-bit register. 6035 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass); 6036 MIB->getOperand(0).setReg(DestReg); 6037 } 6038 return true; 6039 } 6040 6041 // This is used to handle spills for 128/256-bit registers when we have AVX512, 6042 // but not VLX. If it uses an extended register we need to use an instruction 6043 // that stores the lower 128/256-bit, but is available with only AVX512F. 6044 static bool expandNOVLXStore(MachineInstrBuilder &MIB, 6045 const TargetRegisterInfo *TRI, 6046 const MCInstrDesc &StoreDesc, 6047 const MCInstrDesc &ExtractDesc, unsigned SubIdx) { 6048 Register SrcReg = MIB.getReg(X86::AddrNumOperands); 6049 // Check if DestReg is XMM16-31 or YMM16-31. 6050 if (TRI->getEncodingValue(SrcReg) < 16) { 6051 // We can use a normal VEX encoded store. 6052 MIB->setDesc(StoreDesc); 6053 } else { 6054 // Use a VEXTRACTF instruction. 6055 MIB->setDesc(ExtractDesc); 6056 // Change the destination to a 512-bit register. 6057 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass); 6058 MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg); 6059 MIB.addImm(0x0); // Append immediate to extract from the lower bits. 
6060 } 6061 6062 return true; 6063 } 6064 6065 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { 6066 MIB->setDesc(Desc); 6067 int64_t ShiftAmt = MIB->getOperand(2).getImm(); 6068 // Temporarily remove the immediate so we can add another source register. 6069 MIB->removeOperand(2); 6070 // Add the register. Don't copy the kill flag if there is one. 6071 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); 6072 // Add back the immediate. 6073 MIB.addImm(ShiftAmt); 6074 return true; 6075 } 6076 6077 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 6078 bool HasAVX = Subtarget.hasAVX(); 6079 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); 6080 switch (MI.getOpcode()) { 6081 case X86::MOV32r0: 6082 return Expand2AddrUndef(MIB, get(X86::XOR32rr)); 6083 case X86::MOV32r1: 6084 return expandMOV32r1(MIB, *this, /*MinusOne=*/false); 6085 case X86::MOV32r_1: 6086 return expandMOV32r1(MIB, *this, /*MinusOne=*/true); 6087 case X86::MOV32ImmSExti8: 6088 case X86::MOV64ImmSExti8: 6089 return ExpandMOVImmSExti8(MIB, *this, Subtarget); 6090 case X86::SETB_C32r: 6091 return Expand2AddrUndef(MIB, get(X86::SBB32rr)); 6092 case X86::SETB_C64r: 6093 return Expand2AddrUndef(MIB, get(X86::SBB64rr)); 6094 case X86::MMX_SET0: 6095 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr)); 6096 case X86::V_SET0: 6097 case X86::FsFLD0SS: 6098 case X86::FsFLD0SD: 6099 case X86::FsFLD0SH: 6100 case X86::FsFLD0F128: 6101 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); 6102 case X86::AVX_SET0: { 6103 assert(HasAVX && "AVX not supported"); 6104 const TargetRegisterInfo *TRI = &getRegisterInfo(); 6105 Register SrcReg = MIB.getReg(0); 6106 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); 6107 MIB->getOperand(0).setReg(XReg); 6108 Expand2AddrUndef(MIB, get(X86::VXORPSrr)); 6109 MIB.addReg(SrcReg, RegState::ImplicitDefine); 6110 return true; 6111 } 6112 case X86::AVX512_128_SET0: 6113 case X86::AVX512_FsFLD0SH: 6114 case X86::AVX512_FsFLD0SS: 6115 case X86::AVX512_FsFLD0SD: 6116 case X86::AVX512_FsFLD0F128: { 6117 bool HasVLX = Subtarget.hasVLX(); 6118 Register SrcReg = MIB.getReg(0); 6119 const TargetRegisterInfo *TRI = &getRegisterInfo(); 6120 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) 6121 return Expand2AddrUndef(MIB, 6122 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); 6123 // Extended register without VLX. Use a larger XOR. 6124 SrcReg = 6125 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); 6126 MIB->getOperand(0).setReg(SrcReg); 6127 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); 6128 } 6129 case X86::AVX512_256_SET0: 6130 case X86::AVX512_512_SET0: { 6131 bool HasVLX = Subtarget.hasVLX(); 6132 Register SrcReg = MIB.getReg(0); 6133 const TargetRegisterInfo *TRI = &getRegisterInfo(); 6134 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { 6135 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); 6136 MIB->getOperand(0).setReg(XReg); 6137 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); 6138 MIB.addReg(SrcReg, RegState::ImplicitDefine); 6139 return true; 6140 } 6141 if (MI.getOpcode() == X86::AVX512_256_SET0) { 6142 // No VLX so we must reference a zmm. 6143 unsigned ZReg = 6144 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); 6145 MIB->getOperand(0).setReg(ZReg); 6146 } 6147 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); 6148 } 6149 case X86::V_SETALLONES: 6150 return Expand2AddrUndef(MIB, 6151 get(HasAVX ? 
X86::VPCMPEQDrr : X86::PCMPEQDrr)); 6152 case X86::AVX2_SETALLONES: 6153 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); 6154 case X86::AVX1_SETALLONES: { 6155 Register Reg = MIB.getReg(0); 6156 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. 6157 MIB->setDesc(get(X86::VCMPPSYrri)); 6158 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); 6159 return true; 6160 } 6161 case X86::AVX512_512_SETALLONES: { 6162 Register Reg = MIB.getReg(0); 6163 MIB->setDesc(get(X86::VPTERNLOGDZrri)); 6164 // VPTERNLOGD needs 3 register inputs and an immediate. 6165 // 0xff will return 1s for any input. 6166 MIB.addReg(Reg, RegState::Undef) 6167 .addReg(Reg, RegState::Undef) 6168 .addReg(Reg, RegState::Undef) 6169 .addImm(0xff); 6170 return true; 6171 } 6172 case X86::AVX512_512_SEXT_MASK_32: 6173 case X86::AVX512_512_SEXT_MASK_64: { 6174 Register Reg = MIB.getReg(0); 6175 Register MaskReg = MIB.getReg(1); 6176 unsigned MaskState = getRegState(MIB->getOperand(1)); 6177 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) 6178 ? X86::VPTERNLOGQZrrikz 6179 : X86::VPTERNLOGDZrrikz; 6180 MI.removeOperand(1); 6181 MIB->setDesc(get(Opc)); 6182 // VPTERNLOG needs 3 register inputs and an immediate. 6183 // 0xff will return 1s for any input. 6184 MIB.addReg(Reg, RegState::Undef) 6185 .addReg(MaskReg, MaskState) 6186 .addReg(Reg, RegState::Undef) 6187 .addReg(Reg, RegState::Undef) 6188 .addImm(0xff); 6189 return true; 6190 } 6191 case X86::VMOVAPSZ128rm_NOVLX: 6192 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), 6193 get(X86::VBROADCASTF32X4rm), X86::sub_xmm); 6194 case X86::VMOVUPSZ128rm_NOVLX: 6195 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm), 6196 get(X86::VBROADCASTF32X4rm), X86::sub_xmm); 6197 case X86::VMOVAPSZ256rm_NOVLX: 6198 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm), 6199 get(X86::VBROADCASTF64X4rm), X86::sub_ymm); 6200 case X86::VMOVUPSZ256rm_NOVLX: 6201 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm), 6202 get(X86::VBROADCASTF64X4rm), X86::sub_ymm); 6203 case X86::VMOVAPSZ128mr_NOVLX: 6204 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr), 6205 get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); 6206 case X86::VMOVUPSZ128mr_NOVLX: 6207 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr), 6208 get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); 6209 case X86::VMOVAPSZ256mr_NOVLX: 6210 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr), 6211 get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); 6212 case X86::VMOVUPSZ256mr_NOVLX: 6213 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), 6214 get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); 6215 case X86::MOV32ri64: { 6216 Register Reg = MIB.getReg(0); 6217 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); 6218 MI.setDesc(get(X86::MOV32ri)); 6219 MIB->getOperand(0).setReg(Reg32); 6220 MIB.addReg(Reg, RegState::ImplicitDefine); 6221 return true; 6222 } 6223 6224 case X86::RDFLAGS32: 6225 case X86::RDFLAGS64: { 6226 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64; 6227 MachineBasicBlock &MBB = *MIB->getParent(); 6228 6229 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(), 6230 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32)) 6231 .getInstr(); 6232 6233 // Permit reads of the EFLAGS and DF registers without them being defined. 
6234 // This intrinsic exists to read external processor state in flags, such as 6235 // the trap flag, interrupt flag, and direction flag, none of which are 6236 // modeled by the backend. 6237 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS && 6238 "Unexpected register in operand! Should be EFLAGS."); 6239 NewMI->getOperand(2).setIsUndef(); 6240 assert(NewMI->getOperand(3).getReg() == X86::DF && 6241 "Unexpected register in operand! Should be DF."); 6242 NewMI->getOperand(3).setIsUndef(); 6243 6244 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r)); 6245 return true; 6246 } 6247 6248 case X86::WRFLAGS32: 6249 case X86::WRFLAGS64: { 6250 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64; 6251 MachineBasicBlock &MBB = *MIB->getParent(); 6252 6253 BuildMI(MBB, MI, MIB->getDebugLoc(), 6254 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) 6255 .addReg(MI.getOperand(0).getReg()); 6256 BuildMI(MBB, MI, MIB->getDebugLoc(), 6257 get(Is64Bit ? X86::POPF64 : X86::POPF32)); 6258 MI.eraseFromParent(); 6259 return true; 6260 } 6261 6262 // KNL does not recognize dependency-breaking idioms for mask registers, 6263 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. 6264 // Using %k0 as the undef input register is a performance heuristic based 6265 // on the assumption that %k0 is used less frequently than the other mask 6266 // registers, since it is not usable as a write mask. 6267 // FIXME: A more advanced approach would be to choose the best input mask 6268 // register based on context. 6269 case X86::KSET0W: 6270 return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); 6271 case X86::KSET0D: 6272 return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); 6273 case X86::KSET0Q: 6274 return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); 6275 case X86::KSET1W: 6276 return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); 6277 case X86::KSET1D: 6278 return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); 6279 case X86::KSET1Q: 6280 return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); 6281 case TargetOpcode::LOAD_STACK_GUARD: 6282 expandLoadStackGuard(MIB, *this); 6283 return true; 6284 case X86::XOR64_FP: 6285 case X86::XOR32_FP: 6286 return expandXorFP(MIB, *this); 6287 case X86::SHLDROT32ri: 6288 return expandSHXDROT(MIB, get(X86::SHLD32rri8)); 6289 case X86::SHLDROT64ri: 6290 return expandSHXDROT(MIB, get(X86::SHLD64rri8)); 6291 case X86::SHRDROT32ri: 6292 return expandSHXDROT(MIB, get(X86::SHRD32rri8)); 6293 case X86::SHRDROT64ri: 6294 return expandSHXDROT(MIB, get(X86::SHRD64rri8)); 6295 case X86::ADD8rr_DB: 6296 MIB->setDesc(get(X86::OR8rr)); 6297 break; 6298 case X86::ADD16rr_DB: 6299 MIB->setDesc(get(X86::OR16rr)); 6300 break; 6301 case X86::ADD32rr_DB: 6302 MIB->setDesc(get(X86::OR32rr)); 6303 break; 6304 case X86::ADD64rr_DB: 6305 MIB->setDesc(get(X86::OR64rr)); 6306 break; 6307 case X86::ADD8ri_DB: 6308 MIB->setDesc(get(X86::OR8ri)); 6309 break; 6310 case X86::ADD16ri_DB: 6311 MIB->setDesc(get(X86::OR16ri)); 6312 break; 6313 case X86::ADD32ri_DB: 6314 MIB->setDesc(get(X86::OR32ri)); 6315 break; 6316 case X86::ADD64ri32_DB: 6317 MIB->setDesc(get(X86::OR64ri32)); 6318 break; 6319 } 6320 return false; 6321 } 6322 6323 /// Return true for all instructions that only update 6324 /// the first 32 or 64-bits of the destination register and leave the rest 6325 /// unmodified. This can be used to avoid folding loads if the instructions 6326 /// only update part of the destination register, and the non-updated part is 6327 /// not needed. e.g. cvtss2sd, sqrtss. 
Unfolding the load from these 6328 /// instructions breaks the partial register dependency and it can improve 6329 /// performance. e.g.: 6330 /// 6331 /// movss (%rdi), %xmm0 6332 /// cvtss2sd %xmm0, %xmm0 6333 /// 6334 /// Instead of 6335 /// cvtss2sd (%rdi), %xmm0 6336 /// 6337 /// FIXME: This should be turned into a TSFlags. 6338 /// 6339 static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, 6340 bool ForLoadFold = false) { 6341 switch (Opcode) { 6342 case X86::CVTSI2SSrr: 6343 case X86::CVTSI2SSrm: 6344 case X86::CVTSI642SSrr: 6345 case X86::CVTSI642SSrm: 6346 case X86::CVTSI2SDrr: 6347 case X86::CVTSI2SDrm: 6348 case X86::CVTSI642SDrr: 6349 case X86::CVTSI642SDrm: 6350 // Load folding won't effect the undef register update since the input is 6351 // a GPR. 6352 return !ForLoadFold; 6353 case X86::CVTSD2SSrr: 6354 case X86::CVTSD2SSrm: 6355 case X86::CVTSS2SDrr: 6356 case X86::CVTSS2SDrm: 6357 case X86::MOVHPDrm: 6358 case X86::MOVHPSrm: 6359 case X86::MOVLPDrm: 6360 case X86::MOVLPSrm: 6361 case X86::RCPSSr: 6362 case X86::RCPSSm: 6363 case X86::RCPSSr_Int: 6364 case X86::RCPSSm_Int: 6365 case X86::ROUNDSDri: 6366 case X86::ROUNDSDmi: 6367 case X86::ROUNDSSri: 6368 case X86::ROUNDSSmi: 6369 case X86::RSQRTSSr: 6370 case X86::RSQRTSSm: 6371 case X86::RSQRTSSr_Int: 6372 case X86::RSQRTSSm_Int: 6373 case X86::SQRTSSr: 6374 case X86::SQRTSSm: 6375 case X86::SQRTSSr_Int: 6376 case X86::SQRTSSm_Int: 6377 case X86::SQRTSDr: 6378 case X86::SQRTSDm: 6379 case X86::SQRTSDr_Int: 6380 case X86::SQRTSDm_Int: 6381 return true; 6382 case X86::VFCMULCPHZ128rm: 6383 case X86::VFCMULCPHZ128rmb: 6384 case X86::VFCMULCPHZ128rmbkz: 6385 case X86::VFCMULCPHZ128rmkz: 6386 case X86::VFCMULCPHZ128rr: 6387 case X86::VFCMULCPHZ128rrkz: 6388 case X86::VFCMULCPHZ256rm: 6389 case X86::VFCMULCPHZ256rmb: 6390 case X86::VFCMULCPHZ256rmbkz: 6391 case X86::VFCMULCPHZ256rmkz: 6392 case X86::VFCMULCPHZ256rr: 6393 case X86::VFCMULCPHZ256rrkz: 6394 case X86::VFCMULCPHZrm: 6395 case X86::VFCMULCPHZrmb: 6396 case X86::VFCMULCPHZrmbkz: 6397 case X86::VFCMULCPHZrmkz: 6398 case X86::VFCMULCPHZrr: 6399 case X86::VFCMULCPHZrrb: 6400 case X86::VFCMULCPHZrrbkz: 6401 case X86::VFCMULCPHZrrkz: 6402 case X86::VFMULCPHZ128rm: 6403 case X86::VFMULCPHZ128rmb: 6404 case X86::VFMULCPHZ128rmbkz: 6405 case X86::VFMULCPHZ128rmkz: 6406 case X86::VFMULCPHZ128rr: 6407 case X86::VFMULCPHZ128rrkz: 6408 case X86::VFMULCPHZ256rm: 6409 case X86::VFMULCPHZ256rmb: 6410 case X86::VFMULCPHZ256rmbkz: 6411 case X86::VFMULCPHZ256rmkz: 6412 case X86::VFMULCPHZ256rr: 6413 case X86::VFMULCPHZ256rrkz: 6414 case X86::VFMULCPHZrm: 6415 case X86::VFMULCPHZrmb: 6416 case X86::VFMULCPHZrmbkz: 6417 case X86::VFMULCPHZrmkz: 6418 case X86::VFMULCPHZrr: 6419 case X86::VFMULCPHZrrb: 6420 case X86::VFMULCPHZrrbkz: 6421 case X86::VFMULCPHZrrkz: 6422 case X86::VFCMULCSHZrm: 6423 case X86::VFCMULCSHZrmkz: 6424 case X86::VFCMULCSHZrr: 6425 case X86::VFCMULCSHZrrb: 6426 case X86::VFCMULCSHZrrbkz: 6427 case X86::VFCMULCSHZrrkz: 6428 case X86::VFMULCSHZrm: 6429 case X86::VFMULCSHZrmkz: 6430 case X86::VFMULCSHZrr: 6431 case X86::VFMULCSHZrrb: 6432 case X86::VFMULCSHZrrbkz: 6433 case X86::VFMULCSHZrrkz: 6434 return Subtarget.hasMULCFalseDeps(); 6435 case X86::VPERMDYrm: 6436 case X86::VPERMDYrr: 6437 case X86::VPERMQYmi: 6438 case X86::VPERMQYri: 6439 case X86::VPERMPSYrm: 6440 case X86::VPERMPSYrr: 6441 case X86::VPERMPDYmi: 6442 case X86::VPERMPDYri: 6443 case X86::VPERMDZ256rm: 6444 case X86::VPERMDZ256rmb: 6445 case X86::VPERMDZ256rmbkz: 6446 case 
X86::VPERMDZ256rmkz: 6447 case X86::VPERMDZ256rr: 6448 case X86::VPERMDZ256rrkz: 6449 case X86::VPERMDZrm: 6450 case X86::VPERMDZrmb: 6451 case X86::VPERMDZrmbkz: 6452 case X86::VPERMDZrmkz: 6453 case X86::VPERMDZrr: 6454 case X86::VPERMDZrrkz: 6455 case X86::VPERMQZ256mbi: 6456 case X86::VPERMQZ256mbikz: 6457 case X86::VPERMQZ256mi: 6458 case X86::VPERMQZ256mikz: 6459 case X86::VPERMQZ256ri: 6460 case X86::VPERMQZ256rikz: 6461 case X86::VPERMQZ256rm: 6462 case X86::VPERMQZ256rmb: 6463 case X86::VPERMQZ256rmbkz: 6464 case X86::VPERMQZ256rmkz: 6465 case X86::VPERMQZ256rr: 6466 case X86::VPERMQZ256rrkz: 6467 case X86::VPERMQZmbi: 6468 case X86::VPERMQZmbikz: 6469 case X86::VPERMQZmi: 6470 case X86::VPERMQZmikz: 6471 case X86::VPERMQZri: 6472 case X86::VPERMQZrikz: 6473 case X86::VPERMQZrm: 6474 case X86::VPERMQZrmb: 6475 case X86::VPERMQZrmbkz: 6476 case X86::VPERMQZrmkz: 6477 case X86::VPERMQZrr: 6478 case X86::VPERMQZrrkz: 6479 case X86::VPERMPSZ256rm: 6480 case X86::VPERMPSZ256rmb: 6481 case X86::VPERMPSZ256rmbkz: 6482 case X86::VPERMPSZ256rmkz: 6483 case X86::VPERMPSZ256rr: 6484 case X86::VPERMPSZ256rrkz: 6485 case X86::VPERMPSZrm: 6486 case X86::VPERMPSZrmb: 6487 case X86::VPERMPSZrmbkz: 6488 case X86::VPERMPSZrmkz: 6489 case X86::VPERMPSZrr: 6490 case X86::VPERMPSZrrkz: 6491 case X86::VPERMPDZ256mbi: 6492 case X86::VPERMPDZ256mbikz: 6493 case X86::VPERMPDZ256mi: 6494 case X86::VPERMPDZ256mikz: 6495 case X86::VPERMPDZ256ri: 6496 case X86::VPERMPDZ256rikz: 6497 case X86::VPERMPDZ256rm: 6498 case X86::VPERMPDZ256rmb: 6499 case X86::VPERMPDZ256rmbkz: 6500 case X86::VPERMPDZ256rmkz: 6501 case X86::VPERMPDZ256rr: 6502 case X86::VPERMPDZ256rrkz: 6503 case X86::VPERMPDZmbi: 6504 case X86::VPERMPDZmbikz: 6505 case X86::VPERMPDZmi: 6506 case X86::VPERMPDZmikz: 6507 case X86::VPERMPDZri: 6508 case X86::VPERMPDZrikz: 6509 case X86::VPERMPDZrm: 6510 case X86::VPERMPDZrmb: 6511 case X86::VPERMPDZrmbkz: 6512 case X86::VPERMPDZrmkz: 6513 case X86::VPERMPDZrr: 6514 case X86::VPERMPDZrrkz: 6515 return Subtarget.hasPERMFalseDeps(); 6516 case X86::VRANGEPDZ128rmbi: 6517 case X86::VRANGEPDZ128rmbikz: 6518 case X86::VRANGEPDZ128rmi: 6519 case X86::VRANGEPDZ128rmikz: 6520 case X86::VRANGEPDZ128rri: 6521 case X86::VRANGEPDZ128rrikz: 6522 case X86::VRANGEPDZ256rmbi: 6523 case X86::VRANGEPDZ256rmbikz: 6524 case X86::VRANGEPDZ256rmi: 6525 case X86::VRANGEPDZ256rmikz: 6526 case X86::VRANGEPDZ256rri: 6527 case X86::VRANGEPDZ256rrikz: 6528 case X86::VRANGEPDZrmbi: 6529 case X86::VRANGEPDZrmbikz: 6530 case X86::VRANGEPDZrmi: 6531 case X86::VRANGEPDZrmikz: 6532 case X86::VRANGEPDZrri: 6533 case X86::VRANGEPDZrrib: 6534 case X86::VRANGEPDZrribkz: 6535 case X86::VRANGEPDZrrikz: 6536 case X86::VRANGEPSZ128rmbi: 6537 case X86::VRANGEPSZ128rmbikz: 6538 case X86::VRANGEPSZ128rmi: 6539 case X86::VRANGEPSZ128rmikz: 6540 case X86::VRANGEPSZ128rri: 6541 case X86::VRANGEPSZ128rrikz: 6542 case X86::VRANGEPSZ256rmbi: 6543 case X86::VRANGEPSZ256rmbikz: 6544 case X86::VRANGEPSZ256rmi: 6545 case X86::VRANGEPSZ256rmikz: 6546 case X86::VRANGEPSZ256rri: 6547 case X86::VRANGEPSZ256rrikz: 6548 case X86::VRANGEPSZrmbi: 6549 case X86::VRANGEPSZrmbikz: 6550 case X86::VRANGEPSZrmi: 6551 case X86::VRANGEPSZrmikz: 6552 case X86::VRANGEPSZrri: 6553 case X86::VRANGEPSZrrib: 6554 case X86::VRANGEPSZrribkz: 6555 case X86::VRANGEPSZrrikz: 6556 case X86::VRANGESDZrmi: 6557 case X86::VRANGESDZrmikz: 6558 case X86::VRANGESDZrri: 6559 case X86::VRANGESDZrrib: 6560 case X86::VRANGESDZrribkz: 6561 case X86::VRANGESDZrrikz: 6562 case X86::VRANGESSZrmi: 
6563 case X86::VRANGESSZrmikz: 6564 case X86::VRANGESSZrri: 6565 case X86::VRANGESSZrrib: 6566 case X86::VRANGESSZrribkz: 6567 case X86::VRANGESSZrrikz: 6568 return Subtarget.hasRANGEFalseDeps(); 6569 case X86::VGETMANTSSZrmi: 6570 case X86::VGETMANTSSZrmikz: 6571 case X86::VGETMANTSSZrri: 6572 case X86::VGETMANTSSZrrib: 6573 case X86::VGETMANTSSZrribkz: 6574 case X86::VGETMANTSSZrrikz: 6575 case X86::VGETMANTSDZrmi: 6576 case X86::VGETMANTSDZrmikz: 6577 case X86::VGETMANTSDZrri: 6578 case X86::VGETMANTSDZrrib: 6579 case X86::VGETMANTSDZrribkz: 6580 case X86::VGETMANTSDZrrikz: 6581 case X86::VGETMANTSHZrmi: 6582 case X86::VGETMANTSHZrmikz: 6583 case X86::VGETMANTSHZrri: 6584 case X86::VGETMANTSHZrrib: 6585 case X86::VGETMANTSHZrribkz: 6586 case X86::VGETMANTSHZrrikz: 6587 case X86::VGETMANTPSZ128rmbi: 6588 case X86::VGETMANTPSZ128rmbikz: 6589 case X86::VGETMANTPSZ128rmi: 6590 case X86::VGETMANTPSZ128rmikz: 6591 case X86::VGETMANTPSZ256rmbi: 6592 case X86::VGETMANTPSZ256rmbikz: 6593 case X86::VGETMANTPSZ256rmi: 6594 case X86::VGETMANTPSZ256rmikz: 6595 case X86::VGETMANTPSZrmbi: 6596 case X86::VGETMANTPSZrmbikz: 6597 case X86::VGETMANTPSZrmi: 6598 case X86::VGETMANTPSZrmikz: 6599 case X86::VGETMANTPDZ128rmbi: 6600 case X86::VGETMANTPDZ128rmbikz: 6601 case X86::VGETMANTPDZ128rmi: 6602 case X86::VGETMANTPDZ128rmikz: 6603 case X86::VGETMANTPDZ256rmbi: 6604 case X86::VGETMANTPDZ256rmbikz: 6605 case X86::VGETMANTPDZ256rmi: 6606 case X86::VGETMANTPDZ256rmikz: 6607 case X86::VGETMANTPDZrmbi: 6608 case X86::VGETMANTPDZrmbikz: 6609 case X86::VGETMANTPDZrmi: 6610 case X86::VGETMANTPDZrmikz: 6611 return Subtarget.hasGETMANTFalseDeps(); 6612 case X86::VPMULLQZ128rm: 6613 case X86::VPMULLQZ128rmb: 6614 case X86::VPMULLQZ128rmbkz: 6615 case X86::VPMULLQZ128rmkz: 6616 case X86::VPMULLQZ128rr: 6617 case X86::VPMULLQZ128rrkz: 6618 case X86::VPMULLQZ256rm: 6619 case X86::VPMULLQZ256rmb: 6620 case X86::VPMULLQZ256rmbkz: 6621 case X86::VPMULLQZ256rmkz: 6622 case X86::VPMULLQZ256rr: 6623 case X86::VPMULLQZ256rrkz: 6624 case X86::VPMULLQZrm: 6625 case X86::VPMULLQZrmb: 6626 case X86::VPMULLQZrmbkz: 6627 case X86::VPMULLQZrmkz: 6628 case X86::VPMULLQZrr: 6629 case X86::VPMULLQZrrkz: 6630 return Subtarget.hasMULLQFalseDeps(); 6631 // GPR 6632 case X86::POPCNT32rm: 6633 case X86::POPCNT32rr: 6634 case X86::POPCNT64rm: 6635 case X86::POPCNT64rr: 6636 return Subtarget.hasPOPCNTFalseDeps(); 6637 case X86::LZCNT32rm: 6638 case X86::LZCNT32rr: 6639 case X86::LZCNT64rm: 6640 case X86::LZCNT64rr: 6641 case X86::TZCNT32rm: 6642 case X86::TZCNT32rr: 6643 case X86::TZCNT64rm: 6644 case X86::TZCNT64rr: 6645 return Subtarget.hasLZCNTFalseDeps(); 6646 } 6647 6648 return false; 6649 } 6650 6651 /// Inform the BreakFalseDeps pass how many idle 6652 /// instructions we would like before a partial register update. 6653 unsigned X86InstrInfo::getPartialRegUpdateClearance( 6654 const MachineInstr &MI, unsigned OpNum, 6655 const TargetRegisterInfo *TRI) const { 6656 if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget)) 6657 return 0; 6658 6659 // If MI is marked as reading Reg, the partial register update is wanted. 
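  // Illustrative example (added comment): SQRTSSr writes only the low 32
  // bits of its xmm destination, so
  //   sqrtss %xmm1, %xmm0
  // carries a false dependency on the previous value of %xmm0. Returning the
  // clearance below asks BreakFalseDeps to insert a dependency-breaking
  //   xorps %xmm0, %xmm0
  // (see breakPartialRegDependency further down) when the last write to
  // %xmm0 is not recent enough.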
  const MachineOperand &MO = MI.getOperand(0);
  Register Reg = MO.getReg();
  if (Reg.isVirtual()) {
    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
      return 0;
  } else {
    if (MI.readsRegister(Reg, TRI))
      return 0;
  }

  // If any instructions in the clearance range are reading Reg, insert a
  // dependency-breaking instruction, which is inexpensive and is likely to
  // be hidden in other instructions' cycles.
  return PartialRegUpdateClearance;
}

// Return true for any instruction that copies the high bits of the first
// source operand into the unused high bits of the destination operand.
// Also returns true for instructions that have two inputs where one may
// be undef and we want it to use the same register as the other input.
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
                              bool ForLoadFold = false) {
  // Set the OpNum parameter to the first source operand.
  switch (Opcode) {
  case X86::MMX_PUNPCKHBWrr:
  case X86::MMX_PUNPCKHWDrr:
  case X86::MMX_PUNPCKHDQrr:
  case X86::MMX_PUNPCKLBWrr:
  case X86::MMX_PUNPCKLWDrr:
  case X86::MMX_PUNPCKLDQrr:
  case X86::MOVHLPSrr:
  case X86::PACKSSWBrr:
  case X86::PACKUSWBrr:
  case X86::PACKSSDWrr:
  case X86::PACKUSDWrr:
  case X86::PUNPCKHBWrr:
  case X86::PUNPCKLBWrr:
  case X86::PUNPCKHWDrr:
  case X86::PUNPCKLWDrr:
  case X86::PUNPCKHDQrr:
  case X86::PUNPCKLDQrr:
  case X86::PUNPCKHQDQrr:
  case X86::PUNPCKLQDQrr:
  case X86::SHUFPDrri:
  case X86::SHUFPSrri:
    // These instructions are sometimes used with an undef first or second
    // source. Return true here so BreakFalseDeps will assign this source to
    // the same register as the first source to avoid a false dependency.
    // Operand 1 of these instructions is tied so they're separate from their
    // VEX counterparts.
6710 return OpNum == 2 && !ForLoadFold; 6711 6712 case X86::VMOVLHPSrr: 6713 case X86::VMOVLHPSZrr: 6714 case X86::VPACKSSWBrr: 6715 case X86::VPACKUSWBrr: 6716 case X86::VPACKSSDWrr: 6717 case X86::VPACKUSDWrr: 6718 case X86::VPACKSSWBZ128rr: 6719 case X86::VPACKUSWBZ128rr: 6720 case X86::VPACKSSDWZ128rr: 6721 case X86::VPACKUSDWZ128rr: 6722 case X86::VPERM2F128rr: 6723 case X86::VPERM2I128rr: 6724 case X86::VSHUFF32X4Z256rri: 6725 case X86::VSHUFF32X4Zrri: 6726 case X86::VSHUFF64X2Z256rri: 6727 case X86::VSHUFF64X2Zrri: 6728 case X86::VSHUFI32X4Z256rri: 6729 case X86::VSHUFI32X4Zrri: 6730 case X86::VSHUFI64X2Z256rri: 6731 case X86::VSHUFI64X2Zrri: 6732 case X86::VPUNPCKHBWrr: 6733 case X86::VPUNPCKLBWrr: 6734 case X86::VPUNPCKHBWYrr: 6735 case X86::VPUNPCKLBWYrr: 6736 case X86::VPUNPCKHBWZ128rr: 6737 case X86::VPUNPCKLBWZ128rr: 6738 case X86::VPUNPCKHBWZ256rr: 6739 case X86::VPUNPCKLBWZ256rr: 6740 case X86::VPUNPCKHBWZrr: 6741 case X86::VPUNPCKLBWZrr: 6742 case X86::VPUNPCKHWDrr: 6743 case X86::VPUNPCKLWDrr: 6744 case X86::VPUNPCKHWDYrr: 6745 case X86::VPUNPCKLWDYrr: 6746 case X86::VPUNPCKHWDZ128rr: 6747 case X86::VPUNPCKLWDZ128rr: 6748 case X86::VPUNPCKHWDZ256rr: 6749 case X86::VPUNPCKLWDZ256rr: 6750 case X86::VPUNPCKHWDZrr: 6751 case X86::VPUNPCKLWDZrr: 6752 case X86::VPUNPCKHDQrr: 6753 case X86::VPUNPCKLDQrr: 6754 case X86::VPUNPCKHDQYrr: 6755 case X86::VPUNPCKLDQYrr: 6756 case X86::VPUNPCKHDQZ128rr: 6757 case X86::VPUNPCKLDQZ128rr: 6758 case X86::VPUNPCKHDQZ256rr: 6759 case X86::VPUNPCKLDQZ256rr: 6760 case X86::VPUNPCKHDQZrr: 6761 case X86::VPUNPCKLDQZrr: 6762 case X86::VPUNPCKHQDQrr: 6763 case X86::VPUNPCKLQDQrr: 6764 case X86::VPUNPCKHQDQYrr: 6765 case X86::VPUNPCKLQDQYrr: 6766 case X86::VPUNPCKHQDQZ128rr: 6767 case X86::VPUNPCKLQDQZ128rr: 6768 case X86::VPUNPCKHQDQZ256rr: 6769 case X86::VPUNPCKLQDQZ256rr: 6770 case X86::VPUNPCKHQDQZrr: 6771 case X86::VPUNPCKLQDQZrr: 6772 // These instructions are sometimes used with an undef first or second 6773 // source. Return true here so BreakFalseDeps will assign this source to the 6774 // same register as the first source to avoid a false dependency. 
6775 return (OpNum == 1 || OpNum == 2) && !ForLoadFold; 6776 6777 case X86::VCVTSI2SSrr: 6778 case X86::VCVTSI2SSrm: 6779 case X86::VCVTSI2SSrr_Int: 6780 case X86::VCVTSI2SSrm_Int: 6781 case X86::VCVTSI642SSrr: 6782 case X86::VCVTSI642SSrm: 6783 case X86::VCVTSI642SSrr_Int: 6784 case X86::VCVTSI642SSrm_Int: 6785 case X86::VCVTSI2SDrr: 6786 case X86::VCVTSI2SDrm: 6787 case X86::VCVTSI2SDrr_Int: 6788 case X86::VCVTSI2SDrm_Int: 6789 case X86::VCVTSI642SDrr: 6790 case X86::VCVTSI642SDrm: 6791 case X86::VCVTSI642SDrr_Int: 6792 case X86::VCVTSI642SDrm_Int: 6793 // AVX-512 6794 case X86::VCVTSI2SSZrr: 6795 case X86::VCVTSI2SSZrm: 6796 case X86::VCVTSI2SSZrr_Int: 6797 case X86::VCVTSI2SSZrrb_Int: 6798 case X86::VCVTSI2SSZrm_Int: 6799 case X86::VCVTSI642SSZrr: 6800 case X86::VCVTSI642SSZrm: 6801 case X86::VCVTSI642SSZrr_Int: 6802 case X86::VCVTSI642SSZrrb_Int: 6803 case X86::VCVTSI642SSZrm_Int: 6804 case X86::VCVTSI2SDZrr: 6805 case X86::VCVTSI2SDZrm: 6806 case X86::VCVTSI2SDZrr_Int: 6807 case X86::VCVTSI2SDZrm_Int: 6808 case X86::VCVTSI642SDZrr: 6809 case X86::VCVTSI642SDZrm: 6810 case X86::VCVTSI642SDZrr_Int: 6811 case X86::VCVTSI642SDZrrb_Int: 6812 case X86::VCVTSI642SDZrm_Int: 6813 case X86::VCVTUSI2SSZrr: 6814 case X86::VCVTUSI2SSZrm: 6815 case X86::VCVTUSI2SSZrr_Int: 6816 case X86::VCVTUSI2SSZrrb_Int: 6817 case X86::VCVTUSI2SSZrm_Int: 6818 case X86::VCVTUSI642SSZrr: 6819 case X86::VCVTUSI642SSZrm: 6820 case X86::VCVTUSI642SSZrr_Int: 6821 case X86::VCVTUSI642SSZrrb_Int: 6822 case X86::VCVTUSI642SSZrm_Int: 6823 case X86::VCVTUSI2SDZrr: 6824 case X86::VCVTUSI2SDZrm: 6825 case X86::VCVTUSI2SDZrr_Int: 6826 case X86::VCVTUSI2SDZrm_Int: 6827 case X86::VCVTUSI642SDZrr: 6828 case X86::VCVTUSI642SDZrm: 6829 case X86::VCVTUSI642SDZrr_Int: 6830 case X86::VCVTUSI642SDZrrb_Int: 6831 case X86::VCVTUSI642SDZrm_Int: 6832 case X86::VCVTSI2SHZrr: 6833 case X86::VCVTSI2SHZrm: 6834 case X86::VCVTSI2SHZrr_Int: 6835 case X86::VCVTSI2SHZrrb_Int: 6836 case X86::VCVTSI2SHZrm_Int: 6837 case X86::VCVTSI642SHZrr: 6838 case X86::VCVTSI642SHZrm: 6839 case X86::VCVTSI642SHZrr_Int: 6840 case X86::VCVTSI642SHZrrb_Int: 6841 case X86::VCVTSI642SHZrm_Int: 6842 case X86::VCVTUSI2SHZrr: 6843 case X86::VCVTUSI2SHZrm: 6844 case X86::VCVTUSI2SHZrr_Int: 6845 case X86::VCVTUSI2SHZrrb_Int: 6846 case X86::VCVTUSI2SHZrm_Int: 6847 case X86::VCVTUSI642SHZrr: 6848 case X86::VCVTUSI642SHZrm: 6849 case X86::VCVTUSI642SHZrr_Int: 6850 case X86::VCVTUSI642SHZrrb_Int: 6851 case X86::VCVTUSI642SHZrm_Int: 6852 // Load folding won't effect the undef register update since the input is 6853 // a GPR. 
6854 return OpNum == 1 && !ForLoadFold; 6855 case X86::VCVTSD2SSrr: 6856 case X86::VCVTSD2SSrm: 6857 case X86::VCVTSD2SSrr_Int: 6858 case X86::VCVTSD2SSrm_Int: 6859 case X86::VCVTSS2SDrr: 6860 case X86::VCVTSS2SDrm: 6861 case X86::VCVTSS2SDrr_Int: 6862 case X86::VCVTSS2SDrm_Int: 6863 case X86::VRCPSSr: 6864 case X86::VRCPSSr_Int: 6865 case X86::VRCPSSm: 6866 case X86::VRCPSSm_Int: 6867 case X86::VROUNDSDri: 6868 case X86::VROUNDSDmi: 6869 case X86::VROUNDSDri_Int: 6870 case X86::VROUNDSDmi_Int: 6871 case X86::VROUNDSSri: 6872 case X86::VROUNDSSmi: 6873 case X86::VROUNDSSri_Int: 6874 case X86::VROUNDSSmi_Int: 6875 case X86::VRSQRTSSr: 6876 case X86::VRSQRTSSr_Int: 6877 case X86::VRSQRTSSm: 6878 case X86::VRSQRTSSm_Int: 6879 case X86::VSQRTSSr: 6880 case X86::VSQRTSSr_Int: 6881 case X86::VSQRTSSm: 6882 case X86::VSQRTSSm_Int: 6883 case X86::VSQRTSDr: 6884 case X86::VSQRTSDr_Int: 6885 case X86::VSQRTSDm: 6886 case X86::VSQRTSDm_Int: 6887 // AVX-512 6888 case X86::VCVTSD2SSZrr: 6889 case X86::VCVTSD2SSZrr_Int: 6890 case X86::VCVTSD2SSZrrb_Int: 6891 case X86::VCVTSD2SSZrm: 6892 case X86::VCVTSD2SSZrm_Int: 6893 case X86::VCVTSS2SDZrr: 6894 case X86::VCVTSS2SDZrr_Int: 6895 case X86::VCVTSS2SDZrrb_Int: 6896 case X86::VCVTSS2SDZrm: 6897 case X86::VCVTSS2SDZrm_Int: 6898 case X86::VGETEXPSDZr: 6899 case X86::VGETEXPSDZrb: 6900 case X86::VGETEXPSDZm: 6901 case X86::VGETEXPSSZr: 6902 case X86::VGETEXPSSZrb: 6903 case X86::VGETEXPSSZm: 6904 case X86::VGETMANTSDZrri: 6905 case X86::VGETMANTSDZrrib: 6906 case X86::VGETMANTSDZrmi: 6907 case X86::VGETMANTSSZrri: 6908 case X86::VGETMANTSSZrrib: 6909 case X86::VGETMANTSSZrmi: 6910 case X86::VRNDSCALESDZr: 6911 case X86::VRNDSCALESDZr_Int: 6912 case X86::VRNDSCALESDZrb_Int: 6913 case X86::VRNDSCALESDZm: 6914 case X86::VRNDSCALESDZm_Int: 6915 case X86::VRNDSCALESSZr: 6916 case X86::VRNDSCALESSZr_Int: 6917 case X86::VRNDSCALESSZrb_Int: 6918 case X86::VRNDSCALESSZm: 6919 case X86::VRNDSCALESSZm_Int: 6920 case X86::VRCP14SDZrr: 6921 case X86::VRCP14SDZrm: 6922 case X86::VRCP14SSZrr: 6923 case X86::VRCP14SSZrm: 6924 case X86::VRCPSHZrr: 6925 case X86::VRCPSHZrm: 6926 case X86::VRSQRTSHZrr: 6927 case X86::VRSQRTSHZrm: 6928 case X86::VREDUCESHZrmi: 6929 case X86::VREDUCESHZrri: 6930 case X86::VREDUCESHZrrib: 6931 case X86::VGETEXPSHZr: 6932 case X86::VGETEXPSHZrb: 6933 case X86::VGETEXPSHZm: 6934 case X86::VGETMANTSHZrri: 6935 case X86::VGETMANTSHZrrib: 6936 case X86::VGETMANTSHZrmi: 6937 case X86::VRNDSCALESHZr: 6938 case X86::VRNDSCALESHZr_Int: 6939 case X86::VRNDSCALESHZrb_Int: 6940 case X86::VRNDSCALESHZm: 6941 case X86::VRNDSCALESHZm_Int: 6942 case X86::VSQRTSHZr: 6943 case X86::VSQRTSHZr_Int: 6944 case X86::VSQRTSHZrb_Int: 6945 case X86::VSQRTSHZm: 6946 case X86::VSQRTSHZm_Int: 6947 case X86::VRCP28SDZr: 6948 case X86::VRCP28SDZrb: 6949 case X86::VRCP28SDZm: 6950 case X86::VRCP28SSZr: 6951 case X86::VRCP28SSZrb: 6952 case X86::VRCP28SSZm: 6953 case X86::VREDUCESSZrmi: 6954 case X86::VREDUCESSZrri: 6955 case X86::VREDUCESSZrrib: 6956 case X86::VRSQRT14SDZrr: 6957 case X86::VRSQRT14SDZrm: 6958 case X86::VRSQRT14SSZrr: 6959 case X86::VRSQRT14SSZrm: 6960 case X86::VRSQRT28SDZr: 6961 case X86::VRSQRT28SDZrb: 6962 case X86::VRSQRT28SDZm: 6963 case X86::VRSQRT28SSZr: 6964 case X86::VRSQRT28SSZrb: 6965 case X86::VRSQRT28SSZm: 6966 case X86::VSQRTSSZr: 6967 case X86::VSQRTSSZr_Int: 6968 case X86::VSQRTSSZrb_Int: 6969 case X86::VSQRTSSZm: 6970 case X86::VSQRTSSZm_Int: 6971 case X86::VSQRTSDZr: 6972 case X86::VSQRTSDZr_Int: 6973 case X86::VSQRTSDZrb_Int: 6974 case 
X86::VSQRTSDZm: 6975 case X86::VSQRTSDZm_Int: 6976 case X86::VCVTSD2SHZrr: 6977 case X86::VCVTSD2SHZrr_Int: 6978 case X86::VCVTSD2SHZrrb_Int: 6979 case X86::VCVTSD2SHZrm: 6980 case X86::VCVTSD2SHZrm_Int: 6981 case X86::VCVTSS2SHZrr: 6982 case X86::VCVTSS2SHZrr_Int: 6983 case X86::VCVTSS2SHZrrb_Int: 6984 case X86::VCVTSS2SHZrm: 6985 case X86::VCVTSS2SHZrm_Int: 6986 case X86::VCVTSH2SDZrr: 6987 case X86::VCVTSH2SDZrr_Int: 6988 case X86::VCVTSH2SDZrrb_Int: 6989 case X86::VCVTSH2SDZrm: 6990 case X86::VCVTSH2SDZrm_Int: 6991 case X86::VCVTSH2SSZrr: 6992 case X86::VCVTSH2SSZrr_Int: 6993 case X86::VCVTSH2SSZrrb_Int: 6994 case X86::VCVTSH2SSZrm: 6995 case X86::VCVTSH2SSZrm_Int: 6996 return OpNum == 1; 6997 case X86::VMOVSSZrrk: 6998 case X86::VMOVSDZrrk: 6999 return OpNum == 3 && !ForLoadFold; 7000 case X86::VMOVSSZrrkz: 7001 case X86::VMOVSDZrrkz: 7002 return OpNum == 2 && !ForLoadFold; 7003 } 7004 7005 return false; 7006 } 7007 7008 /// Inform the BreakFalseDeps pass how many idle instructions we would like 7009 /// before certain undef register reads. 7010 /// 7011 /// This catches the VCVTSI2SD family of instructions: 7012 /// 7013 /// vcvtsi2sdq %rax, undef %xmm0, %xmm14 7014 /// 7015 /// We should to be careful *not* to catch VXOR idioms which are presumably 7016 /// handled specially in the pipeline: 7017 /// 7018 /// vxorps undef %xmm1, undef %xmm1, %xmm1 7019 /// 7020 /// Like getPartialRegUpdateClearance, this makes a strong assumption that the 7021 /// high bits that are passed-through are not live. 7022 unsigned 7023 X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, 7024 const TargetRegisterInfo *TRI) const { 7025 const MachineOperand &MO = MI.getOperand(OpNum); 7026 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum)) 7027 return UndefRegClearance; 7028 7029 return 0; 7030 } 7031 7032 void X86InstrInfo::breakPartialRegDependency( 7033 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { 7034 Register Reg = MI.getOperand(OpNum).getReg(); 7035 // If MI kills this register, the false dependence is already broken. 7036 if (MI.killsRegister(Reg, TRI)) 7037 return; 7038 7039 if (X86::VR128RegClass.contains(Reg)) { 7040 // These instructions are all floating point domain, so xorps is the best 7041 // choice. 7042 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; 7043 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) 7044 .addReg(Reg, RegState::Undef) 7045 .addReg(Reg, RegState::Undef); 7046 MI.addRegisterKilled(Reg, TRI, true); 7047 } else if (X86::VR256RegClass.contains(Reg)) { 7048 // Use vxorps to clear the full ymm register. 7049 // It wants to read and write the xmm sub-register. 7050 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); 7051 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) 7052 .addReg(XReg, RegState::Undef) 7053 .addReg(XReg, RegState::Undef) 7054 .addReg(Reg, RegState::ImplicitDefine); 7055 MI.addRegisterKilled(Reg, TRI, true); 7056 } else if (X86::VR128XRegClass.contains(Reg)) { 7057 // Only handle VLX targets. 7058 if (!Subtarget.hasVLX()) 7059 return; 7060 // Since vxorps requires AVX512DQ, vpxord should be the best choice. 7061 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg) 7062 .addReg(Reg, RegState::Undef) 7063 .addReg(Reg, RegState::Undef); 7064 MI.addRegisterKilled(Reg, TRI, true); 7065 } else if (X86::VR256XRegClass.contains(Reg) || 7066 X86::VR512RegClass.contains(Reg)) { 7067 // Only handle VLX targets. 
7068 if (!Subtarget.hasVLX())
7069 return;
7070 // Use vpxord to clear the full ymm/zmm register.
7071 // It wants to read and write the xmm sub-register.
7072 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7073 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7074 .addReg(XReg, RegState::Undef)
7075 .addReg(XReg, RegState::Undef)
7076 .addReg(Reg, RegState::ImplicitDefine);
7077 MI.addRegisterKilled(Reg, TRI, true);
7078 } else if (X86::GR64RegClass.contains(Reg)) {
7079 // Using XOR32rr because it has a shorter encoding and zeroes the upper bits
7080 // as well.
7081 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7082 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7083 .addReg(XReg, RegState::Undef)
7084 .addReg(XReg, RegState::Undef)
7085 .addReg(Reg, RegState::ImplicitDefine);
7086 MI.addRegisterKilled(Reg, TRI, true);
7087 } else if (X86::GR32RegClass.contains(Reg)) {
7088 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7089 .addReg(Reg, RegState::Undef)
7090 .addReg(Reg, RegState::Undef);
7091 MI.addRegisterKilled(Reg, TRI, true);
7092 }
7093 }
7094
7095 static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
7096 int PtrOffset = 0) {
7097 unsigned NumAddrOps = MOs.size();
7098
7099 if (NumAddrOps < 4) {
7100 // FrameIndex only - add an immediate offset (whether it's zero or not).
7101 for (unsigned i = 0; i != NumAddrOps; ++i)
7102 MIB.add(MOs[i]);
7103 addOffset(MIB, PtrOffset);
7104 } else {
7105 // General Memory Addressing - we need to add any offset to an existing
7106 // offset.
7107 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7108 for (unsigned i = 0; i != NumAddrOps; ++i) {
7109 const MachineOperand &MO = MOs[i];
7110 if (i == 3 && PtrOffset != 0) {
7111 MIB.addDisp(MO, PtrOffset);
7112 } else {
7113 MIB.add(MO);
7114 }
7115 }
7116 }
7117 }
7118
7119 static void updateOperandRegConstraints(MachineFunction &MF,
7120 MachineInstr &NewMI,
7121 const TargetInstrInfo &TII) {
7122 MachineRegisterInfo &MRI = MF.getRegInfo();
7123 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
7124
7125 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7126 MachineOperand &MO = NewMI.getOperand(Idx);
7127 // We only need to update constraints on virtual register operands.
7128 if (!MO.isReg())
7129 continue;
7130 Register Reg = MO.getReg();
7131 if (!Reg.isVirtual())
7132 continue;
7133
7134 auto *NewRC = MRI.constrainRegClass(
7135 Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
7136 if (!NewRC) {
7137 LLVM_DEBUG(
7138 dbgs() << "WARNING: Unable to update register constraint for operand "
7139 << Idx << " of instruction:\n";
7140 NewMI.dump(); dbgs() << "\n");
7141 }
7142 }
7143 }
7144
7145 static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7146 ArrayRef<MachineOperand> MOs,
7147 MachineBasicBlock::iterator InsertPt,
7148 MachineInstr &MI,
7149 const TargetInstrInfo &TII) {
7150 // Create the base instruction with the memory operand as the first part.
7151 // Omit the implicit operands, something BuildMI can't do.
7152 MachineInstr *NewMI =
7153 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7154 MachineInstrBuilder MIB(MF, NewMI);
7155 addOperands(MIB, MOs);
7156
7157 // Loop over the rest of the ri operands, converting them over.
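// Illustrative sketch (operand details simplified): a two-address RMW such as
//   %eax = ADD32rr %eax(tied), %ecx
// is rebuilt as its memory RMW form, with the tied def/use pair replaced by
// the address operands added above:
//   ADD32mr <addr operands>, %ecx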
7158 unsigned NumOps = MI.getDesc().getNumOperands() - 2; 7159 for (unsigned i = 0; i != NumOps; ++i) { 7160 MachineOperand &MO = MI.getOperand(i + 2); 7161 MIB.add(MO); 7162 } 7163 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2)) 7164 MIB.add(MO); 7165 7166 updateOperandRegConstraints(MF, *NewMI, TII); 7167 7168 MachineBasicBlock *MBB = InsertPt->getParent(); 7169 MBB->insert(InsertPt, NewMI); 7170 7171 return MIB; 7172 } 7173 7174 static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode, 7175 unsigned OpNo, ArrayRef<MachineOperand> MOs, 7176 MachineBasicBlock::iterator InsertPt, 7177 MachineInstr &MI, const TargetInstrInfo &TII, 7178 int PtrOffset = 0) { 7179 // Omit the implicit operands, something BuildMI can't do. 7180 MachineInstr *NewMI = 7181 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); 7182 MachineInstrBuilder MIB(MF, NewMI); 7183 7184 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 7185 MachineOperand &MO = MI.getOperand(i); 7186 if (i == OpNo) { 7187 assert(MO.isReg() && "Expected to fold into reg operand!"); 7188 addOperands(MIB, MOs, PtrOffset); 7189 } else { 7190 MIB.add(MO); 7191 } 7192 } 7193 7194 updateOperandRegConstraints(MF, *NewMI, TII); 7195 7196 // Copy the NoFPExcept flag from the instruction we're fusing. 7197 if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) 7198 NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept); 7199 7200 MachineBasicBlock *MBB = InsertPt->getParent(); 7201 MBB->insert(InsertPt, NewMI); 7202 7203 return MIB; 7204 } 7205 7206 static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, 7207 ArrayRef<MachineOperand> MOs, 7208 MachineBasicBlock::iterator InsertPt, 7209 MachineInstr &MI) { 7210 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, 7211 MI.getDebugLoc(), TII.get(Opcode)); 7212 addOperands(MIB, MOs); 7213 return MIB.addImm(0); 7214 } 7215 7216 MachineInstr *X86InstrInfo::foldMemoryOperandCustom( 7217 MachineFunction &MF, MachineInstr &MI, unsigned OpNum, 7218 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, 7219 unsigned Size, Align Alignment) const { 7220 switch (MI.getOpcode()) { 7221 case X86::INSERTPSrr: 7222 case X86::VINSERTPSrr: 7223 case X86::VINSERTPSZrr: 7224 // Attempt to convert the load of inserted vector into a fold load 7225 // of a single float. 7226 if (OpNum == 2) { 7227 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); 7228 unsigned ZMask = Imm & 15; 7229 unsigned DstIdx = (Imm >> 4) & 3; 7230 unsigned SrcIdx = (Imm >> 6) & 3; 7231 7232 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 7233 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); 7234 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; 7235 if ((Size == 0 || Size >= 16) && RCSize >= 16 && 7236 (MI.getOpcode() != X86::INSERTPSrr || Alignment >= Align(4))) { 7237 int PtrOffset = SrcIdx * 4; 7238 unsigned NewImm = (DstIdx << 4) | ZMask; 7239 unsigned NewOpCode = 7240 (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm 7241 : (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm 7242 : X86::INSERTPSrm; 7243 MachineInstr *NewMI = 7244 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); 7245 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); 7246 return NewMI; 7247 } 7248 } 7249 break; 7250 case X86::MOVHLPSrr: 7251 case X86::VMOVHLPSrr: 7252 case X86::VMOVHLPSZrr: 7253 // Move the upper 64-bits of the second operand to the lower 64-bits. 
7254 // To fold the load, adjust the pointer to the upper half and use (V)MOVLPS.
7255 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7256 if (OpNum == 2) {
7257 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7258 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7259 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7260 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7261 unsigned NewOpCode =
7262 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7263 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7264 : X86::MOVLPSrm;
7265 MachineInstr *NewMI =
7266 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7267 return NewMI;
7268 }
7269 }
7270 break;
7271 case X86::UNPCKLPDrr:
7272 // If we won't be able to fold this to the memory form of UNPCKL, use
7273 // MOVHPD instead. Done as custom because we can't have this in the load
7274 // table twice.
7275 if (OpNum == 2) {
7276 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7277 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7278 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7279 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7280 MachineInstr *NewMI =
7281 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7282 return NewMI;
7283 }
7284 }
7285 break;
7286 case X86::MOV32r0:
7287 if (auto *NewMI =
7288 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7289 InsertPt, MI))
7290 return NewMI;
7291 break;
7292 }
7293
7294 return nullptr;
7295 }
7296
7297 static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
7298 MachineInstr &MI) {
7299 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7300 !MI.getOperand(1).isReg())
7301 return false;
7302
7303 // There are two cases we need to handle depending on where in the pipeline
7304 // the folding attempt is being made.
7305 // -Register has the undef flag set.
7306 // -Register is produced by the IMPLICIT_DEF instruction.
7307
7308 if (MI.getOperand(1).isUndef())
7309 return true;
7310
7311 MachineRegisterInfo &RegInfo = MF.getRegInfo();
7312 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7313 return VRegDef && VRegDef->isImplicitDef();
7314 }
7315
7316 unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7317 unsigned Idx1) const {
7318 unsigned Idx2 = CommuteAnyOperandIndex;
7319 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7320 return Idx1;
7321
7322 bool HasDef = MI.getDesc().getNumDefs();
7323 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7324 Register Reg1 = MI.getOperand(Idx1).getReg();
7325 Register Reg2 = MI.getOperand(Idx2).getReg();
7326 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7327 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7328
7329 // If either of the commutable operands is tied to the destination
7330 // then we cannot commute and fold.
7331 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7332 return Idx1;
7333
7334 return commuteInstruction(MI, false, Idx1, Idx2) ?
Idx2 : Idx1; 7335 } 7336 7337 static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) { 7338 if (PrintFailedFusing && !MI.isCopy()) 7339 dbgs() << "We failed to fuse operand " << Idx << " in " << MI; 7340 } 7341 7342 MachineInstr *X86InstrInfo::foldMemoryOperandImpl( 7343 MachineFunction &MF, MachineInstr &MI, unsigned OpNum, 7344 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, 7345 unsigned Size, Align Alignment, bool AllowCommute) const { 7346 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); 7347 unsigned Opc = MI.getOpcode(); 7348 7349 // For CPUs that favor the register form of a call or push, 7350 // do not fold loads into calls or pushes, unless optimizing for size 7351 // aggressively. 7352 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() && 7353 (Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r || 7354 Opc == X86::PUSH32r || Opc == X86::PUSH64r)) 7355 return nullptr; 7356 7357 // Avoid partial and undef register update stalls unless optimizing for size. 7358 if (!MF.getFunction().hasOptSize() && 7359 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) || 7360 shouldPreventUndefRegUpdateMemFold(MF, MI))) 7361 return nullptr; 7362 7363 unsigned NumOps = MI.getDesc().getNumOperands(); 7364 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() && 7365 MI.getOperand(1).isReg() && 7366 MI.getOperand(0).getReg() == MI.getOperand(1).getReg(); 7367 7368 // FIXME: AsmPrinter doesn't know how to handle 7369 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 7370 if (Opc == X86::ADD32ri && 7371 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) 7372 return nullptr; 7373 7374 // GOTTPOFF relocation loads can only be folded into add instructions. 7375 // FIXME: Need to exclude other relocations that only support specific 7376 // instructions. 7377 if (MOs.size() == X86::AddrNumOperands && 7378 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF && 7379 Opc != X86::ADD64rr) 7380 return nullptr; 7381 7382 // Don't fold loads into indirect calls that need a KCFI check as we'll 7383 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway. 7384 if (MI.isCall() && MI.getCFIType()) 7385 return nullptr; 7386 7387 // Attempt to fold any custom cases we have. 7388 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, 7389 Size, Alignment)) 7390 return CustomMI; 7391 7392 // Folding a memory location into the two-address part of a two-address 7393 // instruction is different than folding it other places. It requires 7394 // replacing the *two* registers with the memory location. 7395 // 7396 // Utilize the mapping NonNDD -> RMW for the NDD variant. 7397 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U; 7398 const X86FoldTableEntry *I = 7399 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc) 7400 : lookupFoldTable(Opc, OpNum); 7401 7402 MachineInstr *NewMI = nullptr; 7403 if (I) { 7404 unsigned Opcode = I->DstOp; 7405 if (Alignment < 7406 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT))) 7407 return nullptr; 7408 bool NarrowToMOV32rm = false; 7409 if (Size) { 7410 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 7411 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); 7412 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; 7413 // Check if it's safe to fold the load. If the size of the object is 7414 // narrower than the load width, then it's not. 
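// (For example, reloading a 16-byte vector register from a frame object that
// is only 4 bytes wide would read past the end of the object.)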
7415 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7416 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7417 // If this is a 64-bit load, but the spill slot is 32 bits, then we can do
7418 // a 32-bit load which is implicitly zero-extended. This likely is
7419 // due to live interval analysis remat'ing a load from a stack slot.
7420 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7421 return nullptr;
7422 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7423 return nullptr;
7424 Opcode = X86::MOV32rm;
7425 NarrowToMOV32rm = true;
7426 }
7427 // For stores, make sure the size of the object is equal to the size of
7428 // the store. If the object is larger, the extra bits would be garbage. If
7429 // the object is smaller, we might overwrite another object or fault.
7430 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7431 return nullptr;
7432 }
7433
7434 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7435 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7436
7437 if (NarrowToMOV32rm) {
7438 // This is the special case where we use a MOV32rm to load a 32-bit
7439 // value and zero-extend the top bits. Change the destination register
7440 // to a 32-bit one.
7441 Register DstReg = NewMI->getOperand(0).getReg();
7442 if (DstReg.isPhysical())
7443 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7444 else
7445 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7446 }
7447 return NewMI;
7448 }
7449
7450 if (AllowCommute) {
7451 // If the instruction and target operand are commutable, commute the
7452 // instruction and try again.
7453 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7454 if (CommuteOpIdx2 == OpNum) {
7455 printFailMsgforFold(MI, OpNum);
7456 return nullptr;
7457 }
7458 // Attempt to fold with the commuted version of the instruction.
7459 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7460 Alignment, /*AllowCommute=*/false);
7461 if (NewMI)
7462 return NewMI;
7463 // Folding failed again - undo the commute before returning.
7464 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7465 }
7466
7467 printFailMsgforFold(MI, OpNum);
7468 return nullptr;
7469 }
7470
7471 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7472 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7473 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7474 VirtRegMap *VRM) const {
7475 // Check switch flag
7476 if (NoFusing)
7477 return nullptr;
7478
7479 // Avoid partial and undef register update stalls unless optimizing for size.
7480 if (!MF.getFunction().hasOptSize() &&
7481 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7482 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7483 return nullptr;
7484
7485 // Don't fold subreg spills, or reloads that use a high subreg.
7486 for (auto Op : Ops) {
7487 MachineOperand &MO = MI.getOperand(Op);
7488 auto SubReg = MO.getSubReg();
7489 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7490 // (See patterns for MOV32r0 in TD files).
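// Hence the check below: a sub_32bit use of MOV32r0 is still fine to fold,
// while any other sub-register def, or a high-8-bit sub-register use, is
// rejected.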
7491 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit) 7492 continue; 7493 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi)) 7494 return nullptr; 7495 } 7496 7497 const MachineFrameInfo &MFI = MF.getFrameInfo(); 7498 unsigned Size = MFI.getObjectSize(FrameIndex); 7499 Align Alignment = MFI.getObjectAlign(FrameIndex); 7500 // If the function stack isn't realigned we don't want to fold instructions 7501 // that need increased alignment. 7502 if (!RI.hasStackRealignment(MF)) 7503 Alignment = 7504 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign()); 7505 7506 auto Impl = [&]() { 7507 return foldMemoryOperandImpl(MF, MI, Ops[0], 7508 MachineOperand::CreateFI(FrameIndex), InsertPt, 7509 Size, Alignment, /*AllowCommute=*/true); 7510 }; 7511 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { 7512 unsigned NewOpc = 0; 7513 unsigned RCSize = 0; 7514 unsigned Opc = MI.getOpcode(); 7515 switch (Opc) { 7516 default: 7517 // NDD can be folded into RMW though its Op0 and Op1 are not tied. 7518 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl() 7519 : nullptr; 7520 case X86::TEST8rr: 7521 NewOpc = X86::CMP8ri; 7522 RCSize = 1; 7523 break; 7524 case X86::TEST16rr: 7525 NewOpc = X86::CMP16ri; 7526 RCSize = 2; 7527 break; 7528 case X86::TEST32rr: 7529 NewOpc = X86::CMP32ri; 7530 RCSize = 4; 7531 break; 7532 case X86::TEST64rr: 7533 NewOpc = X86::CMP64ri32; 7534 RCSize = 8; 7535 break; 7536 } 7537 // Check if it's safe to fold the load. If the size of the object is 7538 // narrower than the load width, then it's not. 7539 if (Size < RCSize) 7540 return nullptr; 7541 // Change to CMPXXri r, 0 first. 7542 MI.setDesc(get(NewOpc)); 7543 MI.getOperand(1).ChangeToImmediate(0); 7544 } else if (Ops.size() != 1) 7545 return nullptr; 7546 7547 return Impl(); 7548 } 7549 7550 /// Check if \p LoadMI is a partial register load that we can't fold into \p MI 7551 /// because the latter uses contents that wouldn't be defined in the folded 7552 /// version. For instance, this transformation isn't legal: 7553 /// movss (%rdi), %xmm0 7554 /// addps %xmm0, %xmm0 7555 /// -> 7556 /// addps (%rdi), %xmm0 7557 /// 7558 /// But this one is: 7559 /// movss (%rdi), %xmm0 7560 /// addss %xmm0, %xmm0 7561 /// -> 7562 /// addss (%rdi), %xmm0 7563 /// 7564 static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, 7565 const MachineInstr &UserMI, 7566 const MachineFunction &MF) { 7567 unsigned Opc = LoadMI.getOpcode(); 7568 unsigned UserOpc = UserMI.getOpcode(); 7569 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 7570 const TargetRegisterClass *RC = 7571 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); 7572 unsigned RegSize = TRI.getRegSizeInBits(*RC); 7573 7574 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm || 7575 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt || 7576 Opc == X86::VMOVSSZrm_alt) && 7577 RegSize > 32) { 7578 // These instructions only load 32 bits, we can't fold them if the 7579 // destination register is wider than 32 bits (4 bytes), and its user 7580 // instruction isn't scalar (SS). 
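// The scalar (SS) users listed below only read the low 32 bits, so folding
// the 32-bit load into them is still safe; any other user is conservatively
// assumed to read the full register.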
7581 switch (UserOpc) { 7582 case X86::CVTSS2SDrr_Int: 7583 case X86::VCVTSS2SDrr_Int: 7584 case X86::VCVTSS2SDZrr_Int: 7585 case X86::VCVTSS2SDZrr_Intk: 7586 case X86::VCVTSS2SDZrr_Intkz: 7587 case X86::CVTSS2SIrr_Int: 7588 case X86::CVTSS2SI64rr_Int: 7589 case X86::VCVTSS2SIrr_Int: 7590 case X86::VCVTSS2SI64rr_Int: 7591 case X86::VCVTSS2SIZrr_Int: 7592 case X86::VCVTSS2SI64Zrr_Int: 7593 case X86::CVTTSS2SIrr_Int: 7594 case X86::CVTTSS2SI64rr_Int: 7595 case X86::VCVTTSS2SIrr_Int: 7596 case X86::VCVTTSS2SI64rr_Int: 7597 case X86::VCVTTSS2SIZrr_Int: 7598 case X86::VCVTTSS2SI64Zrr_Int: 7599 case X86::VCVTSS2USIZrr_Int: 7600 case X86::VCVTSS2USI64Zrr_Int: 7601 case X86::VCVTTSS2USIZrr_Int: 7602 case X86::VCVTTSS2USI64Zrr_Int: 7603 case X86::RCPSSr_Int: 7604 case X86::VRCPSSr_Int: 7605 case X86::RSQRTSSr_Int: 7606 case X86::VRSQRTSSr_Int: 7607 case X86::ROUNDSSri_Int: 7608 case X86::VROUNDSSri_Int: 7609 case X86::COMISSrr_Int: 7610 case X86::VCOMISSrr_Int: 7611 case X86::VCOMISSZrr_Int: 7612 case X86::UCOMISSrr_Int: 7613 case X86::VUCOMISSrr_Int: 7614 case X86::VUCOMISSZrr_Int: 7615 case X86::ADDSSrr_Int: 7616 case X86::VADDSSrr_Int: 7617 case X86::VADDSSZrr_Int: 7618 case X86::CMPSSrri_Int: 7619 case X86::VCMPSSrri_Int: 7620 case X86::VCMPSSZrri_Int: 7621 case X86::DIVSSrr_Int: 7622 case X86::VDIVSSrr_Int: 7623 case X86::VDIVSSZrr_Int: 7624 case X86::MAXSSrr_Int: 7625 case X86::VMAXSSrr_Int: 7626 case X86::VMAXSSZrr_Int: 7627 case X86::MINSSrr_Int: 7628 case X86::VMINSSrr_Int: 7629 case X86::VMINSSZrr_Int: 7630 case X86::MULSSrr_Int: 7631 case X86::VMULSSrr_Int: 7632 case X86::VMULSSZrr_Int: 7633 case X86::SQRTSSr_Int: 7634 case X86::VSQRTSSr_Int: 7635 case X86::VSQRTSSZr_Int: 7636 case X86::SUBSSrr_Int: 7637 case X86::VSUBSSrr_Int: 7638 case X86::VSUBSSZrr_Int: 7639 case X86::VADDSSZrr_Intk: 7640 case X86::VADDSSZrr_Intkz: 7641 case X86::VCMPSSZrri_Intk: 7642 case X86::VDIVSSZrr_Intk: 7643 case X86::VDIVSSZrr_Intkz: 7644 case X86::VMAXSSZrr_Intk: 7645 case X86::VMAXSSZrr_Intkz: 7646 case X86::VMINSSZrr_Intk: 7647 case X86::VMINSSZrr_Intkz: 7648 case X86::VMULSSZrr_Intk: 7649 case X86::VMULSSZrr_Intkz: 7650 case X86::VSQRTSSZr_Intk: 7651 case X86::VSQRTSSZr_Intkz: 7652 case X86::VSUBSSZrr_Intk: 7653 case X86::VSUBSSZrr_Intkz: 7654 case X86::VFMADDSS4rr_Int: 7655 case X86::VFNMADDSS4rr_Int: 7656 case X86::VFMSUBSS4rr_Int: 7657 case X86::VFNMSUBSS4rr_Int: 7658 case X86::VFMADD132SSr_Int: 7659 case X86::VFNMADD132SSr_Int: 7660 case X86::VFMADD213SSr_Int: 7661 case X86::VFNMADD213SSr_Int: 7662 case X86::VFMADD231SSr_Int: 7663 case X86::VFNMADD231SSr_Int: 7664 case X86::VFMSUB132SSr_Int: 7665 case X86::VFNMSUB132SSr_Int: 7666 case X86::VFMSUB213SSr_Int: 7667 case X86::VFNMSUB213SSr_Int: 7668 case X86::VFMSUB231SSr_Int: 7669 case X86::VFNMSUB231SSr_Int: 7670 case X86::VFMADD132SSZr_Int: 7671 case X86::VFNMADD132SSZr_Int: 7672 case X86::VFMADD213SSZr_Int: 7673 case X86::VFNMADD213SSZr_Int: 7674 case X86::VFMADD231SSZr_Int: 7675 case X86::VFNMADD231SSZr_Int: 7676 case X86::VFMSUB132SSZr_Int: 7677 case X86::VFNMSUB132SSZr_Int: 7678 case X86::VFMSUB213SSZr_Int: 7679 case X86::VFNMSUB213SSZr_Int: 7680 case X86::VFMSUB231SSZr_Int: 7681 case X86::VFNMSUB231SSZr_Int: 7682 case X86::VFMADD132SSZr_Intk: 7683 case X86::VFNMADD132SSZr_Intk: 7684 case X86::VFMADD213SSZr_Intk: 7685 case X86::VFNMADD213SSZr_Intk: 7686 case X86::VFMADD231SSZr_Intk: 7687 case X86::VFNMADD231SSZr_Intk: 7688 case X86::VFMSUB132SSZr_Intk: 7689 case X86::VFNMSUB132SSZr_Intk: 7690 case X86::VFMSUB213SSZr_Intk: 7691 case 
X86::VFNMSUB213SSZr_Intk: 7692 case X86::VFMSUB231SSZr_Intk: 7693 case X86::VFNMSUB231SSZr_Intk: 7694 case X86::VFMADD132SSZr_Intkz: 7695 case X86::VFNMADD132SSZr_Intkz: 7696 case X86::VFMADD213SSZr_Intkz: 7697 case X86::VFNMADD213SSZr_Intkz: 7698 case X86::VFMADD231SSZr_Intkz: 7699 case X86::VFNMADD231SSZr_Intkz: 7700 case X86::VFMSUB132SSZr_Intkz: 7701 case X86::VFNMSUB132SSZr_Intkz: 7702 case X86::VFMSUB213SSZr_Intkz: 7703 case X86::VFNMSUB213SSZr_Intkz: 7704 case X86::VFMSUB231SSZr_Intkz: 7705 case X86::VFNMSUB231SSZr_Intkz: 7706 case X86::VFIXUPIMMSSZrri: 7707 case X86::VFIXUPIMMSSZrrik: 7708 case X86::VFIXUPIMMSSZrrikz: 7709 case X86::VFPCLASSSSZrr: 7710 case X86::VFPCLASSSSZrrk: 7711 case X86::VGETEXPSSZr: 7712 case X86::VGETEXPSSZrk: 7713 case X86::VGETEXPSSZrkz: 7714 case X86::VGETMANTSSZrri: 7715 case X86::VGETMANTSSZrrik: 7716 case X86::VGETMANTSSZrrikz: 7717 case X86::VRANGESSZrri: 7718 case X86::VRANGESSZrrik: 7719 case X86::VRANGESSZrrikz: 7720 case X86::VRCP14SSZrr: 7721 case X86::VRCP14SSZrrk: 7722 case X86::VRCP14SSZrrkz: 7723 case X86::VRCP28SSZr: 7724 case X86::VRCP28SSZrk: 7725 case X86::VRCP28SSZrkz: 7726 case X86::VREDUCESSZrri: 7727 case X86::VREDUCESSZrrik: 7728 case X86::VREDUCESSZrrikz: 7729 case X86::VRNDSCALESSZr_Int: 7730 case X86::VRNDSCALESSZr_Intk: 7731 case X86::VRNDSCALESSZr_Intkz: 7732 case X86::VRSQRT14SSZrr: 7733 case X86::VRSQRT14SSZrrk: 7734 case X86::VRSQRT14SSZrrkz: 7735 case X86::VRSQRT28SSZr: 7736 case X86::VRSQRT28SSZrk: 7737 case X86::VRSQRT28SSZrkz: 7738 case X86::VSCALEFSSZrr: 7739 case X86::VSCALEFSSZrrk: 7740 case X86::VSCALEFSSZrrkz: 7741 return false; 7742 default: 7743 return true; 7744 } 7745 } 7746 7747 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm || 7748 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt || 7749 Opc == X86::VMOVSDZrm_alt) && 7750 RegSize > 64) { 7751 // These instructions only load 64 bits, we can't fold them if the 7752 // destination register is wider than 64 bits (8 bytes), and its user 7753 // instruction isn't scalar (SD). 
7754 switch (UserOpc) { 7755 case X86::CVTSD2SSrr_Int: 7756 case X86::VCVTSD2SSrr_Int: 7757 case X86::VCVTSD2SSZrr_Int: 7758 case X86::VCVTSD2SSZrr_Intk: 7759 case X86::VCVTSD2SSZrr_Intkz: 7760 case X86::CVTSD2SIrr_Int: 7761 case X86::CVTSD2SI64rr_Int: 7762 case X86::VCVTSD2SIrr_Int: 7763 case X86::VCVTSD2SI64rr_Int: 7764 case X86::VCVTSD2SIZrr_Int: 7765 case X86::VCVTSD2SI64Zrr_Int: 7766 case X86::CVTTSD2SIrr_Int: 7767 case X86::CVTTSD2SI64rr_Int: 7768 case X86::VCVTTSD2SIrr_Int: 7769 case X86::VCVTTSD2SI64rr_Int: 7770 case X86::VCVTTSD2SIZrr_Int: 7771 case X86::VCVTTSD2SI64Zrr_Int: 7772 case X86::VCVTSD2USIZrr_Int: 7773 case X86::VCVTSD2USI64Zrr_Int: 7774 case X86::VCVTTSD2USIZrr_Int: 7775 case X86::VCVTTSD2USI64Zrr_Int: 7776 case X86::ROUNDSDri_Int: 7777 case X86::VROUNDSDri_Int: 7778 case X86::COMISDrr_Int: 7779 case X86::VCOMISDrr_Int: 7780 case X86::VCOMISDZrr_Int: 7781 case X86::UCOMISDrr_Int: 7782 case X86::VUCOMISDrr_Int: 7783 case X86::VUCOMISDZrr_Int: 7784 case X86::ADDSDrr_Int: 7785 case X86::VADDSDrr_Int: 7786 case X86::VADDSDZrr_Int: 7787 case X86::CMPSDrri_Int: 7788 case X86::VCMPSDrri_Int: 7789 case X86::VCMPSDZrri_Int: 7790 case X86::DIVSDrr_Int: 7791 case X86::VDIVSDrr_Int: 7792 case X86::VDIVSDZrr_Int: 7793 case X86::MAXSDrr_Int: 7794 case X86::VMAXSDrr_Int: 7795 case X86::VMAXSDZrr_Int: 7796 case X86::MINSDrr_Int: 7797 case X86::VMINSDrr_Int: 7798 case X86::VMINSDZrr_Int: 7799 case X86::MULSDrr_Int: 7800 case X86::VMULSDrr_Int: 7801 case X86::VMULSDZrr_Int: 7802 case X86::SQRTSDr_Int: 7803 case X86::VSQRTSDr_Int: 7804 case X86::VSQRTSDZr_Int: 7805 case X86::SUBSDrr_Int: 7806 case X86::VSUBSDrr_Int: 7807 case X86::VSUBSDZrr_Int: 7808 case X86::VADDSDZrr_Intk: 7809 case X86::VADDSDZrr_Intkz: 7810 case X86::VCMPSDZrri_Intk: 7811 case X86::VDIVSDZrr_Intk: 7812 case X86::VDIVSDZrr_Intkz: 7813 case X86::VMAXSDZrr_Intk: 7814 case X86::VMAXSDZrr_Intkz: 7815 case X86::VMINSDZrr_Intk: 7816 case X86::VMINSDZrr_Intkz: 7817 case X86::VMULSDZrr_Intk: 7818 case X86::VMULSDZrr_Intkz: 7819 case X86::VSQRTSDZr_Intk: 7820 case X86::VSQRTSDZr_Intkz: 7821 case X86::VSUBSDZrr_Intk: 7822 case X86::VSUBSDZrr_Intkz: 7823 case X86::VFMADDSD4rr_Int: 7824 case X86::VFNMADDSD4rr_Int: 7825 case X86::VFMSUBSD4rr_Int: 7826 case X86::VFNMSUBSD4rr_Int: 7827 case X86::VFMADD132SDr_Int: 7828 case X86::VFNMADD132SDr_Int: 7829 case X86::VFMADD213SDr_Int: 7830 case X86::VFNMADD213SDr_Int: 7831 case X86::VFMADD231SDr_Int: 7832 case X86::VFNMADD231SDr_Int: 7833 case X86::VFMSUB132SDr_Int: 7834 case X86::VFNMSUB132SDr_Int: 7835 case X86::VFMSUB213SDr_Int: 7836 case X86::VFNMSUB213SDr_Int: 7837 case X86::VFMSUB231SDr_Int: 7838 case X86::VFNMSUB231SDr_Int: 7839 case X86::VFMADD132SDZr_Int: 7840 case X86::VFNMADD132SDZr_Int: 7841 case X86::VFMADD213SDZr_Int: 7842 case X86::VFNMADD213SDZr_Int: 7843 case X86::VFMADD231SDZr_Int: 7844 case X86::VFNMADD231SDZr_Int: 7845 case X86::VFMSUB132SDZr_Int: 7846 case X86::VFNMSUB132SDZr_Int: 7847 case X86::VFMSUB213SDZr_Int: 7848 case X86::VFNMSUB213SDZr_Int: 7849 case X86::VFMSUB231SDZr_Int: 7850 case X86::VFNMSUB231SDZr_Int: 7851 case X86::VFMADD132SDZr_Intk: 7852 case X86::VFNMADD132SDZr_Intk: 7853 case X86::VFMADD213SDZr_Intk: 7854 case X86::VFNMADD213SDZr_Intk: 7855 case X86::VFMADD231SDZr_Intk: 7856 case X86::VFNMADD231SDZr_Intk: 7857 case X86::VFMSUB132SDZr_Intk: 7858 case X86::VFNMSUB132SDZr_Intk: 7859 case X86::VFMSUB213SDZr_Intk: 7860 case X86::VFNMSUB213SDZr_Intk: 7861 case X86::VFMSUB231SDZr_Intk: 7862 case X86::VFNMSUB231SDZr_Intk: 7863 case 
X86::VFMADD132SDZr_Intkz: 7864 case X86::VFNMADD132SDZr_Intkz: 7865 case X86::VFMADD213SDZr_Intkz: 7866 case X86::VFNMADD213SDZr_Intkz: 7867 case X86::VFMADD231SDZr_Intkz: 7868 case X86::VFNMADD231SDZr_Intkz: 7869 case X86::VFMSUB132SDZr_Intkz: 7870 case X86::VFNMSUB132SDZr_Intkz: 7871 case X86::VFMSUB213SDZr_Intkz: 7872 case X86::VFNMSUB213SDZr_Intkz: 7873 case X86::VFMSUB231SDZr_Intkz: 7874 case X86::VFNMSUB231SDZr_Intkz: 7875 case X86::VFIXUPIMMSDZrri: 7876 case X86::VFIXUPIMMSDZrrik: 7877 case X86::VFIXUPIMMSDZrrikz: 7878 case X86::VFPCLASSSDZrr: 7879 case X86::VFPCLASSSDZrrk: 7880 case X86::VGETEXPSDZr: 7881 case X86::VGETEXPSDZrk: 7882 case X86::VGETEXPSDZrkz: 7883 case X86::VGETMANTSDZrri: 7884 case X86::VGETMANTSDZrrik: 7885 case X86::VGETMANTSDZrrikz: 7886 case X86::VRANGESDZrri: 7887 case X86::VRANGESDZrrik: 7888 case X86::VRANGESDZrrikz: 7889 case X86::VRCP14SDZrr: 7890 case X86::VRCP14SDZrrk: 7891 case X86::VRCP14SDZrrkz: 7892 case X86::VRCP28SDZr: 7893 case X86::VRCP28SDZrk: 7894 case X86::VRCP28SDZrkz: 7895 case X86::VREDUCESDZrri: 7896 case X86::VREDUCESDZrrik: 7897 case X86::VREDUCESDZrrikz: 7898 case X86::VRNDSCALESDZr_Int: 7899 case X86::VRNDSCALESDZr_Intk: 7900 case X86::VRNDSCALESDZr_Intkz: 7901 case X86::VRSQRT14SDZrr: 7902 case X86::VRSQRT14SDZrrk: 7903 case X86::VRSQRT14SDZrrkz: 7904 case X86::VRSQRT28SDZr: 7905 case X86::VRSQRT28SDZrk: 7906 case X86::VRSQRT28SDZrkz: 7907 case X86::VSCALEFSDZrr: 7908 case X86::VSCALEFSDZrrk: 7909 case X86::VSCALEFSDZrrkz: 7910 return false; 7911 default: 7912 return true; 7913 } 7914 } 7915 7916 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) { 7917 // These instructions only load 16 bits, we can't fold them if the 7918 // destination register is wider than 16 bits (2 bytes), and its user 7919 // instruction isn't scalar (SH). 
7920 switch (UserOpc) { 7921 case X86::VADDSHZrr_Int: 7922 case X86::VCMPSHZrri_Int: 7923 case X86::VDIVSHZrr_Int: 7924 case X86::VMAXSHZrr_Int: 7925 case X86::VMINSHZrr_Int: 7926 case X86::VMULSHZrr_Int: 7927 case X86::VSUBSHZrr_Int: 7928 case X86::VADDSHZrr_Intk: 7929 case X86::VADDSHZrr_Intkz: 7930 case X86::VCMPSHZrri_Intk: 7931 case X86::VDIVSHZrr_Intk: 7932 case X86::VDIVSHZrr_Intkz: 7933 case X86::VMAXSHZrr_Intk: 7934 case X86::VMAXSHZrr_Intkz: 7935 case X86::VMINSHZrr_Intk: 7936 case X86::VMINSHZrr_Intkz: 7937 case X86::VMULSHZrr_Intk: 7938 case X86::VMULSHZrr_Intkz: 7939 case X86::VSUBSHZrr_Intk: 7940 case X86::VSUBSHZrr_Intkz: 7941 case X86::VFMADD132SHZr_Int: 7942 case X86::VFNMADD132SHZr_Int: 7943 case X86::VFMADD213SHZr_Int: 7944 case X86::VFNMADD213SHZr_Int: 7945 case X86::VFMADD231SHZr_Int: 7946 case X86::VFNMADD231SHZr_Int: 7947 case X86::VFMSUB132SHZr_Int: 7948 case X86::VFNMSUB132SHZr_Int: 7949 case X86::VFMSUB213SHZr_Int: 7950 case X86::VFNMSUB213SHZr_Int: 7951 case X86::VFMSUB231SHZr_Int: 7952 case X86::VFNMSUB231SHZr_Int: 7953 case X86::VFMADD132SHZr_Intk: 7954 case X86::VFNMADD132SHZr_Intk: 7955 case X86::VFMADD213SHZr_Intk: 7956 case X86::VFNMADD213SHZr_Intk: 7957 case X86::VFMADD231SHZr_Intk: 7958 case X86::VFNMADD231SHZr_Intk: 7959 case X86::VFMSUB132SHZr_Intk: 7960 case X86::VFNMSUB132SHZr_Intk: 7961 case X86::VFMSUB213SHZr_Intk: 7962 case X86::VFNMSUB213SHZr_Intk: 7963 case X86::VFMSUB231SHZr_Intk: 7964 case X86::VFNMSUB231SHZr_Intk: 7965 case X86::VFMADD132SHZr_Intkz: 7966 case X86::VFNMADD132SHZr_Intkz: 7967 case X86::VFMADD213SHZr_Intkz: 7968 case X86::VFNMADD213SHZr_Intkz: 7969 case X86::VFMADD231SHZr_Intkz: 7970 case X86::VFNMADD231SHZr_Intkz: 7971 case X86::VFMSUB132SHZr_Intkz: 7972 case X86::VFNMSUB132SHZr_Intkz: 7973 case X86::VFMSUB213SHZr_Intkz: 7974 case X86::VFNMSUB213SHZr_Intkz: 7975 case X86::VFMSUB231SHZr_Intkz: 7976 case X86::VFNMSUB231SHZr_Intkz: 7977 return false; 7978 default: 7979 return true; 7980 } 7981 } 7982 7983 return false; 7984 } 7985 7986 MachineInstr *X86InstrInfo::foldMemoryOperandImpl( 7987 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 7988 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, 7989 LiveIntervals *LIS) const { 7990 7991 // TODO: Support the case where LoadMI loads a wide register, but MI 7992 // only uses a subreg. 7993 for (auto Op : Ops) { 7994 if (MI.getOperand(Op).getSubReg()) 7995 return nullptr; 7996 } 7997 7998 // If loading from a FrameIndex, fold directly from the FrameIndex. 7999 unsigned NumOps = LoadMI.getDesc().getNumOperands(); 8000 int FrameIndex; 8001 if (isLoadFromStackSlot(LoadMI, FrameIndex)) { 8002 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) 8003 return nullptr; 8004 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS); 8005 } 8006 8007 // Check switch flag 8008 if (NoFusing) 8009 return nullptr; 8010 8011 // Avoid partial and undef register update stalls unless optimizing for size. 8012 if (!MF.getFunction().hasOptSize() && 8013 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) || 8014 shouldPreventUndefRegUpdateMemFold(MF, MI))) 8015 return nullptr; 8016 8017 // Determine the alignment of the load. 
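// Prefer the alignment recorded on the load's memory operand; the zero/
// all-ones pseudos handled below carry no memory operand, so fall back to
// the natural alignment of the value they materialize.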
8018 Align Alignment; 8019 unsigned LoadOpc = LoadMI.getOpcode(); 8020 if (LoadMI.hasOneMemOperand()) 8021 Alignment = (*LoadMI.memoperands_begin())->getAlign(); 8022 else 8023 switch (LoadOpc) { 8024 case X86::AVX512_512_SET0: 8025 case X86::AVX512_512_SETALLONES: 8026 Alignment = Align(64); 8027 break; 8028 case X86::AVX2_SETALLONES: 8029 case X86::AVX1_SETALLONES: 8030 case X86::AVX_SET0: 8031 case X86::AVX512_256_SET0: 8032 Alignment = Align(32); 8033 break; 8034 case X86::V_SET0: 8035 case X86::V_SETALLONES: 8036 case X86::AVX512_128_SET0: 8037 case X86::FsFLD0F128: 8038 case X86::AVX512_FsFLD0F128: 8039 Alignment = Align(16); 8040 break; 8041 case X86::MMX_SET0: 8042 case X86::FsFLD0SD: 8043 case X86::AVX512_FsFLD0SD: 8044 Alignment = Align(8); 8045 break; 8046 case X86::FsFLD0SS: 8047 case X86::AVX512_FsFLD0SS: 8048 Alignment = Align(4); 8049 break; 8050 case X86::FsFLD0SH: 8051 case X86::AVX512_FsFLD0SH: 8052 Alignment = Align(2); 8053 break; 8054 default: 8055 return nullptr; 8056 } 8057 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { 8058 unsigned NewOpc = 0; 8059 switch (MI.getOpcode()) { 8060 default: 8061 return nullptr; 8062 case X86::TEST8rr: 8063 NewOpc = X86::CMP8ri; 8064 break; 8065 case X86::TEST16rr: 8066 NewOpc = X86::CMP16ri; 8067 break; 8068 case X86::TEST32rr: 8069 NewOpc = X86::CMP32ri; 8070 break; 8071 case X86::TEST64rr: 8072 NewOpc = X86::CMP64ri32; 8073 break; 8074 } 8075 // Change to CMPXXri r, 0 first. 8076 MI.setDesc(get(NewOpc)); 8077 MI.getOperand(1).ChangeToImmediate(0); 8078 } else if (Ops.size() != 1) 8079 return nullptr; 8080 8081 // Make sure the subregisters match. 8082 // Otherwise we risk changing the size of the load. 8083 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg()) 8084 return nullptr; 8085 8086 SmallVector<MachineOperand, X86::AddrNumOperands> MOs; 8087 switch (LoadOpc) { 8088 case X86::MMX_SET0: 8089 case X86::V_SET0: 8090 case X86::V_SETALLONES: 8091 case X86::AVX2_SETALLONES: 8092 case X86::AVX1_SETALLONES: 8093 case X86::AVX_SET0: 8094 case X86::AVX512_128_SET0: 8095 case X86::AVX512_256_SET0: 8096 case X86::AVX512_512_SET0: 8097 case X86::AVX512_512_SETALLONES: 8098 case X86::FsFLD0SH: 8099 case X86::AVX512_FsFLD0SH: 8100 case X86::FsFLD0SD: 8101 case X86::AVX512_FsFLD0SD: 8102 case X86::FsFLD0SS: 8103 case X86::AVX512_FsFLD0SS: 8104 case X86::FsFLD0F128: 8105 case X86::AVX512_FsFLD0F128: { 8106 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. 8107 // Create a constant-pool entry and operands to load from it. 8108 8109 // Large code model can't fold loads this way. 8110 if (MF.getTarget().getCodeModel() == CodeModel::Large) 8111 return nullptr; 8112 8113 // x86-32 PIC requires a PIC base register for constant pools. 8114 unsigned PICBase = 0; 8115 // Since we're using Small or Kernel code model, we can always use 8116 // RIP-relative addressing for a smaller encoding. 8117 if (Subtarget.is64Bit()) { 8118 PICBase = X86::RIP; 8119 } else if (MF.getTarget().isPositionIndependent()) { 8120 // FIXME: PICBase = getGlobalBaseReg(&MF); 8121 // This doesn't work for several reasons. 8122 // 1. GlobalBaseReg may have been spilled. 8123 // 2. It may not be live at MI. 8124 return nullptr; 8125 } 8126 8127 // Create a constant-pool entry. 
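// The element type and zero/all-ones pattern are derived from the pseudo's
// opcode below; the entry is then addressed RIP-relatively in 64-bit mode,
// or with no base register otherwise (32-bit PIC already bailed out above).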
8128 MachineConstantPool &MCP = *MF.getConstantPool(); 8129 Type *Ty; 8130 bool IsAllOnes = false; 8131 switch (LoadOpc) { 8132 case X86::FsFLD0SS: 8133 case X86::AVX512_FsFLD0SS: 8134 Ty = Type::getFloatTy(MF.getFunction().getContext()); 8135 break; 8136 case X86::FsFLD0SD: 8137 case X86::AVX512_FsFLD0SD: 8138 Ty = Type::getDoubleTy(MF.getFunction().getContext()); 8139 break; 8140 case X86::FsFLD0F128: 8141 case X86::AVX512_FsFLD0F128: 8142 Ty = Type::getFP128Ty(MF.getFunction().getContext()); 8143 break; 8144 case X86::FsFLD0SH: 8145 case X86::AVX512_FsFLD0SH: 8146 Ty = Type::getHalfTy(MF.getFunction().getContext()); 8147 break; 8148 case X86::AVX512_512_SETALLONES: 8149 IsAllOnes = true; 8150 [[fallthrough]]; 8151 case X86::AVX512_512_SET0: 8152 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8153 16); 8154 break; 8155 case X86::AVX1_SETALLONES: 8156 case X86::AVX2_SETALLONES: 8157 IsAllOnes = true; 8158 [[fallthrough]]; 8159 case X86::AVX512_256_SET0: 8160 case X86::AVX_SET0: 8161 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8162 8); 8163 8164 break; 8165 case X86::MMX_SET0: 8166 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8167 2); 8168 break; 8169 case X86::V_SETALLONES: 8170 IsAllOnes = true; 8171 [[fallthrough]]; 8172 case X86::V_SET0: 8173 case X86::AVX512_128_SET0: 8174 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8175 4); 8176 break; 8177 } 8178 8179 const Constant *C = 8180 IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); 8181 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); 8182 8183 // Create operands to load from the constant pool entry. 8184 MOs.push_back(MachineOperand::CreateReg(PICBase, false)); 8185 MOs.push_back(MachineOperand::CreateImm(1)); 8186 MOs.push_back(MachineOperand::CreateReg(0, false)); 8187 MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); 8188 MOs.push_back(MachineOperand::CreateReg(0, false)); 8189 break; 8190 } 8191 case X86::VPBROADCASTBZ128rm: 8192 case X86::VPBROADCASTBZ256rm: 8193 case X86::VPBROADCASTBZrm: 8194 case X86::VBROADCASTF32X2Z256rm: 8195 case X86::VBROADCASTF32X2Zrm: 8196 case X86::VBROADCASTI32X2Z128rm: 8197 case X86::VBROADCASTI32X2Z256rm: 8198 case X86::VBROADCASTI32X2Zrm: 8199 // No instructions currently fuse with 8bits or 32bits x 2. 8200 return nullptr; 8201 8202 #define FOLD_BROADCAST(SIZE) \ 8203 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \ 8204 LoadMI.operands_begin() + NumOps); \ 8205 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \ 8206 /*AllowCommute=*/true); 8207 case X86::VPBROADCASTWZ128rm: 8208 case X86::VPBROADCASTWZ256rm: 8209 case X86::VPBROADCASTWZrm: 8210 FOLD_BROADCAST(16); 8211 case X86::VPBROADCASTDZ128rm: 8212 case X86::VPBROADCASTDZ256rm: 8213 case X86::VPBROADCASTDZrm: 8214 case X86::VBROADCASTSSZ128rm: 8215 case X86::VBROADCASTSSZ256rm: 8216 case X86::VBROADCASTSSZrm: 8217 FOLD_BROADCAST(32); 8218 case X86::VPBROADCASTQZ128rm: 8219 case X86::VPBROADCASTQZ256rm: 8220 case X86::VPBROADCASTQZrm: 8221 case X86::VBROADCASTSDZ256rm: 8222 case X86::VBROADCASTSDZrm: 8223 FOLD_BROADCAST(64); 8224 default: { 8225 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) 8226 return nullptr; 8227 8228 // Folding a normal load. Just copy the load's address operands. 
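// That is, the final X86::AddrNumOperands declared operands of LoadMI
// (base, scale, index, displacement and segment).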
8229 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, 8230 LoadMI.operands_begin() + NumOps); 8231 break; 8232 } 8233 } 8234 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt, 8235 /*Size=*/0, Alignment, /*AllowCommute=*/true); 8236 } 8237 8238 MachineInstr * 8239 X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI, 8240 unsigned OpNum, ArrayRef<MachineOperand> MOs, 8241 MachineBasicBlock::iterator InsertPt, 8242 unsigned BitsSize, bool AllowCommute) const { 8243 8244 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum)) 8245 return matchBroadcastSize(*I, BitsSize) 8246 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this) 8247 : nullptr; 8248 8249 if (AllowCommute) { 8250 // If the instruction and target operand are commutable, commute the 8251 // instruction and try again. 8252 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum); 8253 if (CommuteOpIdx2 == OpNum) { 8254 printFailMsgforFold(MI, OpNum); 8255 return nullptr; 8256 } 8257 MachineInstr *NewMI = 8258 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize, 8259 /*AllowCommute=*/false); 8260 if (NewMI) 8261 return NewMI; 8262 // Folding failed again - undo the commute before returning. 8263 commuteInstruction(MI, false, OpNum, CommuteOpIdx2); 8264 } 8265 8266 printFailMsgforFold(MI, OpNum); 8267 return nullptr; 8268 } 8269 8270 static SmallVector<MachineMemOperand *, 2> 8271 extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { 8272 SmallVector<MachineMemOperand *, 2> LoadMMOs; 8273 8274 for (MachineMemOperand *MMO : MMOs) { 8275 if (!MMO->isLoad()) 8276 continue; 8277 8278 if (!MMO->isStore()) { 8279 // Reuse the MMO. 8280 LoadMMOs.push_back(MMO); 8281 } else { 8282 // Clone the MMO and unset the store flag. 8283 LoadMMOs.push_back(MF.getMachineMemOperand( 8284 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore)); 8285 } 8286 } 8287 8288 return LoadMMOs; 8289 } 8290 8291 static SmallVector<MachineMemOperand *, 2> 8292 extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { 8293 SmallVector<MachineMemOperand *, 2> StoreMMOs; 8294 8295 for (MachineMemOperand *MMO : MMOs) { 8296 if (!MMO->isStore()) 8297 continue; 8298 8299 if (!MMO->isLoad()) { 8300 // Reuse the MMO. 8301 StoreMMOs.push_back(MMO); 8302 } else { 8303 // Clone the MMO and unset the load flag. 
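// The clone then describes only the store half of a combined load/store
// memory operand.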
8304 StoreMMOs.push_back(MF.getMachineMemOperand( 8305 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad)); 8306 } 8307 } 8308 8309 return StoreMMOs; 8310 } 8311 8312 static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, 8313 const TargetRegisterClass *RC, 8314 const X86Subtarget &STI) { 8315 assert(STI.hasAVX512() && "Expected at least AVX512!"); 8316 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); 8317 assert((SpillSize == 64 || STI.hasVLX()) && 8318 "Can't broadcast less than 64 bytes without AVX512VL!"); 8319 8320 #define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \ 8321 case TYPE: \ 8322 switch (SpillSize) { \ 8323 default: \ 8324 llvm_unreachable("Unknown spill size"); \ 8325 case 16: \ 8326 return X86::OP16; \ 8327 case 32: \ 8328 return X86::OP32; \ 8329 case 64: \ 8330 return X86::OP64; \ 8331 } \ 8332 break; 8333 8334 switch (I->Flags & TB_BCAST_MASK) { 8335 default: 8336 llvm_unreachable("Unexpected broadcast type!"); 8337 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm, 8338 VPBROADCASTWZrm) 8339 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm, 8340 VPBROADCASTDZrm) 8341 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm, 8342 VPBROADCASTQZrm) 8343 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm, 8344 VPBROADCASTWZrm) 8345 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm, 8346 VBROADCASTSSZrm) 8347 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm, 8348 VBROADCASTSDZrm) 8349 } 8350 } 8351 8352 bool X86InstrInfo::unfoldMemoryOperand( 8353 MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, 8354 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const { 8355 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode()); 8356 if (I == nullptr) 8357 return false; 8358 unsigned Opc = I->DstOp; 8359 unsigned Index = I->Flags & TB_INDEX_MASK; 8360 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; 8361 bool FoldedStore = I->Flags & TB_FOLDED_STORE; 8362 if (UnfoldLoad && !FoldedLoad) 8363 return false; 8364 UnfoldLoad &= FoldedLoad; 8365 if (UnfoldStore && !FoldedStore) 8366 return false; 8367 UnfoldStore &= FoldedStore; 8368 8369 const MCInstrDesc &MCID = get(Opc); 8370 8371 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); 8372 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 8373 // TODO: Check if 32-byte or greater accesses are slow too? 8374 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && 8375 Subtarget.isUnalignedMem16Slow()) 8376 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will 8377 // conservatively assume the address is unaligned. That's bad for 8378 // performance. 8379 return false; 8380 SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps; 8381 SmallVector<MachineOperand, 2> BeforeOps; 8382 SmallVector<MachineOperand, 2> AfterOps; 8383 SmallVector<MachineOperand, 4> ImpOps; 8384 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 8385 MachineOperand &Op = MI.getOperand(i); 8386 if (i >= Index && i < Index + X86::AddrNumOperands) 8387 AddrOps.push_back(Op); 8388 else if (Op.isReg() && Op.isImplicit()) 8389 ImpOps.push_back(Op); 8390 else if (i < Index) 8391 BeforeOps.push_back(Op); 8392 else if (i > Index) 8393 AfterOps.push_back(Op); 8394 } 8395 8396 // Emit the load or broadcast instruction. 
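// Broadcast folds are re-expanded via getBroadcastOpcode (chosen by the
// register class's spill size); plain loads use the regular reload opcode,
// taking the aligned form only when the memory operand guarantees it.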
8397 if (UnfoldLoad) { 8398 auto MMOs = extractLoadMMOs(MI.memoperands(), MF); 8399 8400 unsigned Opc; 8401 if (I->Flags & TB_BCAST_MASK) { 8402 Opc = getBroadcastOpcode(I, RC, Subtarget); 8403 } else { 8404 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); 8405 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; 8406 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); 8407 } 8408 8409 DebugLoc DL; 8410 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); 8411 for (const MachineOperand &AddrOp : AddrOps) 8412 MIB.add(AddrOp); 8413 MIB.setMemRefs(MMOs); 8414 NewMIs.push_back(MIB); 8415 8416 if (UnfoldStore) { 8417 // Address operands cannot be marked isKill. 8418 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { 8419 MachineOperand &MO = NewMIs[0]->getOperand(i); 8420 if (MO.isReg()) 8421 MO.setIsKill(false); 8422 } 8423 } 8424 } 8425 8426 // Emit the data processing instruction. 8427 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true); 8428 MachineInstrBuilder MIB(MF, DataMI); 8429 8430 if (FoldedStore) 8431 MIB.addReg(Reg, RegState::Define); 8432 for (MachineOperand &BeforeOp : BeforeOps) 8433 MIB.add(BeforeOp); 8434 if (FoldedLoad) 8435 MIB.addReg(Reg); 8436 for (MachineOperand &AfterOp : AfterOps) 8437 MIB.add(AfterOp); 8438 for (MachineOperand &ImpOp : ImpOps) { 8439 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) | 8440 RegState::Implicit | 8441 getKillRegState(ImpOp.isKill()) | 8442 getDeadRegState(ImpOp.isDead()) | 8443 getUndefRegState(ImpOp.isUndef())); 8444 } 8445 // Change CMP32ri r, 0 back to TEST32rr r, r, etc. 8446 switch (DataMI->getOpcode()) { 8447 default: 8448 break; 8449 case X86::CMP64ri32: 8450 case X86::CMP32ri: 8451 case X86::CMP16ri: 8452 case X86::CMP8ri: { 8453 MachineOperand &MO0 = DataMI->getOperand(0); 8454 MachineOperand &MO1 = DataMI->getOperand(1); 8455 if (MO1.isImm() && MO1.getImm() == 0) { 8456 unsigned NewOpc; 8457 switch (DataMI->getOpcode()) { 8458 default: 8459 llvm_unreachable("Unreachable!"); 8460 case X86::CMP64ri32: 8461 NewOpc = X86::TEST64rr; 8462 break; 8463 case X86::CMP32ri: 8464 NewOpc = X86::TEST32rr; 8465 break; 8466 case X86::CMP16ri: 8467 NewOpc = X86::TEST16rr; 8468 break; 8469 case X86::CMP8ri: 8470 NewOpc = X86::TEST8rr; 8471 break; 8472 } 8473 DataMI->setDesc(get(NewOpc)); 8474 MO1.ChangeToRegister(MO0.getReg(), false); 8475 } 8476 } 8477 } 8478 NewMIs.push_back(DataMI); 8479 8480 // Emit the store instruction. 
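// Only reached when the folded instruction also wrote memory (FoldedStore);
// the result is written back from Reg with an ordinary spill-store opcode.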
8481 if (UnfoldStore) { 8482 const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); 8483 auto MMOs = extractStoreMMOs(MI.memoperands(), MF); 8484 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); 8485 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; 8486 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); 8487 DebugLoc DL; 8488 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); 8489 for (const MachineOperand &AddrOp : AddrOps) 8490 MIB.add(AddrOp); 8491 MIB.addReg(Reg, RegState::Kill); 8492 MIB.setMemRefs(MMOs); 8493 NewMIs.push_back(MIB); 8494 } 8495 8496 return true; 8497 } 8498 8499 bool X86InstrInfo::unfoldMemoryOperand( 8500 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const { 8501 if (!N->isMachineOpcode()) 8502 return false; 8503 8504 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode()); 8505 if (I == nullptr) 8506 return false; 8507 unsigned Opc = I->DstOp; 8508 unsigned Index = I->Flags & TB_INDEX_MASK; 8509 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; 8510 bool FoldedStore = I->Flags & TB_FOLDED_STORE; 8511 const MCInstrDesc &MCID = get(Opc); 8512 MachineFunction &MF = DAG.getMachineFunction(); 8513 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 8514 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); 8515 unsigned NumDefs = MCID.NumDefs; 8516 std::vector<SDValue> AddrOps; 8517 std::vector<SDValue> BeforeOps; 8518 std::vector<SDValue> AfterOps; 8519 SDLoc dl(N); 8520 unsigned NumOps = N->getNumOperands(); 8521 for (unsigned i = 0; i != NumOps - 1; ++i) { 8522 SDValue Op = N->getOperand(i); 8523 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands) 8524 AddrOps.push_back(Op); 8525 else if (i < Index - NumDefs) 8526 BeforeOps.push_back(Op); 8527 else if (i > Index - NumDefs) 8528 AfterOps.push_back(Op); 8529 } 8530 SDValue Chain = N->getOperand(NumOps - 1); 8531 AddrOps.push_back(Chain); 8532 8533 // Emit the load instruction. 8534 SDNode *Load = nullptr; 8535 if (FoldedLoad) { 8536 EVT VT = *TRI.legalclasstypes_begin(*RC); 8537 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF); 8538 if (MMOs.empty() && RC == &X86::VR128RegClass && 8539 Subtarget.isUnalignedMem16Slow()) 8540 // Do not introduce a slow unaligned load. 8541 return false; 8542 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte 8543 // memory access is slow above. 8544 8545 unsigned Opc; 8546 if (I->Flags & TB_BCAST_MASK) { 8547 Opc = getBroadcastOpcode(I, RC, Subtarget); 8548 } else { 8549 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); 8550 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; 8551 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); 8552 } 8553 8554 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); 8555 NewNodes.push_back(Load); 8556 8557 // Preserve memory reference information. 8558 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs); 8559 } 8560 8561 // Emit the data processing instruction. 
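// Rebuild the register form as a machine SDNode: the result types are the
// register definition (if any) plus the node's non-chain values, and the
// operands are BeforeOps with the unfolded load's result (when present) and
// the trailing operands appended.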
8562 std::vector<EVT> VTs; 8563 const TargetRegisterClass *DstRC = nullptr; 8564 if (MCID.getNumDefs() > 0) { 8565 DstRC = getRegClass(MCID, 0, &RI, MF); 8566 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); 8567 } 8568 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { 8569 EVT VT = N->getValueType(i); 8570 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) 8571 VTs.push_back(VT); 8572 } 8573 if (Load) 8574 BeforeOps.push_back(SDValue(Load, 0)); 8575 llvm::append_range(BeforeOps, AfterOps); 8576 // Change CMP32ri r, 0 back to TEST32rr r, r, etc. 8577 switch (Opc) { 8578 default: 8579 break; 8580 case X86::CMP64ri32: 8581 case X86::CMP32ri: 8582 case X86::CMP16ri: 8583 case X86::CMP8ri: 8584 if (isNullConstant(BeforeOps[1])) { 8585 switch (Opc) { 8586 default: 8587 llvm_unreachable("Unreachable!"); 8588 case X86::CMP64ri32: 8589 Opc = X86::TEST64rr; 8590 break; 8591 case X86::CMP32ri: 8592 Opc = X86::TEST32rr; 8593 break; 8594 case X86::CMP16ri: 8595 Opc = X86::TEST16rr; 8596 break; 8597 case X86::CMP8ri: 8598 Opc = X86::TEST8rr; 8599 break; 8600 } 8601 BeforeOps[1] = BeforeOps[0]; 8602 } 8603 } 8604 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps); 8605 NewNodes.push_back(NewNode); 8606 8607 // Emit the store instruction. 8608 if (FoldedStore) { 8609 AddrOps.pop_back(); 8610 AddrOps.push_back(SDValue(NewNode, 0)); 8611 AddrOps.push_back(Chain); 8612 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF); 8613 if (MMOs.empty() && RC == &X86::VR128RegClass && 8614 Subtarget.isUnalignedMem16Slow()) 8615 // Do not introduce a slow unaligned store. 8616 return false; 8617 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte 8618 // memory access is slow above. 8619 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); 8620 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; 8621 SDNode *Store = 8622 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), 8623 dl, MVT::Other, AddrOps); 8624 NewNodes.push_back(Store); 8625 8626 // Preserve memory reference information. 
8627 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs); 8628 } 8629 8630 return true; 8631 } 8632 8633 unsigned 8634 X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, 8635 bool UnfoldStore, 8636 unsigned *LoadRegIndex) const { 8637 const X86FoldTableEntry *I = lookupUnfoldTable(Opc); 8638 if (I == nullptr) 8639 return 0; 8640 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; 8641 bool FoldedStore = I->Flags & TB_FOLDED_STORE; 8642 if (UnfoldLoad && !FoldedLoad) 8643 return 0; 8644 if (UnfoldStore && !FoldedStore) 8645 return 0; 8646 if (LoadRegIndex) 8647 *LoadRegIndex = I->Flags & TB_INDEX_MASK; 8648 return I->DstOp; 8649 } 8650 8651 bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, 8652 int64_t &Offset1, 8653 int64_t &Offset2) const { 8654 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) 8655 return false; 8656 8657 auto IsLoadOpcode = [&](unsigned Opcode) { 8658 switch (Opcode) { 8659 default: 8660 return false; 8661 case X86::MOV8rm: 8662 case X86::MOV16rm: 8663 case X86::MOV32rm: 8664 case X86::MOV64rm: 8665 case X86::LD_Fp32m: 8666 case X86::LD_Fp64m: 8667 case X86::LD_Fp80m: 8668 case X86::MOVSSrm: 8669 case X86::MOVSSrm_alt: 8670 case X86::MOVSDrm: 8671 case X86::MOVSDrm_alt: 8672 case X86::MMX_MOVD64rm: 8673 case X86::MMX_MOVQ64rm: 8674 case X86::MOVAPSrm: 8675 case X86::MOVUPSrm: 8676 case X86::MOVAPDrm: 8677 case X86::MOVUPDrm: 8678 case X86::MOVDQArm: 8679 case X86::MOVDQUrm: 8680 // AVX load instructions 8681 case X86::VMOVSSrm: 8682 case X86::VMOVSSrm_alt: 8683 case X86::VMOVSDrm: 8684 case X86::VMOVSDrm_alt: 8685 case X86::VMOVAPSrm: 8686 case X86::VMOVUPSrm: 8687 case X86::VMOVAPDrm: 8688 case X86::VMOVUPDrm: 8689 case X86::VMOVDQArm: 8690 case X86::VMOVDQUrm: 8691 case X86::VMOVAPSYrm: 8692 case X86::VMOVUPSYrm: 8693 case X86::VMOVAPDYrm: 8694 case X86::VMOVUPDYrm: 8695 case X86::VMOVDQAYrm: 8696 case X86::VMOVDQUYrm: 8697 // AVX512 load instructions 8698 case X86::VMOVSSZrm: 8699 case X86::VMOVSSZrm_alt: 8700 case X86::VMOVSDZrm: 8701 case X86::VMOVSDZrm_alt: 8702 case X86::VMOVAPSZ128rm: 8703 case X86::VMOVUPSZ128rm: 8704 case X86::VMOVAPSZ128rm_NOVLX: 8705 case X86::VMOVUPSZ128rm_NOVLX: 8706 case X86::VMOVAPDZ128rm: 8707 case X86::VMOVUPDZ128rm: 8708 case X86::VMOVDQU8Z128rm: 8709 case X86::VMOVDQU16Z128rm: 8710 case X86::VMOVDQA32Z128rm: 8711 case X86::VMOVDQU32Z128rm: 8712 case X86::VMOVDQA64Z128rm: 8713 case X86::VMOVDQU64Z128rm: 8714 case X86::VMOVAPSZ256rm: 8715 case X86::VMOVUPSZ256rm: 8716 case X86::VMOVAPSZ256rm_NOVLX: 8717 case X86::VMOVUPSZ256rm_NOVLX: 8718 case X86::VMOVAPDZ256rm: 8719 case X86::VMOVUPDZ256rm: 8720 case X86::VMOVDQU8Z256rm: 8721 case X86::VMOVDQU16Z256rm: 8722 case X86::VMOVDQA32Z256rm: 8723 case X86::VMOVDQU32Z256rm: 8724 case X86::VMOVDQA64Z256rm: 8725 case X86::VMOVDQU64Z256rm: 8726 case X86::VMOVAPSZrm: 8727 case X86::VMOVUPSZrm: 8728 case X86::VMOVAPDZrm: 8729 case X86::VMOVUPDZrm: 8730 case X86::VMOVDQU8Zrm: 8731 case X86::VMOVDQU16Zrm: 8732 case X86::VMOVDQA32Zrm: 8733 case X86::VMOVDQU32Zrm: 8734 case X86::VMOVDQA64Zrm: 8735 case X86::VMOVDQU64Zrm: 8736 case X86::KMOVBkm: 8737 case X86::KMOVBkm_EVEX: 8738 case X86::KMOVWkm: 8739 case X86::KMOVWkm_EVEX: 8740 case X86::KMOVDkm: 8741 case X86::KMOVDkm_EVEX: 8742 case X86::KMOVQkm: 8743 case X86::KMOVQkm_EVEX: 8744 return true; 8745 } 8746 }; 8747 8748 if (!IsLoadOpcode(Load1->getMachineOpcode()) || 8749 !IsLoadOpcode(Load2->getMachineOpcode())) 8750 return false; 8751 8752 // Lambda to check if both the loads have the same value for 
an operand index. 8753 auto HasSameOp = [&](int I) { 8754 return Load1->getOperand(I) == Load2->getOperand(I); 8755 }; 8756 8757 // All operands except the displacement should match. 8758 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) || 8759 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg)) 8760 return false; 8761 8762 // Chain operand must be the same. 8763 if (!HasSameOp(5)) 8764 return false; 8765 8766 // Now let's examine if the displacements are constants. 8767 auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp)); 8768 auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp)); 8769 if (!Disp1 || !Disp2) 8770 return false; 8771 8772 Offset1 = Disp1->getSExtValue(); 8773 Offset2 = Disp2->getSExtValue(); 8774 return true; 8775 } 8776 8777 bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, 8778 int64_t Offset1, int64_t Offset2, 8779 unsigned NumLoads) const { 8780 assert(Offset2 > Offset1); 8781 if ((Offset2 - Offset1) / 8 > 64) 8782 return false; 8783 8784 unsigned Opc1 = Load1->getMachineOpcode(); 8785 unsigned Opc2 = Load2->getMachineOpcode(); 8786 if (Opc1 != Opc2) 8787 return false; // FIXME: overly conservative? 8788 8789 switch (Opc1) { 8790 default: 8791 break; 8792 case X86::LD_Fp32m: 8793 case X86::LD_Fp64m: 8794 case X86::LD_Fp80m: 8795 case X86::MMX_MOVD64rm: 8796 case X86::MMX_MOVQ64rm: 8797 return false; 8798 } 8799 8800 EVT VT = Load1->getValueType(0); 8801 switch (VT.getSimpleVT().SimpleTy) { 8802 default: 8803 // XMM registers. In 64-bit mode we can be a bit more aggressive since we 8804 // have 16 of them to play with. 8805 if (Subtarget.is64Bit()) { 8806 if (NumLoads >= 3) 8807 return false; 8808 } else if (NumLoads) { 8809 return false; 8810 } 8811 break; 8812 case MVT::i8: 8813 case MVT::i16: 8814 case MVT::i32: 8815 case MVT::i64: 8816 case MVT::f32: 8817 case MVT::f64: 8818 if (NumLoads) 8819 return false; 8820 break; 8821 } 8822 8823 return true; 8824 } 8825 8826 bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 8827 const MachineBasicBlock *MBB, 8828 const MachineFunction &MF) const { 8829 8830 // ENDBR instructions should not be scheduled around. 8831 unsigned Opcode = MI.getOpcode(); 8832 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 || 8833 Opcode == X86::PLDTILECFGV) 8834 return true; 8835 8836 // Frame setup and destroy can't be scheduled around. 8837 if (MI.getFlag(MachineInstr::FrameSetup) || 8838 MI.getFlag(MachineInstr::FrameDestroy)) 8839 return true; 8840 8841 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); 8842 } 8843 8844 bool X86InstrInfo::reverseBranchCondition( 8845 SmallVectorImpl<MachineOperand> &Cond) const { 8846 assert(Cond.size() == 1 && "Invalid X86 branch condition!"); 8847 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); 8848 Cond[0].setImm(GetOppositeBranchCondition(CC)); 8849 return false; 8850 } 8851 8852 bool X86InstrInfo::isSafeToMoveRegClassDefs( 8853 const TargetRegisterClass *RC) const { 8854 // FIXME: Return false for x87 stack register classes for now. We can't 8855 // allow any loads of these registers before FpGet_ST0_80. 8856 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass || 8857 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass || 8858 RC == &X86::RFP80RegClass); 8859 } 8860 8861 /// Return a virtual register initialized with the 8862 /// global base register value.
Output instructions required to 8863 /// initialize the register in the function entry block, if necessary. 8864 /// 8865 /// TODO: Eliminate this and move the code to X86MachineFunctionInfo. 8866 /// 8867 unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { 8868 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); 8869 Register GlobalBaseReg = X86FI->getGlobalBaseReg(); 8870 if (GlobalBaseReg != 0) 8871 return GlobalBaseReg; 8872 8873 // Create the register. The code to initialize it is inserted 8874 // later, by the CGBR pass (below). 8875 MachineRegisterInfo &RegInfo = MF->getRegInfo(); 8876 GlobalBaseReg = RegInfo.createVirtualRegister( 8877 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass); 8878 X86FI->setGlobalBaseReg(GlobalBaseReg); 8879 return GlobalBaseReg; 8880 } 8881 8882 // FIXME: Some shuffle and unpack instructions have equivalents in different 8883 // domains, but they require a bit more work than just switching opcodes. 8884 8885 static const uint16_t *lookup(unsigned opcode, unsigned domain, 8886 ArrayRef<uint16_t[3]> Table) { 8887 for (const uint16_t(&Row)[3] : Table) 8888 if (Row[domain - 1] == opcode) 8889 return Row; 8890 return nullptr; 8891 } 8892 8893 static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain, 8894 ArrayRef<uint16_t[4]> Table) { 8895 // If this is the integer domain make sure to check both integer columns. 8896 for (const uint16_t(&Row)[4] : Table) 8897 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode)) 8898 return Row; 8899 return nullptr; 8900 } 8901 8902 // Helper to attempt to widen/narrow blend masks. 8903 static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, 8904 unsigned NewWidth, unsigned *pNewMask = nullptr) { 8905 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) && 8906 "Illegal blend mask scale"); 8907 unsigned NewMask = 0; 8908 8909 if ((OldWidth % NewWidth) == 0) { 8910 unsigned Scale = OldWidth / NewWidth; 8911 unsigned SubMask = (1u << Scale) - 1; 8912 for (unsigned i = 0; i != NewWidth; ++i) { 8913 unsigned Sub = (OldMask >> (i * Scale)) & SubMask; 8914 if (Sub == SubMask) 8915 NewMask |= (1u << i); 8916 else if (Sub != 0x0) 8917 return false; 8918 } 8919 } else { 8920 unsigned Scale = NewWidth / OldWidth; 8921 unsigned SubMask = (1u << Scale) - 1; 8922 for (unsigned i = 0; i != OldWidth; ++i) { 8923 if (OldMask & (1 << i)) { 8924 NewMask |= (SubMask << (i * Scale)); 8925 } 8926 } 8927 } 8928 8929 if (pNewMask) 8930 *pNewMask = NewMask; 8931 return true; 8932 } 8933 8934 uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const { 8935 unsigned Opcode = MI.getOpcode(); 8936 unsigned NumOperands = MI.getDesc().getNumOperands(); 8937 8938 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) { 8939 uint16_t validDomains = 0; 8940 if (MI.getOperand(NumOperands - 1).isImm()) { 8941 unsigned Imm = MI.getOperand(NumOperands - 1).getImm(); 8942 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4)) 8943 validDomains |= 0x2; // PackedSingle 8944 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 
4 : 2)) 8945 validDomains |= 0x4; // PackedDouble 8946 if (!Is256 || Subtarget.hasAVX2()) 8947 validDomains |= 0x8; // PackedInt 8948 } 8949 return validDomains; 8950 }; 8951 8952 switch (Opcode) { 8953 case X86::BLENDPDrmi: 8954 case X86::BLENDPDrri: 8955 case X86::VBLENDPDrmi: 8956 case X86::VBLENDPDrri: 8957 return GetBlendDomains(2, false); 8958 case X86::VBLENDPDYrmi: 8959 case X86::VBLENDPDYrri: 8960 return GetBlendDomains(4, true); 8961 case X86::BLENDPSrmi: 8962 case X86::BLENDPSrri: 8963 case X86::VBLENDPSrmi: 8964 case X86::VBLENDPSrri: 8965 case X86::VPBLENDDrmi: 8966 case X86::VPBLENDDrri: 8967 return GetBlendDomains(4, false); 8968 case X86::VBLENDPSYrmi: 8969 case X86::VBLENDPSYrri: 8970 case X86::VPBLENDDYrmi: 8971 case X86::VPBLENDDYrri: 8972 return GetBlendDomains(8, true); 8973 case X86::PBLENDWrmi: 8974 case X86::PBLENDWrri: 8975 case X86::VPBLENDWrmi: 8976 case X86::VPBLENDWrri: 8977 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks. 8978 case X86::VPBLENDWYrmi: 8979 case X86::VPBLENDWYrri: 8980 return GetBlendDomains(8, false); 8981 case X86::VPANDDZ128rr: 8982 case X86::VPANDDZ128rm: 8983 case X86::VPANDDZ256rr: 8984 case X86::VPANDDZ256rm: 8985 case X86::VPANDQZ128rr: 8986 case X86::VPANDQZ128rm: 8987 case X86::VPANDQZ256rr: 8988 case X86::VPANDQZ256rm: 8989 case X86::VPANDNDZ128rr: 8990 case X86::VPANDNDZ128rm: 8991 case X86::VPANDNDZ256rr: 8992 case X86::VPANDNDZ256rm: 8993 case X86::VPANDNQZ128rr: 8994 case X86::VPANDNQZ128rm: 8995 case X86::VPANDNQZ256rr: 8996 case X86::VPANDNQZ256rm: 8997 case X86::VPORDZ128rr: 8998 case X86::VPORDZ128rm: 8999 case X86::VPORDZ256rr: 9000 case X86::VPORDZ256rm: 9001 case X86::VPORQZ128rr: 9002 case X86::VPORQZ128rm: 9003 case X86::VPORQZ256rr: 9004 case X86::VPORQZ256rm: 9005 case X86::VPXORDZ128rr: 9006 case X86::VPXORDZ128rm: 9007 case X86::VPXORDZ256rr: 9008 case X86::VPXORDZ256rm: 9009 case X86::VPXORQZ128rr: 9010 case X86::VPXORQZ128rm: 9011 case X86::VPXORQZ256rr: 9012 case X86::VPXORQZ256rm: 9013 // If we don't have DQI see if we can still switch from an EVEX integer 9014 // instruction to a VEX floating point instruction. 9015 if (Subtarget.hasDQI()) 9016 return 0; 9017 9018 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16) 9019 return 0; 9020 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16) 9021 return 0; 9022 // Register forms will have 3 operands. Memory form will have more. 9023 if (NumOperands == 3 && 9024 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16) 9025 return 0; 9026 9027 // All domains are valid. 9028 return 0xe; 9029 case X86::MOVHLPSrr: 9030 // We can swap domains when both inputs are the same register. 9031 // FIXME: This doesn't catch all the cases we would like. If the input 9032 // register isn't KILLed by the instruction, the two address instruction 9033 // pass puts a COPY on one input. The other input uses the original 9034 // register. This prevents the same physical register from being used by 9035 // both inputs. 
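    // For example, with identical inputs "movhlps %xmm0, %xmm0" computes the
    // same result as "unpckhpd %xmm0, %xmm0", so both packed FP domains can be
    // reported as valid (0x6) in that case.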
9036 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() && 9037 MI.getOperand(0).getSubReg() == 0 && 9038 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0) 9039 return 0x6; 9040 return 0; 9041 case X86::SHUFPDrri: 9042 return 0x6; 9043 } 9044 return 0; 9045 } 9046 9047 #include "X86ReplaceableInstrs.def" 9048 9049 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, 9050 unsigned Domain) const { 9051 assert(Domain > 0 && Domain < 4 && "Invalid execution domain"); 9052 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; 9053 assert(dom && "Not an SSE instruction"); 9054 9055 unsigned Opcode = MI.getOpcode(); 9056 unsigned NumOperands = MI.getDesc().getNumOperands(); 9057 9058 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) { 9059 if (MI.getOperand(NumOperands - 1).isImm()) { 9060 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255; 9061 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm); 9062 unsigned NewImm = Imm; 9063 9064 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs); 9065 if (!table) 9066 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); 9067 9068 if (Domain == 1) { // PackedSingle 9069 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); 9070 } else if (Domain == 2) { // PackedDouble 9071 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm); 9072 } else if (Domain == 3) { // PackedInt 9073 if (Subtarget.hasAVX2()) { 9074 // If we are already VPBLENDW use that, else use VPBLENDD. 9075 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) { 9076 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); 9077 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); 9078 } 9079 } else { 9080 assert(!Is256 && "128-bit vector expected"); 9081 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm); 9082 } 9083 } 9084 9085 assert(table && table[Domain - 1] && "Unknown domain op"); 9086 MI.setDesc(get(table[Domain - 1])); 9087 MI.getOperand(NumOperands - 1).setImm(NewImm & 255); 9088 } 9089 return true; 9090 }; 9091 9092 switch (Opcode) { 9093 case X86::BLENDPDrmi: 9094 case X86::BLENDPDrri: 9095 case X86::VBLENDPDrmi: 9096 case X86::VBLENDPDrri: 9097 return SetBlendDomain(2, false); 9098 case X86::VBLENDPDYrmi: 9099 case X86::VBLENDPDYrri: 9100 return SetBlendDomain(4, true); 9101 case X86::BLENDPSrmi: 9102 case X86::BLENDPSrri: 9103 case X86::VBLENDPSrmi: 9104 case X86::VBLENDPSrri: 9105 case X86::VPBLENDDrmi: 9106 case X86::VPBLENDDrri: 9107 return SetBlendDomain(4, false); 9108 case X86::VBLENDPSYrmi: 9109 case X86::VBLENDPSYrri: 9110 case X86::VPBLENDDYrmi: 9111 case X86::VPBLENDDYrri: 9112 return SetBlendDomain(8, true); 9113 case X86::PBLENDWrmi: 9114 case X86::PBLENDWrri: 9115 case X86::VPBLENDWrmi: 9116 case X86::VPBLENDWrri: 9117 return SetBlendDomain(8, false); 9118 case X86::VPBLENDWYrmi: 9119 case X86::VPBLENDWYrri: 9120 return SetBlendDomain(16, true); 9121 case X86::VPANDDZ128rr: 9122 case X86::VPANDDZ128rm: 9123 case X86::VPANDDZ256rr: 9124 case X86::VPANDDZ256rm: 9125 case X86::VPANDQZ128rr: 9126 case X86::VPANDQZ128rm: 9127 case X86::VPANDQZ256rr: 9128 case X86::VPANDQZ256rm: 9129 case X86::VPANDNDZ128rr: 9130 case X86::VPANDNDZ128rm: 9131 case X86::VPANDNDZ256rr: 9132 case X86::VPANDNDZ256rm: 9133 case X86::VPANDNQZ128rr: 9134 case X86::VPANDNQZ128rm: 9135 case X86::VPANDNQZ256rr: 9136 case X86::VPANDNQZ256rm: 9137 case X86::VPORDZ128rr: 9138 case X86::VPORDZ128rm: 9139 case X86::VPORDZ256rr: 9140 case X86::VPORDZ256rm: 9141 case X86::VPORQZ128rr: 9142 case X86::VPORQZ128rm: 9143 case 
X86::VPORQZ256rr: 9144 case X86::VPORQZ256rm: 9145 case X86::VPXORDZ128rr: 9146 case X86::VPXORDZ128rm: 9147 case X86::VPXORDZ256rr: 9148 case X86::VPXORDZ256rm: 9149 case X86::VPXORQZ128rr: 9150 case X86::VPXORQZ128rm: 9151 case X86::VPXORQZ256rr: 9152 case X86::VPXORQZ256rm: { 9153 // Without DQI, convert EVEX instructions to VEX instructions. 9154 if (Subtarget.hasDQI()) 9155 return false; 9156 9157 const uint16_t *table = 9158 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs); 9159 assert(table && "Instruction not found in table?"); 9160 // Don't change integer Q instructions to D instructions and 9161 // use D instructions if we started with a PS instruction. 9162 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) 9163 Domain = 4; 9164 MI.setDesc(get(table[Domain - 1])); 9165 return true; 9166 } 9167 case X86::UNPCKHPDrr: 9168 case X86::MOVHLPSrr: 9169 // We just need to commute the instruction which will switch the domains. 9170 if (Domain != dom && Domain != 3 && 9171 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() && 9172 MI.getOperand(0).getSubReg() == 0 && 9173 MI.getOperand(1).getSubReg() == 0 && 9174 MI.getOperand(2).getSubReg() == 0) { 9175 commuteInstruction(MI, false); 9176 return true; 9177 } 9178 // We must always return true for MOVHLPSrr. 9179 if (Opcode == X86::MOVHLPSrr) 9180 return true; 9181 break; 9182 case X86::SHUFPDrri: { 9183 if (Domain == 1) { 9184 unsigned Imm = MI.getOperand(3).getImm(); 9185 unsigned NewImm = 0x44; 9186 if (Imm & 1) 9187 NewImm |= 0x0a; 9188 if (Imm & 2) 9189 NewImm |= 0xa0; 9190 MI.getOperand(3).setImm(NewImm); 9191 MI.setDesc(get(X86::SHUFPSrri)); 9192 } 9193 return true; 9194 } 9195 } 9196 return false; 9197 } 9198 9199 std::pair<uint16_t, uint16_t> 9200 X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { 9201 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; 9202 unsigned opcode = MI.getOpcode(); 9203 uint16_t validDomains = 0; 9204 if (domain) { 9205 // Attempt to match for custom instructions. 9206 validDomains = getExecutionDomainCustom(MI); 9207 if (validDomains) 9208 return std::make_pair(domain, validDomains); 9209 9210 if (lookup(opcode, domain, ReplaceableInstrs)) { 9211 validDomains = 0xe; 9212 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { 9213 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; 9214 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) { 9215 validDomains = 0x6; 9216 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { 9217 // Insert/extract instructions should only affect the domain if AVX2 9218 // is enabled.
9219 if (!Subtarget.hasAVX2()) 9220 return std::make_pair(0, 0); 9221 validDomains = 0xe; 9222 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) { 9223 validDomains = 0xe; 9224 } else if (Subtarget.hasDQI() && 9225 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) { 9226 validDomains = 0xe; 9227 } else if (Subtarget.hasDQI()) { 9228 if (const uint16_t *table = 9229 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) { 9230 if (domain == 1 || (domain == 3 && table[3] == opcode)) 9231 validDomains = 0xa; 9232 else 9233 validDomains = 0xc; 9234 } 9235 } 9236 } 9237 return std::make_pair(domain, validDomains); 9238 } 9239 9240 void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { 9241 assert(Domain > 0 && Domain < 4 && "Invalid execution domain"); 9242 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; 9243 assert(dom && "Not an SSE instruction"); 9244 9245 // Attempt to match for custom instructions. 9246 if (setExecutionDomainCustom(MI, Domain)) 9247 return; 9248 9249 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs); 9250 if (!table) { // try the other table 9251 assert((Subtarget.hasAVX2() || Domain < 3) && 9252 "256-bit vector operations only available in AVX2"); 9253 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); 9254 } 9255 if (!table) { // try the FP table 9256 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP); 9257 assert((!table || Domain < 3) && 9258 "Can only select PackedSingle or PackedDouble"); 9259 } 9260 if (!table) { // try the other table 9261 assert(Subtarget.hasAVX2() && 9262 "256-bit insert/extract only available in AVX2"); 9263 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract); 9264 } 9265 if (!table) { // try the AVX512 table 9266 assert(Subtarget.hasAVX512() && "Requires AVX-512"); 9267 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512); 9268 // Don't change integer Q instructions to D instructions. 9269 if (table && Domain == 3 && table[3] == MI.getOpcode()) 9270 Domain = 4; 9271 } 9272 if (!table) { // try the AVX512DQ table 9273 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); 9274 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ); 9275 // Don't change integer Q instructions to D instructions and 9276 // use D instructions if we started with a PS instruction. 9277 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) 9278 Domain = 4; 9279 } 9280 if (!table) { // try the AVX512DQMasked table 9281 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); 9282 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked); 9283 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) 9284 Domain = 4; 9285 } 9286 assert(table && "Cannot change domain"); 9287 MI.setDesc(get(table[Domain - 1])); 9288 } 9289 9290 void X86InstrInfo::insertNoop(MachineBasicBlock &MBB, 9291 MachineBasicBlock::iterator MI) const { 9292 DebugLoc DL; 9293 BuildMI(MBB, MI, DL, get(X86::NOOP)); 9294 } 9295 9296 /// Return the noop instruction to use for a noop. 
9297 MCInst X86InstrInfo::getNop() const { 9298 MCInst Nop; 9299 Nop.setOpcode(X86::NOOP); 9300 return Nop; 9301 } 9302 9303 bool X86InstrInfo::isHighLatencyDef(int opc) const { 9304 switch (opc) { 9305 default: 9306 return false; 9307 case X86::DIVPDrm: 9308 case X86::DIVPDrr: 9309 case X86::DIVPSrm: 9310 case X86::DIVPSrr: 9311 case X86::DIVSDrm: 9312 case X86::DIVSDrm_Int: 9313 case X86::DIVSDrr: 9314 case X86::DIVSDrr_Int: 9315 case X86::DIVSSrm: 9316 case X86::DIVSSrm_Int: 9317 case X86::DIVSSrr: 9318 case X86::DIVSSrr_Int: 9319 case X86::SQRTPDm: 9320 case X86::SQRTPDr: 9321 case X86::SQRTPSm: 9322 case X86::SQRTPSr: 9323 case X86::SQRTSDm: 9324 case X86::SQRTSDm_Int: 9325 case X86::SQRTSDr: 9326 case X86::SQRTSDr_Int: 9327 case X86::SQRTSSm: 9328 case X86::SQRTSSm_Int: 9329 case X86::SQRTSSr: 9330 case X86::SQRTSSr_Int: 9331 // AVX instructions with high latency 9332 case X86::VDIVPDrm: 9333 case X86::VDIVPDrr: 9334 case X86::VDIVPDYrm: 9335 case X86::VDIVPDYrr: 9336 case X86::VDIVPSrm: 9337 case X86::VDIVPSrr: 9338 case X86::VDIVPSYrm: 9339 case X86::VDIVPSYrr: 9340 case X86::VDIVSDrm: 9341 case X86::VDIVSDrm_Int: 9342 case X86::VDIVSDrr: 9343 case X86::VDIVSDrr_Int: 9344 case X86::VDIVSSrm: 9345 case X86::VDIVSSrm_Int: 9346 case X86::VDIVSSrr: 9347 case X86::VDIVSSrr_Int: 9348 case X86::VSQRTPDm: 9349 case X86::VSQRTPDr: 9350 case X86::VSQRTPDYm: 9351 case X86::VSQRTPDYr: 9352 case X86::VSQRTPSm: 9353 case X86::VSQRTPSr: 9354 case X86::VSQRTPSYm: 9355 case X86::VSQRTPSYr: 9356 case X86::VSQRTSDm: 9357 case X86::VSQRTSDm_Int: 9358 case X86::VSQRTSDr: 9359 case X86::VSQRTSDr_Int: 9360 case X86::VSQRTSSm: 9361 case X86::VSQRTSSm_Int: 9362 case X86::VSQRTSSr: 9363 case X86::VSQRTSSr_Int: 9364 // AVX512 instructions with high latency 9365 case X86::VDIVPDZ128rm: 9366 case X86::VDIVPDZ128rmb: 9367 case X86::VDIVPDZ128rmbk: 9368 case X86::VDIVPDZ128rmbkz: 9369 case X86::VDIVPDZ128rmk: 9370 case X86::VDIVPDZ128rmkz: 9371 case X86::VDIVPDZ128rr: 9372 case X86::VDIVPDZ128rrk: 9373 case X86::VDIVPDZ128rrkz: 9374 case X86::VDIVPDZ256rm: 9375 case X86::VDIVPDZ256rmb: 9376 case X86::VDIVPDZ256rmbk: 9377 case X86::VDIVPDZ256rmbkz: 9378 case X86::VDIVPDZ256rmk: 9379 case X86::VDIVPDZ256rmkz: 9380 case X86::VDIVPDZ256rr: 9381 case X86::VDIVPDZ256rrk: 9382 case X86::VDIVPDZ256rrkz: 9383 case X86::VDIVPDZrrb: 9384 case X86::VDIVPDZrrbk: 9385 case X86::VDIVPDZrrbkz: 9386 case X86::VDIVPDZrm: 9387 case X86::VDIVPDZrmb: 9388 case X86::VDIVPDZrmbk: 9389 case X86::VDIVPDZrmbkz: 9390 case X86::VDIVPDZrmk: 9391 case X86::VDIVPDZrmkz: 9392 case X86::VDIVPDZrr: 9393 case X86::VDIVPDZrrk: 9394 case X86::VDIVPDZrrkz: 9395 case X86::VDIVPSZ128rm: 9396 case X86::VDIVPSZ128rmb: 9397 case X86::VDIVPSZ128rmbk: 9398 case X86::VDIVPSZ128rmbkz: 9399 case X86::VDIVPSZ128rmk: 9400 case X86::VDIVPSZ128rmkz: 9401 case X86::VDIVPSZ128rr: 9402 case X86::VDIVPSZ128rrk: 9403 case X86::VDIVPSZ128rrkz: 9404 case X86::VDIVPSZ256rm: 9405 case X86::VDIVPSZ256rmb: 9406 case X86::VDIVPSZ256rmbk: 9407 case X86::VDIVPSZ256rmbkz: 9408 case X86::VDIVPSZ256rmk: 9409 case X86::VDIVPSZ256rmkz: 9410 case X86::VDIVPSZ256rr: 9411 case X86::VDIVPSZ256rrk: 9412 case X86::VDIVPSZ256rrkz: 9413 case X86::VDIVPSZrrb: 9414 case X86::VDIVPSZrrbk: 9415 case X86::VDIVPSZrrbkz: 9416 case X86::VDIVPSZrm: 9417 case X86::VDIVPSZrmb: 9418 case X86::VDIVPSZrmbk: 9419 case X86::VDIVPSZrmbkz: 9420 case X86::VDIVPSZrmk: 9421 case X86::VDIVPSZrmkz: 9422 case X86::VDIVPSZrr: 9423 case X86::VDIVPSZrrk: 9424 case X86::VDIVPSZrrkz: 9425 case X86::VDIVSDZrm: 9426 
case X86::VDIVSDZrr: 9427 case X86::VDIVSDZrm_Int: 9428 case X86::VDIVSDZrm_Intk: 9429 case X86::VDIVSDZrm_Intkz: 9430 case X86::VDIVSDZrr_Int: 9431 case X86::VDIVSDZrr_Intk: 9432 case X86::VDIVSDZrr_Intkz: 9433 case X86::VDIVSDZrrb_Int: 9434 case X86::VDIVSDZrrb_Intk: 9435 case X86::VDIVSDZrrb_Intkz: 9436 case X86::VDIVSSZrm: 9437 case X86::VDIVSSZrr: 9438 case X86::VDIVSSZrm_Int: 9439 case X86::VDIVSSZrm_Intk: 9440 case X86::VDIVSSZrm_Intkz: 9441 case X86::VDIVSSZrr_Int: 9442 case X86::VDIVSSZrr_Intk: 9443 case X86::VDIVSSZrr_Intkz: 9444 case X86::VDIVSSZrrb_Int: 9445 case X86::VDIVSSZrrb_Intk: 9446 case X86::VDIVSSZrrb_Intkz: 9447 case X86::VSQRTPDZ128m: 9448 case X86::VSQRTPDZ128mb: 9449 case X86::VSQRTPDZ128mbk: 9450 case X86::VSQRTPDZ128mbkz: 9451 case X86::VSQRTPDZ128mk: 9452 case X86::VSQRTPDZ128mkz: 9453 case X86::VSQRTPDZ128r: 9454 case X86::VSQRTPDZ128rk: 9455 case X86::VSQRTPDZ128rkz: 9456 case X86::VSQRTPDZ256m: 9457 case X86::VSQRTPDZ256mb: 9458 case X86::VSQRTPDZ256mbk: 9459 case X86::VSQRTPDZ256mbkz: 9460 case X86::VSQRTPDZ256mk: 9461 case X86::VSQRTPDZ256mkz: 9462 case X86::VSQRTPDZ256r: 9463 case X86::VSQRTPDZ256rk: 9464 case X86::VSQRTPDZ256rkz: 9465 case X86::VSQRTPDZm: 9466 case X86::VSQRTPDZmb: 9467 case X86::VSQRTPDZmbk: 9468 case X86::VSQRTPDZmbkz: 9469 case X86::VSQRTPDZmk: 9470 case X86::VSQRTPDZmkz: 9471 case X86::VSQRTPDZr: 9472 case X86::VSQRTPDZrb: 9473 case X86::VSQRTPDZrbk: 9474 case X86::VSQRTPDZrbkz: 9475 case X86::VSQRTPDZrk: 9476 case X86::VSQRTPDZrkz: 9477 case X86::VSQRTPSZ128m: 9478 case X86::VSQRTPSZ128mb: 9479 case X86::VSQRTPSZ128mbk: 9480 case X86::VSQRTPSZ128mbkz: 9481 case X86::VSQRTPSZ128mk: 9482 case X86::VSQRTPSZ128mkz: 9483 case X86::VSQRTPSZ128r: 9484 case X86::VSQRTPSZ128rk: 9485 case X86::VSQRTPSZ128rkz: 9486 case X86::VSQRTPSZ256m: 9487 case X86::VSQRTPSZ256mb: 9488 case X86::VSQRTPSZ256mbk: 9489 case X86::VSQRTPSZ256mbkz: 9490 case X86::VSQRTPSZ256mk: 9491 case X86::VSQRTPSZ256mkz: 9492 case X86::VSQRTPSZ256r: 9493 case X86::VSQRTPSZ256rk: 9494 case X86::VSQRTPSZ256rkz: 9495 case X86::VSQRTPSZm: 9496 case X86::VSQRTPSZmb: 9497 case X86::VSQRTPSZmbk: 9498 case X86::VSQRTPSZmbkz: 9499 case X86::VSQRTPSZmk: 9500 case X86::VSQRTPSZmkz: 9501 case X86::VSQRTPSZr: 9502 case X86::VSQRTPSZrb: 9503 case X86::VSQRTPSZrbk: 9504 case X86::VSQRTPSZrbkz: 9505 case X86::VSQRTPSZrk: 9506 case X86::VSQRTPSZrkz: 9507 case X86::VSQRTSDZm: 9508 case X86::VSQRTSDZm_Int: 9509 case X86::VSQRTSDZm_Intk: 9510 case X86::VSQRTSDZm_Intkz: 9511 case X86::VSQRTSDZr: 9512 case X86::VSQRTSDZr_Int: 9513 case X86::VSQRTSDZr_Intk: 9514 case X86::VSQRTSDZr_Intkz: 9515 case X86::VSQRTSDZrb_Int: 9516 case X86::VSQRTSDZrb_Intk: 9517 case X86::VSQRTSDZrb_Intkz: 9518 case X86::VSQRTSSZm: 9519 case X86::VSQRTSSZm_Int: 9520 case X86::VSQRTSSZm_Intk: 9521 case X86::VSQRTSSZm_Intkz: 9522 case X86::VSQRTSSZr: 9523 case X86::VSQRTSSZr_Int: 9524 case X86::VSQRTSSZr_Intk: 9525 case X86::VSQRTSSZr_Intkz: 9526 case X86::VSQRTSSZrb_Int: 9527 case X86::VSQRTSSZrb_Intk: 9528 case X86::VSQRTSSZrb_Intkz: 9529 9530 case X86::VGATHERDPDYrm: 9531 case X86::VGATHERDPDZ128rm: 9532 case X86::VGATHERDPDZ256rm: 9533 case X86::VGATHERDPDZrm: 9534 case X86::VGATHERDPDrm: 9535 case X86::VGATHERDPSYrm: 9536 case X86::VGATHERDPSZ128rm: 9537 case X86::VGATHERDPSZ256rm: 9538 case X86::VGATHERDPSZrm: 9539 case X86::VGATHERDPSrm: 9540 case X86::VGATHERPF0DPDm: 9541 case X86::VGATHERPF0DPSm: 9542 case X86::VGATHERPF0QPDm: 9543 case X86::VGATHERPF0QPSm: 9544 case X86::VGATHERPF1DPDm: 9545 case 
X86::VGATHERPF1DPSm: 9546 case X86::VGATHERPF1QPDm: 9547 case X86::VGATHERPF1QPSm: 9548 case X86::VGATHERQPDYrm: 9549 case X86::VGATHERQPDZ128rm: 9550 case X86::VGATHERQPDZ256rm: 9551 case X86::VGATHERQPDZrm: 9552 case X86::VGATHERQPDrm: 9553 case X86::VGATHERQPSYrm: 9554 case X86::VGATHERQPSZ128rm: 9555 case X86::VGATHERQPSZ256rm: 9556 case X86::VGATHERQPSZrm: 9557 case X86::VGATHERQPSrm: 9558 case X86::VPGATHERDDYrm: 9559 case X86::VPGATHERDDZ128rm: 9560 case X86::VPGATHERDDZ256rm: 9561 case X86::VPGATHERDDZrm: 9562 case X86::VPGATHERDDrm: 9563 case X86::VPGATHERDQYrm: 9564 case X86::VPGATHERDQZ128rm: 9565 case X86::VPGATHERDQZ256rm: 9566 case X86::VPGATHERDQZrm: 9567 case X86::VPGATHERDQrm: 9568 case X86::VPGATHERQDYrm: 9569 case X86::VPGATHERQDZ128rm: 9570 case X86::VPGATHERQDZ256rm: 9571 case X86::VPGATHERQDZrm: 9572 case X86::VPGATHERQDrm: 9573 case X86::VPGATHERQQYrm: 9574 case X86::VPGATHERQQZ128rm: 9575 case X86::VPGATHERQQZ256rm: 9576 case X86::VPGATHERQQZrm: 9577 case X86::VPGATHERQQrm: 9578 case X86::VSCATTERDPDZ128mr: 9579 case X86::VSCATTERDPDZ256mr: 9580 case X86::VSCATTERDPDZmr: 9581 case X86::VSCATTERDPSZ128mr: 9582 case X86::VSCATTERDPSZ256mr: 9583 case X86::VSCATTERDPSZmr: 9584 case X86::VSCATTERPF0DPDm: 9585 case X86::VSCATTERPF0DPSm: 9586 case X86::VSCATTERPF0QPDm: 9587 case X86::VSCATTERPF0QPSm: 9588 case X86::VSCATTERPF1DPDm: 9589 case X86::VSCATTERPF1DPSm: 9590 case X86::VSCATTERPF1QPDm: 9591 case X86::VSCATTERPF1QPSm: 9592 case X86::VSCATTERQPDZ128mr: 9593 case X86::VSCATTERQPDZ256mr: 9594 case X86::VSCATTERQPDZmr: 9595 case X86::VSCATTERQPSZ128mr: 9596 case X86::VSCATTERQPSZ256mr: 9597 case X86::VSCATTERQPSZmr: 9598 case X86::VPSCATTERDDZ128mr: 9599 case X86::VPSCATTERDDZ256mr: 9600 case X86::VPSCATTERDDZmr: 9601 case X86::VPSCATTERDQZ128mr: 9602 case X86::VPSCATTERDQZ256mr: 9603 case X86::VPSCATTERDQZmr: 9604 case X86::VPSCATTERQDZ128mr: 9605 case X86::VPSCATTERQDZ256mr: 9606 case X86::VPSCATTERQDZmr: 9607 case X86::VPSCATTERQQZ128mr: 9608 case X86::VPSCATTERQQZ256mr: 9609 case X86::VPSCATTERQQZmr: 9610 return true; 9611 } 9612 } 9613 9614 bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel, 9615 const MachineRegisterInfo *MRI, 9616 const MachineInstr &DefMI, 9617 unsigned DefIdx, 9618 const MachineInstr &UseMI, 9619 unsigned UseIdx) const { 9620 return isHighLatencyDef(DefMI.getOpcode()); 9621 } 9622 9623 bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, 9624 const MachineBasicBlock *MBB) const { 9625 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 && 9626 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators"); 9627 9628 // Integer binary math/logic instructions have a third source operand: 9629 // the EFLAGS register. That operand must be both defined here and never 9630 // used; ie, it must be dead. If the EFLAGS operand is live, then we can 9631 // not change anything because rearranging the operands could affect other 9632 // instructions that depend on the exact status flags (zero, sign, etc.) 9633 // that are set by using these particular operands with this operation. 9634 const MachineOperand *FlagDef = 9635 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr); 9636 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?"); 9637 if (FlagDef && !FlagDef->isDead()) 9638 return false; 9639 9640 return TargetInstrInfo::hasReassociableOperands(Inst, MBB); 9641 } 9642 9643 // TODO: There are many more machine instruction opcodes to match: 9644 // 1. 
Other data types (integer, vectors) 9645 // 2. Other math / logic operations (xor, or) 9646 // 3. Other forms of the same operation (intrinsics and other variants) 9647 bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 9648 bool Invert) const { 9649 if (Invert) 9650 return false; 9651 switch (Inst.getOpcode()) { 9652 CASE_ND(ADD8rr) 9653 CASE_ND(ADD16rr) 9654 CASE_ND(ADD32rr) 9655 CASE_ND(ADD64rr) 9656 CASE_ND(AND8rr) 9657 CASE_ND(AND16rr) 9658 CASE_ND(AND32rr) 9659 CASE_ND(AND64rr) 9660 CASE_ND(OR8rr) 9661 CASE_ND(OR16rr) 9662 CASE_ND(OR32rr) 9663 CASE_ND(OR64rr) 9664 CASE_ND(XOR8rr) 9665 CASE_ND(XOR16rr) 9666 CASE_ND(XOR32rr) 9667 CASE_ND(XOR64rr) 9668 CASE_ND(IMUL16rr) 9669 CASE_ND(IMUL32rr) 9670 CASE_ND(IMUL64rr) 9671 case X86::PANDrr: 9672 case X86::PORrr: 9673 case X86::PXORrr: 9674 case X86::ANDPDrr: 9675 case X86::ANDPSrr: 9676 case X86::ORPDrr: 9677 case X86::ORPSrr: 9678 case X86::XORPDrr: 9679 case X86::XORPSrr: 9680 case X86::PADDBrr: 9681 case X86::PADDWrr: 9682 case X86::PADDDrr: 9683 case X86::PADDQrr: 9684 case X86::PMULLWrr: 9685 case X86::PMULLDrr: 9686 case X86::PMAXSBrr: 9687 case X86::PMAXSDrr: 9688 case X86::PMAXSWrr: 9689 case X86::PMAXUBrr: 9690 case X86::PMAXUDrr: 9691 case X86::PMAXUWrr: 9692 case X86::PMINSBrr: 9693 case X86::PMINSDrr: 9694 case X86::PMINSWrr: 9695 case X86::PMINUBrr: 9696 case X86::PMINUDrr: 9697 case X86::PMINUWrr: 9698 case X86::VPANDrr: 9699 case X86::VPANDYrr: 9700 case X86::VPANDDZ128rr: 9701 case X86::VPANDDZ256rr: 9702 case X86::VPANDDZrr: 9703 case X86::VPANDQZ128rr: 9704 case X86::VPANDQZ256rr: 9705 case X86::VPANDQZrr: 9706 case X86::VPORrr: 9707 case X86::VPORYrr: 9708 case X86::VPORDZ128rr: 9709 case X86::VPORDZ256rr: 9710 case X86::VPORDZrr: 9711 case X86::VPORQZ128rr: 9712 case X86::VPORQZ256rr: 9713 case X86::VPORQZrr: 9714 case X86::VPXORrr: 9715 case X86::VPXORYrr: 9716 case X86::VPXORDZ128rr: 9717 case X86::VPXORDZ256rr: 9718 case X86::VPXORDZrr: 9719 case X86::VPXORQZ128rr: 9720 case X86::VPXORQZ256rr: 9721 case X86::VPXORQZrr: 9722 case X86::VANDPDrr: 9723 case X86::VANDPSrr: 9724 case X86::VANDPDYrr: 9725 case X86::VANDPSYrr: 9726 case X86::VANDPDZ128rr: 9727 case X86::VANDPSZ128rr: 9728 case X86::VANDPDZ256rr: 9729 case X86::VANDPSZ256rr: 9730 case X86::VANDPDZrr: 9731 case X86::VANDPSZrr: 9732 case X86::VORPDrr: 9733 case X86::VORPSrr: 9734 case X86::VORPDYrr: 9735 case X86::VORPSYrr: 9736 case X86::VORPDZ128rr: 9737 case X86::VORPSZ128rr: 9738 case X86::VORPDZ256rr: 9739 case X86::VORPSZ256rr: 9740 case X86::VORPDZrr: 9741 case X86::VORPSZrr: 9742 case X86::VXORPDrr: 9743 case X86::VXORPSrr: 9744 case X86::VXORPDYrr: 9745 case X86::VXORPSYrr: 9746 case X86::VXORPDZ128rr: 9747 case X86::VXORPSZ128rr: 9748 case X86::VXORPDZ256rr: 9749 case X86::VXORPSZ256rr: 9750 case X86::VXORPDZrr: 9751 case X86::VXORPSZrr: 9752 case X86::KADDBrr: 9753 case X86::KADDWrr: 9754 case X86::KADDDrr: 9755 case X86::KADDQrr: 9756 case X86::KANDBrr: 9757 case X86::KANDWrr: 9758 case X86::KANDDrr: 9759 case X86::KANDQrr: 9760 case X86::KORBrr: 9761 case X86::KORWrr: 9762 case X86::KORDrr: 9763 case X86::KORQrr: 9764 case X86::KXORBrr: 9765 case X86::KXORWrr: 9766 case X86::KXORDrr: 9767 case X86::KXORQrr: 9768 case X86::VPADDBrr: 9769 case X86::VPADDWrr: 9770 case X86::VPADDDrr: 9771 case X86::VPADDQrr: 9772 case X86::VPADDBYrr: 9773 case X86::VPADDWYrr: 9774 case X86::VPADDDYrr: 9775 case X86::VPADDQYrr: 9776 case X86::VPADDBZ128rr: 9777 case X86::VPADDWZ128rr: 9778 case X86::VPADDDZ128rr: 9779 case X86::VPADDQZ128rr: 9780 
case X86::VPADDBZ256rr: 9781 case X86::VPADDWZ256rr: 9782 case X86::VPADDDZ256rr: 9783 case X86::VPADDQZ256rr: 9784 case X86::VPADDBZrr: 9785 case X86::VPADDWZrr: 9786 case X86::VPADDDZrr: 9787 case X86::VPADDQZrr: 9788 case X86::VPMULLWrr: 9789 case X86::VPMULLWYrr: 9790 case X86::VPMULLWZ128rr: 9791 case X86::VPMULLWZ256rr: 9792 case X86::VPMULLWZrr: 9793 case X86::VPMULLDrr: 9794 case X86::VPMULLDYrr: 9795 case X86::VPMULLDZ128rr: 9796 case X86::VPMULLDZ256rr: 9797 case X86::VPMULLDZrr: 9798 case X86::VPMULLQZ128rr: 9799 case X86::VPMULLQZ256rr: 9800 case X86::VPMULLQZrr: 9801 case X86::VPMAXSBrr: 9802 case X86::VPMAXSBYrr: 9803 case X86::VPMAXSBZ128rr: 9804 case X86::VPMAXSBZ256rr: 9805 case X86::VPMAXSBZrr: 9806 case X86::VPMAXSDrr: 9807 case X86::VPMAXSDYrr: 9808 case X86::VPMAXSDZ128rr: 9809 case X86::VPMAXSDZ256rr: 9810 case X86::VPMAXSDZrr: 9811 case X86::VPMAXSQZ128rr: 9812 case X86::VPMAXSQZ256rr: 9813 case X86::VPMAXSQZrr: 9814 case X86::VPMAXSWrr: 9815 case X86::VPMAXSWYrr: 9816 case X86::VPMAXSWZ128rr: 9817 case X86::VPMAXSWZ256rr: 9818 case X86::VPMAXSWZrr: 9819 case X86::VPMAXUBrr: 9820 case X86::VPMAXUBYrr: 9821 case X86::VPMAXUBZ128rr: 9822 case X86::VPMAXUBZ256rr: 9823 case X86::VPMAXUBZrr: 9824 case X86::VPMAXUDrr: 9825 case X86::VPMAXUDYrr: 9826 case X86::VPMAXUDZ128rr: 9827 case X86::VPMAXUDZ256rr: 9828 case X86::VPMAXUDZrr: 9829 case X86::VPMAXUQZ128rr: 9830 case X86::VPMAXUQZ256rr: 9831 case X86::VPMAXUQZrr: 9832 case X86::VPMAXUWrr: 9833 case X86::VPMAXUWYrr: 9834 case X86::VPMAXUWZ128rr: 9835 case X86::VPMAXUWZ256rr: 9836 case X86::VPMAXUWZrr: 9837 case X86::VPMINSBrr: 9838 case X86::VPMINSBYrr: 9839 case X86::VPMINSBZ128rr: 9840 case X86::VPMINSBZ256rr: 9841 case X86::VPMINSBZrr: 9842 case X86::VPMINSDrr: 9843 case X86::VPMINSDYrr: 9844 case X86::VPMINSDZ128rr: 9845 case X86::VPMINSDZ256rr: 9846 case X86::VPMINSDZrr: 9847 case X86::VPMINSQZ128rr: 9848 case X86::VPMINSQZ256rr: 9849 case X86::VPMINSQZrr: 9850 case X86::VPMINSWrr: 9851 case X86::VPMINSWYrr: 9852 case X86::VPMINSWZ128rr: 9853 case X86::VPMINSWZ256rr: 9854 case X86::VPMINSWZrr: 9855 case X86::VPMINUBrr: 9856 case X86::VPMINUBYrr: 9857 case X86::VPMINUBZ128rr: 9858 case X86::VPMINUBZ256rr: 9859 case X86::VPMINUBZrr: 9860 case X86::VPMINUDrr: 9861 case X86::VPMINUDYrr: 9862 case X86::VPMINUDZ128rr: 9863 case X86::VPMINUDZ256rr: 9864 case X86::VPMINUDZrr: 9865 case X86::VPMINUQZ128rr: 9866 case X86::VPMINUQZ256rr: 9867 case X86::VPMINUQZrr: 9868 case X86::VPMINUWrr: 9869 case X86::VPMINUWYrr: 9870 case X86::VPMINUWZ128rr: 9871 case X86::VPMINUWZ256rr: 9872 case X86::VPMINUWZrr: 9873 // Normal min/max instructions are not commutative because of NaN and signed 9874 // zero semantics, but these are. Thus, there's no need to check for global 9875 // relaxed math; the instructions themselves have the properties we need. 
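  // (For example, "maxps a, b" returns the second source operand when either
  // input is NaN or when +0.0 is compared with -0.0, so operand order matters
  // for the plain opcodes; the MAXC*/MINC* forms below are the
  // order-insensitive variants.)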
9876 case X86::MAXCPDrr: 9877 case X86::MAXCPSrr: 9878 case X86::MAXCSDrr: 9879 case X86::MAXCSSrr: 9880 case X86::MINCPDrr: 9881 case X86::MINCPSrr: 9882 case X86::MINCSDrr: 9883 case X86::MINCSSrr: 9884 case X86::VMAXCPDrr: 9885 case X86::VMAXCPSrr: 9886 case X86::VMAXCPDYrr: 9887 case X86::VMAXCPSYrr: 9888 case X86::VMAXCPDZ128rr: 9889 case X86::VMAXCPSZ128rr: 9890 case X86::VMAXCPDZ256rr: 9891 case X86::VMAXCPSZ256rr: 9892 case X86::VMAXCPDZrr: 9893 case X86::VMAXCPSZrr: 9894 case X86::VMAXCSDrr: 9895 case X86::VMAXCSSrr: 9896 case X86::VMAXCSDZrr: 9897 case X86::VMAXCSSZrr: 9898 case X86::VMINCPDrr: 9899 case X86::VMINCPSrr: 9900 case X86::VMINCPDYrr: 9901 case X86::VMINCPSYrr: 9902 case X86::VMINCPDZ128rr: 9903 case X86::VMINCPSZ128rr: 9904 case X86::VMINCPDZ256rr: 9905 case X86::VMINCPSZ256rr: 9906 case X86::VMINCPDZrr: 9907 case X86::VMINCPSZrr: 9908 case X86::VMINCSDrr: 9909 case X86::VMINCSSrr: 9910 case X86::VMINCSDZrr: 9911 case X86::VMINCSSZrr: 9912 case X86::VMAXCPHZ128rr: 9913 case X86::VMAXCPHZ256rr: 9914 case X86::VMAXCPHZrr: 9915 case X86::VMAXCSHZrr: 9916 case X86::VMINCPHZ128rr: 9917 case X86::VMINCPHZ256rr: 9918 case X86::VMINCPHZrr: 9919 case X86::VMINCSHZrr: 9920 return true; 9921 case X86::ADDPDrr: 9922 case X86::ADDPSrr: 9923 case X86::ADDSDrr: 9924 case X86::ADDSSrr: 9925 case X86::MULPDrr: 9926 case X86::MULPSrr: 9927 case X86::MULSDrr: 9928 case X86::MULSSrr: 9929 case X86::VADDPDrr: 9930 case X86::VADDPSrr: 9931 case X86::VADDPDYrr: 9932 case X86::VADDPSYrr: 9933 case X86::VADDPDZ128rr: 9934 case X86::VADDPSZ128rr: 9935 case X86::VADDPDZ256rr: 9936 case X86::VADDPSZ256rr: 9937 case X86::VADDPDZrr: 9938 case X86::VADDPSZrr: 9939 case X86::VADDSDrr: 9940 case X86::VADDSSrr: 9941 case X86::VADDSDZrr: 9942 case X86::VADDSSZrr: 9943 case X86::VMULPDrr: 9944 case X86::VMULPSrr: 9945 case X86::VMULPDYrr: 9946 case X86::VMULPSYrr: 9947 case X86::VMULPDZ128rr: 9948 case X86::VMULPSZ128rr: 9949 case X86::VMULPDZ256rr: 9950 case X86::VMULPSZ256rr: 9951 case X86::VMULPDZrr: 9952 case X86::VMULPSZrr: 9953 case X86::VMULSDrr: 9954 case X86::VMULSSrr: 9955 case X86::VMULSDZrr: 9956 case X86::VMULSSZrr: 9957 case X86::VADDPHZ128rr: 9958 case X86::VADDPHZ256rr: 9959 case X86::VADDPHZrr: 9960 case X86::VADDSHZrr: 9961 case X86::VMULPHZ128rr: 9962 case X86::VMULPHZ256rr: 9963 case X86::VMULPHZrr: 9964 case X86::VMULSHZrr: 9965 return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 9966 Inst.getFlag(MachineInstr::MIFlag::FmNsz); 9967 default: 9968 return false; 9969 } 9970 } 9971 9972 /// If \p DescribedReg overlaps with the MOVrr instruction's destination 9973 /// register then, if possible, describe the value in terms of the source 9974 /// register. 9975 static std::optional<ParamLoadedValue> 9976 describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, 9977 const TargetRegisterInfo *TRI) { 9978 Register DestReg = MI.getOperand(0).getReg(); 9979 Register SrcReg = MI.getOperand(1).getReg(); 9980 9981 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 9982 9983 // If the described register is the destination, just return the source. 9984 if (DestReg == DescribedReg) 9985 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9986 9987 // If the described register is a sub-register of the destination register, 9988 // then pick out the source register's corresponding sub-register. 
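  // For example, given "$rax = MOV64rr $rbx" with $eax being described, the
  // value can be expressed in terms of $ebx, the corresponding sub-register of
  // the source.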
9989 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) { 9990 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx); 9991 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 9992 } 9993 9994 // The remaining case to consider is when the described register is a 9995 // super-register of the destination register. MOV8rr and MOV16rr do not 9996 // write to any of the other bytes in the register, meaning that we'd have to 9997 // describe the value using a combination of the source register and the 9998 // non-overlapping bits in the described register, which is not currently 9999 // possible. 10000 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr || 10001 !TRI->isSuperRegister(DestReg, DescribedReg)) 10002 return std::nullopt; 10003 10004 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case"); 10005 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 10006 } 10007 10008 std::optional<ParamLoadedValue> 10009 X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { 10010 const MachineOperand *Op = nullptr; 10011 DIExpression *Expr = nullptr; 10012 10013 const TargetRegisterInfo *TRI = &getRegisterInfo(); 10014 10015 switch (MI.getOpcode()) { 10016 case X86::LEA32r: 10017 case X86::LEA64r: 10018 case X86::LEA64_32r: { 10019 // We may need to describe a 64-bit parameter with a 32-bit LEA. 10020 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 10021 return std::nullopt; 10022 10023 // Operand 4 could be a global address. For now we do not support 10024 // such situations. 10025 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) 10026 return std::nullopt; 10027 10028 const MachineOperand &Op1 = MI.getOperand(1); 10029 const MachineOperand &Op2 = MI.getOperand(3); 10030 assert(Op2.isReg() && 10031 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical())); 10032 10033 // Omit situations like: 10034 // %rsi = lea %rsi, 4, ...
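    // In these cases the destination overwrites a register that the address
    // computation itself uses, so the source operands can no longer describe
    // the value once the instruction has executed.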
10035 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) || 10036 Op2.getReg() == MI.getOperand(0).getReg()) 10037 return std::nullopt; 10038 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister && 10039 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) || 10040 (Op2.getReg() != X86::NoRegister && 10041 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg()))) 10042 return std::nullopt; 10043 10044 int64_t Coef = MI.getOperand(2).getImm(); 10045 int64_t Offset = MI.getOperand(4).getImm(); 10046 SmallVector<uint64_t, 8> Ops; 10047 10048 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) { 10049 Op = &Op1; 10050 } else if (Op1.isFI()) 10051 Op = &Op1; 10052 10053 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) { 10054 Ops.push_back(dwarf::DW_OP_constu); 10055 Ops.push_back(Coef + 1); 10056 Ops.push_back(dwarf::DW_OP_mul); 10057 } else { 10058 if (Op && Op2.getReg() != X86::NoRegister) { 10059 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false); 10060 if (dwarfReg < 0) 10061 return std::nullopt; 10062 else if (dwarfReg < 32) { 10063 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg); 10064 Ops.push_back(0); 10065 } else { 10066 Ops.push_back(dwarf::DW_OP_bregx); 10067 Ops.push_back(dwarfReg); 10068 Ops.push_back(0); 10069 } 10070 } else if (!Op) { 10071 assert(Op2.getReg() != X86::NoRegister); 10072 Op = &Op2; 10073 } 10074 10075 if (Coef > 1) { 10076 assert(Op2.getReg() != X86::NoRegister); 10077 Ops.push_back(dwarf::DW_OP_constu); 10078 Ops.push_back(Coef); 10079 Ops.push_back(dwarf::DW_OP_mul); 10080 } 10081 10082 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) && 10083 Op2.getReg() != X86::NoRegister) { 10084 Ops.push_back(dwarf::DW_OP_plus); 10085 } 10086 } 10087 10088 DIExpression::appendOffset(Ops, Offset); 10089 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops); 10090 10091 return ParamLoadedValue(*Op, Expr); 10092 } 10093 case X86::MOV8ri: 10094 case X86::MOV16ri: 10095 // TODO: Handle MOV8ri and MOV16ri. 10096 return std::nullopt; 10097 case X86::MOV32ri: 10098 case X86::MOV64ri: 10099 case X86::MOV64ri32: 10100 // MOV32ri may be used for producing zero-extended 32-bit immediates in 10101 // 64-bit parameters, so we need to consider super-registers. 10102 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 10103 return std::nullopt; 10104 return ParamLoadedValue(MI.getOperand(1), Expr); 10105 case X86::MOV8rr: 10106 case X86::MOV16rr: 10107 case X86::MOV32rr: 10108 case X86::MOV64rr: 10109 return describeMOVrrLoadedValue(MI, Reg, TRI); 10110 case X86::XOR32rr: { 10111 // 64-bit parameters are zero-materialized using XOR32rr, so also consider 10112 // super-registers. 10113 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 10114 return std::nullopt; 10115 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) 10116 return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); 10117 return std::nullopt; 10118 } 10119 case X86::MOVSX64rr32: { 10120 // We may need to describe the lower 32 bits of the MOVSX; for example, in 10121 // cases like this: 10122 // 10123 // $ebx = [...] 10124 // $rdi = MOVSX64rr32 $ebx 10125 // $esi = MOV32rr $edi 10126 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg)) 10127 return std::nullopt; 10128 10129 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 10130 10131 // If the described register is the destination register we need to 10132 // sign-extend the source register from 32 bits. 
The other case we handle 10133 // is when the described register is the 32-bit sub-register of the 10134 // destination register, in which case we just need to return the source 10135 // register. 10136 if (Reg == MI.getOperand(0).getReg()) 10137 Expr = DIExpression::appendExt(Expr, 32, 64, true); 10138 else 10139 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) && 10140 "Unhandled sub-register case for MOVSX64rr32"); 10141 10142 return ParamLoadedValue(MI.getOperand(1), Expr); 10143 } 10144 default: 10145 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction"); 10146 return TargetInstrInfo::describeLoadedValue(MI, Reg); 10147 } 10148 } 10149 10150 /// This is an architecture-specific helper function of reassociateOps. 10151 /// Set special operand attributes for new instructions after reassociation. 10152 void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, 10153 MachineInstr &OldMI2, 10154 MachineInstr &NewMI1, 10155 MachineInstr &NewMI2) const { 10156 // Integer instructions may define an implicit EFLAGS dest register operand. 10157 MachineOperand *OldFlagDef1 = 10158 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr); 10159 MachineOperand *OldFlagDef2 = 10160 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr); 10161 10162 assert(!OldFlagDef1 == !OldFlagDef2 && 10163 "Unexpected instruction type for reassociation"); 10164 10165 if (!OldFlagDef1 || !OldFlagDef2) 10166 return; 10167 10168 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() && 10169 "Must have dead EFLAGS operand in reassociable instruction"); 10170 10171 MachineOperand *NewFlagDef1 = 10172 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr); 10173 MachineOperand *NewFlagDef2 = 10174 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr); 10175 10176 assert(NewFlagDef1 && NewFlagDef2 && 10177 "Unexpected operand in reassociable instruction"); 10178 10179 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations 10180 // of this pass or other passes. The EFLAGS operands must be dead in these new 10181 // instructions because the EFLAGS operands in the original instructions must 10182 // be dead in order for reassociation to occur.
10183 NewFlagDef1->setIsDead(); 10184 NewFlagDef2->setIsDead(); 10185 } 10186 10187 std::pair<unsigned, unsigned> 10188 X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 10189 return std::make_pair(TF, 0u); 10190 } 10191 10192 ArrayRef<std::pair<unsigned, const char *>> 10193 X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 10194 using namespace X86II; 10195 static const std::pair<unsigned, const char *> TargetFlags[] = { 10196 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, 10197 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, 10198 {MO_GOT, "x86-got"}, 10199 {MO_GOTOFF, "x86-gotoff"}, 10200 {MO_GOTPCREL, "x86-gotpcrel"}, 10201 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"}, 10202 {MO_PLT, "x86-plt"}, 10203 {MO_TLSGD, "x86-tlsgd"}, 10204 {MO_TLSLD, "x86-tlsld"}, 10205 {MO_TLSLDM, "x86-tlsldm"}, 10206 {MO_GOTTPOFF, "x86-gottpoff"}, 10207 {MO_INDNTPOFF, "x86-indntpoff"}, 10208 {MO_TPOFF, "x86-tpoff"}, 10209 {MO_DTPOFF, "x86-dtpoff"}, 10210 {MO_NTPOFF, "x86-ntpoff"}, 10211 {MO_GOTNTPOFF, "x86-gotntpoff"}, 10212 {MO_DLLIMPORT, "x86-dllimport"}, 10213 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, 10214 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, 10215 {MO_TLVP, "x86-tlvp"}, 10216 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, 10217 {MO_SECREL, "x86-secrel"}, 10218 {MO_COFFSTUB, "x86-coffstub"}}; 10219 return ArrayRef(TargetFlags); 10220 } 10221 10222 namespace { 10223 /// Create Global Base Reg pass. This initializes the PIC 10224 /// global base register for x86-32. 10225 struct CGBR : public MachineFunctionPass { 10226 static char ID; 10227 CGBR() : MachineFunctionPass(ID) {} 10228 10229 bool runOnMachineFunction(MachineFunction &MF) override { 10230 const X86TargetMachine *TM = 10231 static_cast<const X86TargetMachine *>(&MF.getTarget()); 10232 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); 10233 10234 // Only emit a global base reg in PIC mode. 10235 if (!TM->isPositionIndependent()) 10236 return false; 10237 10238 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); 10239 Register GlobalBaseReg = X86FI->getGlobalBaseReg(); 10240 10241 // If we didn't need a GlobalBaseReg, don't insert code. 10242 if (GlobalBaseReg == 0) 10243 return false; 10244 10245 // Insert the set of GlobalBaseReg into the first MBB of the function 10246 MachineBasicBlock &FirstMBB = MF.front(); 10247 MachineBasicBlock::iterator MBBI = FirstMBB.begin(); 10248 DebugLoc DL = FirstMBB.findDebugLoc(MBBI); 10249 MachineRegisterInfo &RegInfo = MF.getRegInfo(); 10250 const X86InstrInfo *TII = STI.getInstrInfo(); 10251 10252 Register PC; 10253 if (STI.isPICStyleGOT()) 10254 PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); 10255 else 10256 PC = GlobalBaseReg; 10257 10258 if (STI.is64Bit()) { 10259 if (TM->getCodeModel() == CodeModel::Large) { 10260 // In the large code model, we are aiming for this code, though the 10261 // register allocation may vary: 10262 // leaq .LN$pb(%rip), %rax 10263 // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx 10264 // addq %rcx, %rax 10265 // RAX now holds address of _GLOBAL_OFFSET_TABLE_. 
10266 Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); 10267 Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); 10268 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg) 10269 .addReg(X86::RIP) 10270 .addImm(0) 10271 .addReg(0) 10272 .addSym(MF.getPICBaseSymbol()) 10273 .addReg(0); 10274 std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol()); 10275 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg) 10276 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", 10277 X86II::MO_PIC_BASE_OFFSET); 10278 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC) 10279 .addReg(PBReg, RegState::Kill) 10280 .addReg(GOTReg, RegState::Kill); 10281 } else { 10282 // In other code models, use a RIP-relative LEA to materialize the 10283 // GOT. 10284 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC) 10285 .addReg(X86::RIP) 10286 .addImm(0) 10287 .addReg(0) 10288 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_") 10289 .addReg(0); 10290 } 10291 } else { 10292 // Operand of MovePCtoStack is completely ignored by the asm printer. It's 10293 // only used in JIT code emission as a displacement to the pc. 10294 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); 10295 10296 // If we're using vanilla 'GOT' PIC style, we should use relative 10297 // addressing not to the pc, but to the _GLOBAL_OFFSET_TABLE_ external symbol. 10298 if (STI.isPICStyleGOT()) { 10299 // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], 10300 // %some_register 10301 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) 10302 .addReg(PC) 10303 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", 10304 X86II::MO_GOT_ABSOLUTE_ADDRESS); 10305 } 10306 } 10307 10308 return true; 10309 } 10310 10311 StringRef getPassName() const override { 10312 return "X86 PIC Global Base Reg Initialization"; 10313 } 10314 10315 void getAnalysisUsage(AnalysisUsage &AU) const override { 10316 AU.setPreservesCFG(); 10317 MachineFunctionPass::getAnalysisUsage(AU); 10318 } 10319 }; 10320 } // namespace 10321 10322 char CGBR::ID = 0; 10323 FunctionPass *llvm::createX86GlobalBaseRegPass() { return new CGBR(); } 10324 10325 namespace { 10326 struct LDTLSCleanup : public MachineFunctionPass { 10327 static char ID; 10328 LDTLSCleanup() : MachineFunctionPass(ID) {} 10329 10330 bool runOnMachineFunction(MachineFunction &MF) override { 10331 if (skipFunction(MF.getFunction())) 10332 return false; 10333 10334 X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); 10335 if (MFI->getNumLocalDynamicTLSAccesses() < 2) { 10336 // No point folding accesses if there aren't at least two. 10337 return false; 10338 } 10339 10340 MachineDominatorTree *DT = 10341 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 10342 return VisitNode(DT->getRootNode(), 0); 10343 } 10344 10345 // Visit the dominator subtree rooted at Node in pre-order. 10346 // If TLSBaseAddrReg is non-null, then use that to replace any 10347 // TLS_base_addr instructions. Otherwise, create the register 10348 // when the first such instruction is seen, and then use it 10349 // as we encounter more instructions. 10350 bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { 10351 MachineBasicBlock *BB = Node->getBlock(); 10352 bool Changed = false; 10353 10354 // Traverse the current block.
10355 for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; 10356 ++I) { 10357 switch (I->getOpcode()) { 10358 case X86::TLS_base_addr32: 10359 case X86::TLS_base_addr64: 10360 if (TLSBaseAddrReg) 10361 I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg); 10362 else 10363 I = SetRegister(*I, &TLSBaseAddrReg); 10364 Changed = true; 10365 break; 10366 default: 10367 break; 10368 } 10369 } 10370 10371 // Visit the children of this block in the dominator tree. 10372 for (auto &I : *Node) { 10373 Changed |= VisitNode(I, TLSBaseAddrReg); 10374 } 10375 10376 return Changed; 10377 } 10378 10379 // Replace the TLS_base_addr instruction I with a copy from 10380 // TLSBaseAddrReg, returning the new instruction. 10381 MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I, 10382 unsigned TLSBaseAddrReg) { 10383 MachineFunction *MF = I.getParent()->getParent(); 10384 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); 10385 const bool is64Bit = STI.is64Bit(); 10386 const X86InstrInfo *TII = STI.getInstrInfo(); 10387 10388 // Insert a Copy from TLSBaseAddrReg to RAX/EAX. 10389 MachineInstr *Copy = 10390 BuildMI(*I.getParent(), I, I.getDebugLoc(), 10391 TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX) 10392 .addReg(TLSBaseAddrReg); 10393 10394 // Erase the TLS_base_addr instruction. 10395 I.eraseFromParent(); 10396 10397 return Copy; 10398 } 10399 10400 // Create a virtual register in *TLSBaseAddrReg, and populate it by 10401 // inserting a copy instruction after I. Returns the new instruction. 10402 MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { 10403 MachineFunction *MF = I.getParent()->getParent(); 10404 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); 10405 const bool is64Bit = STI.is64Bit(); 10406 const X86InstrInfo *TII = STI.getInstrInfo(); 10407 10408 // Create a virtual register for the TLS base address. 10409 MachineRegisterInfo &RegInfo = MF->getRegInfo(); 10410 *TLSBaseAddrReg = RegInfo.createVirtualRegister( 10411 is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass); 10412 10413 // Insert a copy from RAX/EAX to TLSBaseAddrReg. 10414 MachineInstr *Next = I.getNextNode(); 10415 MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(), 10416 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) 10417 .addReg(is64Bit ? X86::RAX : X86::EAX); 10418 10419 return Copy; 10420 } 10421 10422 StringRef getPassName() const override { 10423 return "Local Dynamic TLS Access Clean-up"; 10424 } 10425 10426 void getAnalysisUsage(AnalysisUsage &AU) const override { 10427 AU.setPreservesCFG(); 10428 AU.addRequired<MachineDominatorTreeWrapperPass>(); 10429 MachineFunctionPass::getAnalysisUsage(AU); 10430 } 10431 }; 10432 } // namespace 10433 10434 char LDTLSCleanup::ID = 0; 10435 FunctionPass *llvm::createCleanupLocalDynamicTLSPass() { 10436 return new LDTLSCleanup(); 10437 } 10438 10439 /// Constants defining how certain sequences should be outlined. 10440 /// 10441 /// \p MachineOutlinerDefault implies that the function is called with a call 10442 /// instruction, and a return must be emitted for the outlined function frame. 10443 /// 10444 /// That is, 10445 /// 10446 /// I1 OUTLINED_FUNCTION: 10447 /// I2 --> call OUTLINED_FUNCTION I1 10448 /// I3 I2 10449 /// I3 10450 /// ret 10451 /// 10452 /// * Call construction overhead: 1 (call instruction) 10453 /// * Frame construction overhead: 1 (return instruction) 10454 /// 10455 /// \p MachineOutlinerTailCall implies that the function is being tail called. 
/// A jump is emitted instead of a call, and the return is already present in
/// the outlined sequence. That is,
///
/// I1                                 OUTLINED_FUNCTION:
/// I2 --> jmp OUTLINED_FUNCTION       I1
/// ret                                I2
///                                    ret
///
/// * Call construction overhead: 1 (jump instruction)
/// * Frame construction overhead: 0 (don't need to return)
///
enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };

std::optional<outliner::OutlinedFunction>
X86InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  unsigned SequenceSize = 0;
  for (auto &MI : RepeatedSequenceLocs[0]) {
    // FIXME: x86 doesn't implement getInstSizeInBytes, so
    // we can't tell the cost. Just assume each instruction
    // is one byte.
    if (MI.isDebugInstr() || MI.isKill())
      continue;
    SequenceSize += 1;
  }

  // Check whether CFI instructions are present, and if so count how many of
  // them the candidate contains.
  unsigned CFICount = 0;
  for (auto &I : RepeatedSequenceLocs[0]) {
    if (I.isCFIInstruction())
      CFICount++;
  }

  // Compare the number of CFI instructions found in the candidate to the
  // number of CFI instructions in the parent function for each candidate. We
  // must check this since if we outline one of the CFI instructions in a
  // function, we have to outline them all for correctness. If we do not, the
  // address offsets will be incorrect between the two sections of the program.
  for (outliner::Candidate &C : RepeatedSequenceLocs) {
    std::vector<MCCFIInstruction> CFIInstructions =
        C.getMF()->getFrameInstructions();

    if (CFICount > 0 && CFICount != CFIInstructions.size())
      return std::nullopt;
  }

  // FIXME: Use real size in bytes for call and ret instructions.
  if (RepeatedSequenceLocs[0].back().isTerminator()) {
    for (outliner::Candidate &C : RepeatedSequenceLocs)
      C.setCallInfo(MachineOutlinerTailCall, 1);

    return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                      0, // Number of bytes to emit frame.
                                      MachineOutlinerTailCall // Type of frame.
    );
  }

  if (CFICount > 0)
    return std::nullopt;

  for (outliner::Candidate &C : RepeatedSequenceLocs)
    C.setCallInfo(MachineOutlinerDefault, 1);

  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
                                    MachineOutlinerDefault);
}

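// Roughly speaking, the generic MachineOutliner weighs SequenceSize against
// the per-candidate call overhead (1) and the frame overhead (1 for the
// default case, 0 for tail calls) chosen above, all expressed in the same
// one-unit-per-instruction approximation, when deciding whether outlining a
// candidate is actually profitable.
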
bool X86InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Does the function use a red zone? If it does, then we can't risk messing
  // with the stack.
  if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
    // It could have a red zone. If it does, then we don't want to touch it.
    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
    if (!X86FI || X86FI->getUsesRedZone())
      return false;
  }

  // If we *don't* want to outline from things that could potentially be deduped
  // then return false.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // This function is viable for outlining, so return true.
  return true;
}

outliner::InstrType
X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;

  // Is this a terminator for a basic block?
  if (MI.isTerminator())
    // TargetInstrInfo::getOutliningType has already filtered out anything
    // that would break this, so we can allow it here.
    return outliner::InstrType::Legal;

  // Don't outline anything that modifies or reads from the stack pointer.
  //
  // FIXME: There are instructions which are being manually built without
  // explicit uses/defs so we also have to check the MCInstrDesc. We should be
  // able to remove the extra checks once those are fixed up. For example,
  // sometimes we might get something like %rax = POP64r 1. This won't be
  // caught by modifiesRegister or readsRegister even though the instruction
  // really ought to be formed so that modifiesRegister/readsRegister would
  // catch it.
  if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
      MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
      MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
    return outliner::InstrType::Illegal;

  // Outlined calls change the instruction pointer, so don't read from it.
  if (MI.readsRegister(X86::RIP, &RI) ||
      MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
      MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
    return outliner::InstrType::Illegal;

  // Don't outline CFI instructions.
  if (MI.isCFIInstruction())
    return outliner::InstrType::Illegal;

  return outliner::InstrType::Legal;
}

void X86InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {
  // If we're a tail call, we already have a return, so don't do anything.
  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    return;

  // We're a normal call, so our sequence doesn't have a return instruction.
  // Add it in.
  MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
  MBB.insert(MBB.end(), retq);
}

MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, outliner::Candidate &C) const {
  // Is it a tail call?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // Yes, just insert a JMP.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
  } else {
    // No, insert a call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
  }

  return It;
}

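// A rough sketch of what buildClearRegister() below ends up emitting for a few
// register classes (illustrative only; the exact opcode is selected by the
// subtarget checks in the code):
//
//   GPR, side effects allowed:   xorl %eax, %eax
//   GPR, flags must be kept:     movl $0, %eax
//   XMM  (SSE1):                 pxor %xmm0, %xmm0
//   YMM  (AVX):                  vpxor %ymm0, %ymm0, %ymm0
//   ZMM  (AVX-512):              cleared with the 256-bit VPXORYrr form
//   Mask (AVX-512VL):            kxorw %k1, %k1, %k1 (kxorq with BWI)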
void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Iter,
                                      DebugLoc &DL,
                                      bool AllowSideEffects) const {
  const MachineFunction &MF = *MBB.getParent();
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
    // FIXME: Should we ignore MMX registers?
    return;

  if (TRI.isGeneralPurposeRegister(MF, Reg)) {
    // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
    // upper bits of a 64-bit register automagically.
    Reg = getX86SubSuperRegister(Reg, 32);

    if (!AllowSideEffects)
      // XOR affects flags, so use a MOV instead.
      BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
    else
      BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
          .addReg(Reg, RegState::Undef)
          .addReg(Reg, RegState::Undef);
  } else if (X86::VR128RegClass.contains(Reg)) {
    // XMM#
    if (!ST.hasSSE1())
      return;

    // PXOR is safe to use because it doesn't affect flags.
    BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  } else if (X86::VR256RegClass.contains(Reg)) {
    // YMM#
    if (!ST.hasAVX())
      return;

    // VPXOR is safe to use because it doesn't affect flags.
    BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  } else if (X86::VR512RegClass.contains(Reg)) {
    // ZMM#
    if (!ST.hasAVX512())
      return;

    // VPXORY is safe to use because it doesn't affect flags.
    BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
             X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
             X86::VK16RegClass.contains(Reg)) {
    if (!ST.hasVLX())
      return;

    // KXOR is safe to use because it doesn't affect flags.
    unsigned Op = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
    BuildMI(MBB, Iter, DL, get(Op), Reg)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef);
  }
}

bool X86InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
    bool DoRegPressureReduce) const {
  unsigned Opc = Root.getOpcode();
  switch (Opc) {
  case X86::VPDPWSSDrr:
  case X86::VPDPWSSDrm:
  case X86::VPDPWSSDYrr:
  case X86::VPDPWSSDYrm: {
    if (!Subtarget.hasFastDPWSSD()) {
      Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
      return true;
    }
    break;
  }
  case X86::VPDPWSSDZ128r:
  case X86::VPDPWSSDZ128m:
  case X86::VPDPWSSDZ256r:
  case X86::VPDPWSSDZ256m:
  case X86::VPDPWSSDZr:
  case X86::VPDPWSSDZm: {
    if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
      Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
      return true;
    }
    break;
  }
  }
  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

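// Note that the sequence built by the helper below is only a proposal: it
// fills InsInstrs/DelInstrs, and the generic MachineCombiner then compares the
// vpmaddwd+vpadd pair against the original vpdpwssd (critical-path length and
// resource depth) before deciding whether to commit the rewrite.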
static void
genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
                             SmallVectorImpl<MachineInstr *> &DelInstrs,
                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
  MachineFunction *MF = Root.getMF();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();

  unsigned Opc = Root.getOpcode();
  unsigned AddOpc = 0;
  unsigned MaddOpc = 0;
  switch (Opc) {
  default:
    assert(false && "It should not reach here");
    break;
  // vpdpwssd xmm2,xmm3,xmm1
  // -->
  // vpmaddwd xmm3,xmm3,xmm1
  // vpaddd xmm2,xmm2,xmm3
  case X86::VPDPWSSDrr:
    MaddOpc = X86::VPMADDWDrr;
    AddOpc = X86::VPADDDrr;
    break;
  case X86::VPDPWSSDrm:
    MaddOpc = X86::VPMADDWDrm;
    AddOpc = X86::VPADDDrr;
    break;
  case X86::VPDPWSSDZ128r:
    MaddOpc = X86::VPMADDWDZ128rr;
    AddOpc = X86::VPADDDZ128rr;
    break;
  case X86::VPDPWSSDZ128m:
    MaddOpc = X86::VPMADDWDZ128rm;
    AddOpc = X86::VPADDDZ128rr;
    break;
  // vpdpwssd ymm2,ymm3,ymm1
  // -->
  // vpmaddwd ymm3,ymm3,ymm1
  // vpaddd ymm2,ymm2,ymm3
  case X86::VPDPWSSDYrr:
    MaddOpc = X86::VPMADDWDYrr;
    AddOpc = X86::VPADDDYrr;
    break;
  case X86::VPDPWSSDYrm:
    MaddOpc = X86::VPMADDWDYrm;
    AddOpc = X86::VPADDDYrr;
    break;
  case X86::VPDPWSSDZ256r:
    MaddOpc = X86::VPMADDWDZ256rr;
    AddOpc = X86::VPADDDZ256rr;
    break;
  case X86::VPDPWSSDZ256m:
    MaddOpc = X86::VPMADDWDZ256rm;
    AddOpc = X86::VPADDDZ256rr;
    break;
  // vpdpwssd zmm2,zmm3,zmm1
  // -->
  // vpmaddwd zmm3,zmm3,zmm1
  // vpaddd zmm2,zmm2,zmm3
  case X86::VPDPWSSDZr:
    MaddOpc = X86::VPMADDWDZrr;
    AddOpc = X86::VPADDDZrr;
    break;
  case X86::VPDPWSSDZm:
    MaddOpc = X86::VPMADDWDZrm;
    AddOpc = X86::VPADDDZrr;
    break;
  }
  // Create vpmaddwd.
  const TargetRegisterClass *RC =
      RegInfo.getRegClass(Root.getOperand(0).getReg());
  Register NewReg = RegInfo.createVirtualRegister(RC);
  MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
  Madd->setDesc(TII.get(MaddOpc));
  Madd->untieRegOperand(1);
  Madd->removeOperand(1);
  Madd->getOperand(0).setReg(NewReg);
  InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
  // Create vpaddd.
  Register DstReg = Root.getOperand(0).getReg();
  bool IsKill = Root.getOperand(1).isKill();
  MachineInstr *Add =
      BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
          .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
          .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
  InsInstrs.push_back(Madd);
  InsInstrs.push_back(Add);
  DelInstrs.push_back(&Root);
}

void X86InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, unsigned Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
  switch (Pattern) {
  default:
    // Reassociate instructions.
    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                DelInstrs, InstrIdxForVirtReg);
    return;
  case X86MachineCombinerPattern::DPWSSD:
    genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
                                 InstrIdxForVirtReg);
    return;
  }
}

// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
                                         int FI) const {
  X86AddressMode M;
  M.BaseType = X86AddressMode::FrameIndexBase;
  M.Base.FrameIndex = FI;
  M.getFullAddress(Ops);
}

#define GET_INSTRINFO_HELPERS
#include "X86GenInstrInfo.inc"