//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the X86-specific support for the FastISel class. Much
// of the target-specific code is generated by tablegen in the file
// X86GenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

namespace {

class X86FastISel final : public FastISel {
  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
  /// floating point ops.
  /// When SSE is available, use it for f32 operations.
  /// When SSE2 is available, use it for f64 operations.
  bool X86ScalarSSEf64;
  bool X86ScalarSSEf32;

public:
  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                       const TargetLibraryInfo *libInfo)
      : FastISel(funcInfo, libInfo) {
    Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
    X86ScalarSSEf64 = Subtarget->hasSSE2();
    X86ScalarSSEf32 = Subtarget->hasSSE1();
  }

  bool fastSelectInstruction(const Instruction *I) override;

  /// The specified machine instr operand is a vreg, and that
  /// vreg is being provided by the specified load instruction. If possible,
  /// try to fold the load as an operand to the instruction, returning true if
  /// possible.
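  /// For example, a load feeding a register-register compare or ALU
  /// instruction may be rewritten into the instruction's memory-operand form
  /// when the target's folding tables allow it, so the loaded value never has
  /// to be materialized in a register.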
  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                           const LoadInst *LI) override;

  bool fastLowerArguments() override;
  bool fastLowerCall(CallLoweringInfo &CLI) override;
  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;

#include "X86GenFastISel.inc"

private:
  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
                          const DebugLoc &DL);

  bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
                       unsigned &ResultReg, unsigned Alignment = 1);

  bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
                        X86AddressMode &AM,
                        MachineMemOperand *MMO = nullptr, bool Aligned = false);

  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
                         unsigned &ResultReg);

  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);

  bool X86SelectLoad(const Instruction *I);

  bool X86SelectStore(const Instruction *I);

  bool X86SelectRet(const Instruction *I);

  bool X86SelectCmp(const Instruction *I);

  bool X86SelectZExt(const Instruction *I);

  bool X86SelectSExt(const Instruction *I);

  bool X86SelectBranch(const Instruction *I);

  bool X86SelectShift(const Instruction *I);

  bool X86SelectDivRem(const Instruction *I);

  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);

  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);

  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);

  bool X86SelectSelect(const Instruction *I);

  bool X86SelectTrunc(const Instruction *I);

  bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
                               const TargetRegisterClass *RC);

  bool X86SelectFPExt(const Instruction *I);
  bool X86SelectFPTrunc(const Instruction *I);
  bool X86SelectSIToFP(const Instruction *I);
  bool X86SelectUIToFP(const Instruction *I);
  bool X86SelectIntToFP(const Instruction *I, bool IsSigned);

  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();
  }
  const X86TargetMachine *getTargetMachine() const {
    return static_cast<const X86TargetMachine *>(&TM);
  }

  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);

  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
  unsigned fastMaterializeConstant(const Constant *C) override;

  unsigned fastMaterializeAlloca(const AllocaInst *C) override;

  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;

  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
  /// computed in an SSE register, not on the X87 floating point stack.
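  /// For example, with SSE2 both f32 and f64 live in XMM registers; with only
  /// SSE1, f64 values would fall back to the x87 stack (and are rejected by
  /// isTypeLegal below).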
  bool isScalarFPTypeInSSEReg(EVT VT) const {
    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
           (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
  }

  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);

  bool IsMemcpySmall(uint64_t Len);

  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
                          X86AddressMode SrcAM, uint64_t Len);

  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                            const Value *Cond);

  const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
                                            X86AddressMode &AM);

  unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
                             const TargetRegisterClass *RC, unsigned Op0,
                             bool Op0IsKill, unsigned Op1, bool Op1IsKill,
                             unsigned Op2, bool Op2IsKill, unsigned Op3,
                             bool Op3IsKill);
};

} // end anonymous namespace.

static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
  unsigned CC;
  bool NeedSwap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (Predicate) {
  default: llvm_unreachable("Unexpected predicate");
  case CmpInst::FCMP_OEQ: CC = 0;          break;
  case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
  case CmpInst::FCMP_OLT: CC = 1;          break;
  case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
  case CmpInst::FCMP_OLE: CC = 2;          break;
  case CmpInst::FCMP_UNO: CC = 3;          break;
  case CmpInst::FCMP_UNE: CC = 4;          break;
  case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
  case CmpInst::FCMP_UGE: CC = 5;          break;
  case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
  case CmpInst::FCMP_UGT: CC = 6;          break;
  case CmpInst::FCMP_ORD: CC = 7;          break;
  case CmpInst::FCMP_UEQ: CC = 8;          break;
  case CmpInst::FCMP_ONE: CC = 12;         break;
  }

  return std::make_pair(CC, NeedSwap);
}

/// Adds a complex addressing mode to the given machine instr builder.
/// Note, this will constrain the index register. If it's not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this case.
const MachineInstrBuilder &
X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
                            X86AddressMode &AM) {
  // First constrain the index register. It needs to be a GR64_NOSP.
  AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
                                         MIB->getNumOperands() +
                                         X86::AddrIndexReg);
  return ::addFullAddress(MIB, AM);
}

/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
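/// For example, the overflow flag of llvm.sadd.with.overflow feeding a branch
/// or select can be consumed directly as COND_O, so the i1 result never has to
/// be materialized, provided only extractvalues sit between the intrinsic and
/// its user and nothing in between can clobber EFLAGS.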
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                                       const Value *Cond) {
  if (!isa<ExtractValueInst>(Cond))
    return false;

  const auto *EV = cast<ExtractValueInst>(Cond);
  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
    return false;

  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
  MVT RetVT;
  const Function *Callee = II->getCalledFunction();
  Type *RetTy =
      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
  if (!isTypeLegal(RetTy, RetVT))
    return false;

  if (RetVT != MVT::i32 && RetVT != MVT::i64)
    return false;

  X86::CondCode TmpCC;
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
  }

  // Check if both instructions are in the same basic block.
  if (II->getParent() != I->getParent())
    return false;

  // Make sure nothing is in the way
  BasicBlock::const_iterator Start(I);
  BasicBlock::const_iterator End(II);
  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
    // We only expect extractvalue instructions between the intrinsic and the
    // instruction to be selected.
    if (!isa<ExtractValueInst>(Itr))
      return false;

    // Check that the extractvalue operand comes from the intrinsic.
    const auto *EVI = cast<ExtractValueInst>(Itr);
    if (EVI->getAggregateOperand() != II)
      return false;
  }

  // Make sure no potentially eflags clobbering phi moves can be inserted in
  // between.
  auto HasPhis = [](const BasicBlock *Succ) {
    return !llvm::empty(Succ->phis());
  };
  if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
    return false;

  CC = TmpCC;
  return true;
}

bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
  EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  if (evt == MVT::Other || !evt.isSimple())
    // Unhandled type. Halt "fast" selection and bail.
    return false;

  VT = evt.getSimpleVT();
  // For now, require SSE/SSE2 for performing floating-point operations,
  // since x87 requires additional work.
  if (VT == MVT::f64 && !X86ScalarSSEf64)
    return false;
  if (VT == MVT::f32 && !X86ScalarSSEf32)
    return false;
  // Similarly, no f80 support yet.
  if (VT == MVT::f80)
    return false;
  // We only handle legal types. For example, on x86-32 the instruction
  // selector contains all of the 64-bit instructions from x86-64,
  // under the assumption that i64 won't be used if the target doesn't
  // support it.
  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}

/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
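/// The opcode is picked from VT and the available SSE/AVX/AVX-512 features;
/// for vector loads the alignment and any non-temporal hint also matter, e.g.
/// a 16-byte-aligned v4f32 load becomes MOVAPS/VMOVAPS while an unaligned one
/// becomes MOVUPS/VMOVUPS.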
bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
                                  MachineMemOperand *MMO, unsigned &ResultReg,
                                  unsigned Alignment) {
  bool HasSSE41 = Subtarget->hasSSE41();
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX2 = Subtarget->hasAVX2();
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasVLX = Subtarget->hasVLX();
  bool IsNonTemporal = MMO && MMO->isNonTemporal();

  // Treat i1 loads the same as i8 loads. Masking will be done when storing.
  if (VT == MVT::i1)
    VT = MVT::i8;

  // Get opcode and regclass of the output for the given load instruction.
  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: return false;
  case MVT::i8:
    Opc = X86::MOV8rm;
    break;
  case MVT::i16:
    Opc = X86::MOV16rm;
    break;
  case MVT::i32:
    Opc = X86::MOV32rm;
    break;
  case MVT::i64:
    // Must be in x86-64 mode.
    Opc = X86::MOV64rm;
    break;
  case MVT::f32:
    if (X86ScalarSSEf32)
      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
            HasAVX    ? X86::VMOVSSrm_alt :
                        X86::MOVSSrm_alt;
    else
      Opc = X86::LD_Fp32m;
    break;
  case MVT::f64:
    if (X86ScalarSSEf64)
      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
            HasAVX    ? X86::VMOVSDrm_alt :
                        X86::MOVSDrm_alt;
    else
      Opc = X86::LD_Fp64m;
    break;
  case MVT::f80:
    // No f80 support yet.
    return false;
  case MVT::v4f32:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVAPSZ128rm :
            HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
    else
      Opc = HasVLX ? X86::VMOVUPSZ128rm :
            HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
    break;
  case MVT::v2f64:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVAPDZ128rm :
            HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
    else
      Opc = HasVLX ? X86::VMOVUPDZ128rm :
            HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
    break;
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v8i16:
  case MVT::v16i8:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVDQA64Z128rm :
            HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
    else
      Opc = HasVLX ? X86::VMOVDQU64Z128rm :
            HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
    break;
  case MVT::v8f32:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
    else
      Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
    break;
  case MVT::v4f64:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
    else
      Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
    break;
  case MVT::v8i32:
  case MVT::v4i64:
  case MVT::v16i16:
  case MVT::v32i8:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
    else
      Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
    break;
  case MVT::v16f32:
    assert(HasAVX512);
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
    break;
  case MVT::v8f64:
    assert(HasAVX512);
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
    break;
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8:
    assert(HasAVX512);
    // Note: There are a lot more choices based on type with AVX-512, but
    // there's really no advantage when the load isn't masked.
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
    break;
  }

  const TargetRegisterClass *RC = TLI.getRegClassFor(VT);

  ResultReg = createResultReg(RC);
  MachineInstrBuilder MIB =
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
  addFullAddress(MIB, AM);
  if (MMO)
    MIB->addMemOperand(*FuncInfo.MF, MMO);
  return true;
}

/// X86FastEmitStore - Emit a machine instruction to store a value Val of
/// type VT. The address is either pre-computed, consisting of a base pointer
/// Ptr and a displacement offset, or a GlobalAddress, i.e. V.
/// Return true if it is possible.
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
                                   X86AddressMode &AM,
                                   MachineMemOperand *MMO, bool Aligned) {
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasSSE4A = Subtarget->hasSSE4A();
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasVLX = Subtarget->hasVLX();
  bool IsNonTemporal = MMO && MMO->isNonTemporal();

  // Get opcode and regclass of the output for the given store instruction.
  unsigned Opc = 0;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f80: // No f80 support yet.
  default: return false;
  case MVT::i1: {
    // Mask out all but lowest bit.
    Register AndResult = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(X86::AND8ri), AndResult)
        .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
    ValReg = AndResult;
    LLVM_FALLTHROUGH; // handle i1 as i8.
  }
  case MVT::i8:  Opc = X86::MOV8mr;  break;
  case MVT::i16: Opc = X86::MOV16mr; break;
  case MVT::i32:
    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
    break;
  case MVT::i64:
    // Must be in x86-64 mode.
    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
    break;
  case MVT::f32:
    if (X86ScalarSSEf32) {
      if (IsNonTemporal && HasSSE4A)
        Opc = X86::MOVNTSS;
      else
        Opc = HasAVX512 ? X86::VMOVSSZmr :
              HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
    } else
      Opc = X86::ST_Fp32m;
    break;
  case MVT::f64:
    if (X86ScalarSSEf64) {
      if (IsNonTemporal && HasSSE4A)
        Opc = X86::MOVNTSD;
      else
        Opc = HasAVX512 ? X86::VMOVSDZmr :
              HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
    } else
      Opc = X86::ST_Fp64m;
    break;
  case MVT::x86mmx:
    Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
    break;
  case MVT::v4f32:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPSZ128mr :
              HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
      else
        Opc = HasVLX ? X86::VMOVAPSZ128mr :
              HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
    } else
      Opc = HasVLX ? X86::VMOVUPSZ128mr :
            HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
    break;
  case MVT::v2f64:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPDZ128mr :
              HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
      else
        Opc = HasVLX ? X86::VMOVAPDZ128mr :
              HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
    } else
      Opc = HasVLX ? X86::VMOVUPDZ128mr :
            HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
    break;
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v8i16:
  case MVT::v16i8:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTDQZ128mr :
              HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
      else
        Opc = HasVLX ? X86::VMOVDQA64Z128mr :
              HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
    } else
      Opc = HasVLX ? X86::VMOVDQU64Z128mr :
            HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
    break;
  case MVT::v8f32:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
      else
        Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
    } else
      Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
    break;
  case MVT::v4f64:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
      else
        Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
    } else
      Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
    break;
  case MVT::v8i32:
  case MVT::v4i64:
  case MVT::v16i16:
  case MVT::v32i8:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
      else
        Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
    } else
      Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
    break;
  case MVT::v16f32:
    assert(HasAVX512);
    if (Aligned)
      Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
    else
      Opc = X86::VMOVUPSZmr;
    break;
  case MVT::v8f64:
    assert(HasAVX512);
    if (Aligned) {
      Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
    } else
      Opc = X86::VMOVUPDZmr;
    break;
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8:
    assert(HasAVX512);
    // Note: There are a lot more choices based on type with AVX-512, but
    // there's really no advantage when the store isn't masked.
    if (Aligned)
      Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
    else
      Opc = X86::VMOVDQU64Zmr;
    break;
  }

  const MCInstrDesc &Desc = TII.get(Opc);
  // Some of the instructions in the previous switch use FR128 instead
  // of FR32 for ValReg. Make sure the register we feed the instruction
  // matches its register class constraints.
  // Note: it is fine to copy from FR32 to FR128; they are the same registers
  // behind the scenes, which is why this never triggered any bugs before.
  ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
  MachineInstrBuilder MIB =
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
  if (MMO)
    MIB->addMemOperand(*FuncInfo.MF, MMO);

  return true;
}

bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
                                   X86AddressMode &AM,
                                   MachineMemOperand *MMO, bool Aligned) {
  // Handle 'null' like i32/i64 0.
  if (isa<ConstantPointerNull>(Val))
    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));

  // If this is a store of a simple constant, fold the constant into the store.
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
    unsigned Opc = 0;
    bool Signed = true;
    switch (VT.getSimpleVT().SimpleTy) {
    default: break;
    case MVT::i1:
      Signed = false;
      LLVM_FALLTHROUGH; // Handle as i8.
    case MVT::i8:  Opc = X86::MOV8mi;  break;
    case MVT::i16: Opc = X86::MOV16mi; break;
    case MVT::i32: Opc = X86::MOV32mi; break;
    case MVT::i64:
      // Must be a 32-bit sign extended value.
      if (isInt<32>(CI->getSExtValue()))
        Opc = X86::MOV64mi32;
      break;
    }

    if (Opc) {
      MachineInstrBuilder MIB =
          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
                                            : CI->getZExtValue());
      if (MMO)
        MIB->addMemOperand(*FuncInfo.MF, MMO);
      return true;
    }
  }

  Register ValReg = getRegForValue(Val);
  if (ValReg == 0)
    return false;

  bool ValKill = hasTrivialKill(Val);
  return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
}

/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
/// ISD::SIGN_EXTEND).
bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
                                    unsigned Src, EVT SrcVT,
                                    unsigned &ResultReg) {
  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
                           Src, /*TODO: Kill=*/false);
  if (RR == 0)
    return false;

  ResultReg = RR;
  return true;
}

bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
  // Handle constant address.
  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
    // Can't handle alternate code models yet.
    if (TM.getCodeModel() != CodeModel::Small)
      return false;

    // Can't handle TLS yet.
    if (GV->isThreadLocal())
      return false;

    // Can't handle !absolute_symbol references yet.
    if (GV->isAbsoluteSymbolRef())
      return false;

    // RIP-relative addresses can't have additional register operands, so if
    // we've already folded stuff into the addressing mode, just force the
    // global value into its own register, which we can use as the basereg.
    if (!Subtarget->isPICStyleRIPRel() ||
        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
      // Okay, we've committed to selecting this global. Set up the address.
      AM.GV = GV;

      // Allow the subtarget to classify the global.
      unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);

      // If this reference is relative to the pic base, set it now.
      if (isGlobalRelativeToPICBase(GVFlags)) {
        // FIXME: How do we know Base.Reg is free??
        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
      }

      // Unless the ABI requires an extra load, return a direct reference to
      // the global.
      if (!isGlobalStubReference(GVFlags)) {
        if (Subtarget->isPICStyleRIPRel()) {
          // Use rip-relative addressing if we can. Above we verified that the
          // base and index registers are unused.
          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
          AM.Base.Reg = X86::RIP;
        }
        AM.GVOpFlags = GVFlags;
        return true;
      }

      // Ok, we need to do a load from a stub. If we've already loaded from
      // this stub, reuse the loaded pointer, otherwise emit the load now.
      DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
      Register LoadReg;
      if (I != LocalValueMap.end() && I->second) {
        LoadReg = I->second;
      } else {
        // Issue load from stub.
        unsigned Opc = 0;
        const TargetRegisterClass *RC = nullptr;
        X86AddressMode StubAM;
        StubAM.Base.Reg = AM.Base.Reg;
        StubAM.GV = GV;
        StubAM.GVOpFlags = GVFlags;

        // Prepare for inserting code in the local-value area.
        SavePoint SaveInsertPt = enterLocalValueArea();

        if (TLI.getPointerTy(DL) == MVT::i64) {
          Opc = X86::MOV64rm;
          RC = &X86::GR64RegClass;
        } else {
          Opc = X86::MOV32rm;
          RC = &X86::GR32RegClass;
        }

        if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
          StubAM.Base.Reg = X86::RIP;

        LoadReg = createResultReg(RC);
        MachineInstrBuilder LoadMI =
            BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
        addFullAddress(LoadMI, StubAM);

        // Ok, back to normal mode.
        leaveLocalValueArea(SaveInsertPt);

        // Prevent loading GV stub multiple times in same MBB.
        LocalValueMap[V] = LoadReg;
      }

      // Now construct the final address. Note that the Disp, Scale,
      // and Index values may already be set here.
      AM.Base.Reg = LoadReg;
      AM.GV = nullptr;
      return true;
    }
  }

  // If all else fails, try to materialize the value in a register.
  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
    if (AM.Base.Reg == 0) {
      AM.Base.Reg = getRegForValue(V);
      return AM.Base.Reg != 0;
    }
    if (AM.IndexReg == 0) {
      assert(AM.Scale == 1 && "Scale with no index!");
      AM.IndexReg = getRegForValue(V);
      return AM.IndexReg != 0;
    }
  }

  return false;
}

/// X86SelectAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
  SmallVector<const Value *, 32> GEPs;
redo_gep:
  const User *U = nullptr;
  unsigned Opcode = Instruction::UserOp1;
  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Don't walk into other basic blocks; it's possible we haven't
    // visited them yet, so the instructions may not yet be assigned
    // virtual registers.
    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
      Opcode = I->getOpcode();
      U = I;
    }
  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
    Opcode = C->getOpcode();
    U = C;
  }

  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
    if (Ty->getAddressSpace() > 255)
      // Fast instruction selection doesn't support the special
      // address spaces.
      return false;

  switch (Opcode) {
  default: break;
  case Instruction::BitCast:
    // Look past bitcasts.
    return X86SelectAddress(U->getOperand(0), AM);

  case Instruction::IntToPtr:
    // Look past no-op inttoptrs.
    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
        TLI.getPointerTy(DL))
      return X86SelectAddress(U->getOperand(0), AM);
    break;

  case Instruction::PtrToInt:
    // Look past no-op ptrtoints.
    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
      return X86SelectAddress(U->getOperand(0), AM);
    break;

  case Instruction::Alloca: {
    // Do static allocas.
    const AllocaInst *A = cast<AllocaInst>(V);
    DenseMap<const AllocaInst *, int>::iterator SI =
        FuncInfo.StaticAllocaMap.find(A);
    if (SI != FuncInfo.StaticAllocaMap.end()) {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = SI->second;
      return true;
    }
    break;
  }

  case Instruction::Add: {
    // Adds of constants are common and easy enough.
    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
      // They have to fit in the 32-bit signed displacement field though.
      if (isInt<32>(Disp)) {
        AM.Disp = (uint32_t)Disp;
        return X86SelectAddress(U->getOperand(0), AM);
      }
    }
    break;
  }

  case Instruction::GetElementPtr: {
    X86AddressMode SavedAM = AM;

    // Pattern-match simple GEPs.
    uint64_t Disp = (int32_t)AM.Disp;
    unsigned IndexReg = AM.IndexReg;
    unsigned Scale = AM.Scale;
    gep_type_iterator GTI = gep_type_begin(U);
    // Iterate through the indices, folding what we can. Constants can be
    // folded, and one dynamic index can be handled, if the scale is supported.
    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
         i != e; ++i, ++GTI) {
      const Value *Op = *i;
      if (StructType *STy = GTI.getStructTypeOrNull()) {
        const StructLayout *SL = DL.getStructLayout(STy);
        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
        continue;
      }

      // An array/variable index is always of the form i*S where S is the
      // constant scale size. See if we can push the scale into immediates.
      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
      for (;;) {
        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
          // Constant-offset addressing.
          Disp += CI->getSExtValue() * S;
          break;
        }
        if (canFoldAddIntoGEP(U, Op)) {
          // A compatible add with a constant operand. Fold the constant.
          ConstantInt *CI =
              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
          Disp += CI->getSExtValue() * S;
          // Iterate on the other operand.
          Op = cast<AddOperator>(Op)->getOperand(0);
          continue;
        }
        if (IndexReg == 0 &&
            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
            (S == 1 || S == 2 || S == 4 || S == 8)) {
          // Scaled-index addressing.
          Scale = S;
          IndexReg = getRegForGEPIndex(Op).first;
          if (IndexReg == 0)
            return false;
          break;
        }
        // Unsupported.
        goto unsupported_gep;
      }
    }

    // Check for displacement overflow.
    if (!isInt<32>(Disp))
      break;

    AM.IndexReg = IndexReg;
    AM.Scale = Scale;
    AM.Disp = (uint32_t)Disp;
    GEPs.push_back(V);

    if (const GetElementPtrInst *GEP =
            dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
      // Ok, the GEP indices were covered by constant-offset and scaled-index
      // addressing. Update the address state and move on to examining the base.
      V = GEP;
      goto redo_gep;
    } else if (X86SelectAddress(U->getOperand(0), AM)) {
      return true;
    }

    // If we couldn't merge the gep value into this addr mode, revert back to
    // our address and just match the value instead of completely failing.
    AM = SavedAM;

    for (const Value *I : reverse(GEPs))
      if (handleConstantAddresses(I, AM))
        return true;

    return false;
  unsupported_gep:
    // Ok, the GEP indices weren't all covered.
    break;
  }
  }

  return handleConstantAddresses(V, AM);
}

/// X86SelectCallAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
  const User *U = nullptr;
  unsigned Opcode = Instruction::UserOp1;
  const Instruction *I = dyn_cast<Instruction>(V);
  // Record if the value is defined in the same basic block.
  //
  // This information is crucial to know whether or not folding an
  // operand is valid.
  // Indeed, FastISel generates or reuses a virtual register for all
  // operands of all instructions it selects. Obviously, the definition and
  // its uses must use the same virtual register otherwise the produced
  // code is incorrect.
  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
  // registers for values that are alive across basic blocks. This ensures
  // that the values are consistently set across basic blocks, even if
  // different instruction selection mechanisms are used (e.g., a mix of
  // SDISel and FastISel).
  // For values local to a basic block, the instruction selection process
  // generates these virtual registers with whatever method is appropriate
  // for its needs. In particular, FastISel and SDISel do not share the way
  // local virtual registers are set.
  // Therefore, it is impossible (or at least unsafe) to share values
  // between basic blocks unless they use the same instruction selection
  // method, which is not guaranteed for X86.
  // Moreover, things like hasOneUse could not be used accurately if we
  // allowed referencing values across basic blocks when they are not
  // alive across basic blocks initially.
  bool InMBB = true;
  if (I) {
    Opcode = I->getOpcode();
    U = I;
    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
    Opcode = C->getOpcode();
    U = C;
  }

  switch (Opcode) {
  default: break;
  case Instruction::BitCast:
    // Look past bitcasts if its operand is in the same BB.
    if (InMBB)
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;

  case Instruction::IntToPtr:
    // Look past no-op inttoptrs if its operand is in the same BB.
    if (InMBB &&
        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
            TLI.getPointerTy(DL))
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;

  case Instruction::PtrToInt:
    // Look past no-op ptrtoints if its operand is in the same BB.
    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;
  }

  // Handle constant address.
  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
    // Can't handle alternate code models yet.
    if (TM.getCodeModel() != CodeModel::Small)
      return false;

    // RIP-relative addresses can't have additional register operands.
    if (Subtarget->isPICStyleRIPRel() &&
        (AM.Base.Reg != 0 || AM.IndexReg != 0))
      return false;

    // Can't handle TLS.
    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
      if (GVar->isThreadLocal())
        return false;

    // Okay, we've committed to selecting this global. Set up the basic address.
    AM.GV = GV;

    // Return a direct reference to the global. Fastisel can handle calls to
    // functions that require loads, such as dllimport and nonlazybind
    // functions.
    if (Subtarget->isPICStyleRIPRel()) {
      // Use rip-relative addressing if we can. Above we verified that the
      // base and index registers are unused.
      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
      AM.Base.Reg = X86::RIP;
    } else {
      AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
    }

    return true;
  }

  // If all else fails, try to materialize the value in a register.
  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
    auto GetCallRegForValue = [this](const Value *V) {
      Register Reg = getRegForValue(V);

      // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
      if (Reg && Subtarget->isTarget64BitILP32()) {
        Register CopyReg = createResultReg(&X86::GR32RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr),
                CopyReg)
            .addReg(Reg);

        Register ExtReg = createResultReg(&X86::GR64RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
            .addImm(0)
            .addReg(CopyReg)
            .addImm(X86::sub_32bit);
        Reg = ExtReg;
      }

      return Reg;
    };

    if (AM.Base.Reg == 0) {
      AM.Base.Reg = GetCallRegForValue(V);
      return AM.Base.Reg != 0;
    }
    if (AM.IndexReg == 0) {
      assert(AM.Scale == 1 && "Scale with no index!");
      AM.IndexReg = GetCallRegForValue(V);
      return AM.IndexReg != 0;
    }
  }

  return false;
}


/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(const Instruction *I) {
  // Atomic stores need special handling.
  const StoreInst *S = cast<StoreInst>(I);

  if (S->isAtomic())
    return false;

  const Value *PtrV = I->getOperand(1);
  if (TLI.supportSwiftError()) {
    // Swifterror values can come from either a function parameter with
    // swifterror attribute or an alloca with swifterror attribute.
    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
      if (Arg->hasSwiftErrorAttr())
        return false;
    }

    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
      if (Alloca->isSwiftError())
        return false;
    }
  }

  const Value *Val = S->getValueOperand();
  const Value *Ptr = S->getPointerOperand();

  MVT VT;
  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
    return false;

  Align Alignment = S->getAlign();
  Align ABIAlignment = DL.getABITypeAlign(Val->getType());
  bool Aligned = Alignment >= ABIAlignment;

  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}

/// X86SelectRet - Select and emit code to implement ret instructions.
bool X86FastISel::X86SelectRet(const Instruction *I) {
  const ReturnInst *Ret = cast<ReturnInst>(I);
  const Function &F = *I->getParent()->getParent();
  const X86MachineFunctionInfo *X86MFInfo =
      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();

  if (!FuncInfo.CanLowerReturn)
    return false;

  if (TLI.supportSwiftError() &&
      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
    return false;

  if (TLI.supportSplitCSR(FuncInfo.MF))
    return false;

  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::C &&
      CC != CallingConv::Fast &&
      CC != CallingConv::Tail &&
      CC != CallingConv::X86_FastCall &&
      CC != CallingConv::X86_StdCall &&
      CC != CallingConv::X86_ThisCall &&
      CC != CallingConv::X86_64_SysV &&
      CC != CallingConv::Win64)
    return false;

  // Don't handle popping bytes if they don't fit the ret's immediate.
  if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
    return false;

  // fastcc with -tailcallopt is intended to provide a guaranteed
  // tail call optimization. Fastisel doesn't know how to do that.
  if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
      CC == CallingConv::Tail)
    return false;

  // Let SDISel handle vararg functions.
  if (F.isVarArg())
    return false;

  // Build a list of return value registers.
  SmallVector<unsigned, 4> RetRegs;

  if (Ret->getNumOperands() > 0) {
    SmallVector<ISD::OutputArg, 4> Outs;
    GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);

    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ValLocs;
    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_X86);

    const Value *RV = Ret->getOperand(0);
    Register Reg = getRegForValue(RV);
    if (Reg == 0)
      return false;

    // Only handle a single return value for now.
    if (ValLocs.size() != 1)
      return false;

    CCValAssign &VA = ValLocs[0];

    // Don't bother handling odd stuff for now.
    if (VA.getLocInfo() != CCValAssign::Full)
      return false;
    // Only handle register returns for now.
    if (!VA.isRegLoc())
      return false;

    // The calling-convention tables for x87 returns don't tell
    // the whole story.
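    // Returns in FP0/FP1 go through the x87 register stack and need special
    // handling that FastISel does not implement, so bail out to SelectionDAG
    // for those.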
    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
      return false;

    unsigned SrcReg = Reg + VA.getValNo();
    EVT SrcVT = TLI.getValueType(DL, RV->getType());
    EVT DstVT = VA.getValVT();
    // Special handling for extended integers.
    if (SrcVT != DstVT) {
      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
        return false;

      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
        return false;

      assert(DstVT == MVT::i32 && "X86 should always ext to i32");

      if (SrcVT == MVT::i1) {
        if (Outs[0].Flags.isSExt())
          return false;
        // TODO
        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
        SrcVT = MVT::i8;
      }
      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
                                             ISD::SIGN_EXTEND;
      // TODO
      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
                          /*Op0IsKill=*/false);
    }

    // Make the copy.
    Register DstReg = VA.getLocReg();
    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    // Avoid a cross-class copy. This is very unlikely.
    if (!SrcRC->contains(DstReg))
      return false;
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);

    // Add register to return instruction.
    RetRegs.push_back(VA.getLocReg());
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
    Register Reg = X86MFInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments()!");
    unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
    RetRegs.push_back(RetReg);
  }

  // Now emit the RET.
  MachineInstrBuilder MIB;
  if (X86MFInfo->getBytesToPopOnReturn()) {
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                  TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
              .addImm(X86MFInfo->getBytesToPopOnReturn());
  } else {
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                  TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
  }
  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
    MIB.addReg(RetRegs[i], RegState::Implicit);
  return true;
}

/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(const Instruction *I) {
  const LoadInst *LI = cast<LoadInst>(I);

  // Atomic loads need special handling.
  if (LI->isAtomic())
    return false;

  const Value *SV = I->getOperand(0);
  if (TLI.supportSwiftError()) {
    // Swifterror values can come from either a function parameter with
    // swifterror attribute or an alloca with swifterror attribute.
    if (const Argument *Arg = dyn_cast<Argument>(SV)) {
      if (Arg->hasSwiftErrorAttr())
        return false;
    }

    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
      if (Alloca->isSwiftError())
        return false;
    }
  }

  MVT VT;
  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
    return false;

  const Value *Ptr = LI->getPointerOperand();

  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  unsigned ResultReg = 0;
  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
                       LI->getAlign().value()))
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasAVX = Subtarget->hasAVX();
  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
  bool X86ScalarSSEf64 = Subtarget->hasSSE2();

  switch (VT.getSimpleVT().SimpleTy) {
  default:       return 0;
  case MVT::i8:  return X86::CMP8rr;
  case MVT::i16: return X86::CMP16rr;
  case MVT::i32: return X86::CMP32rr;
  case MVT::i64: return X86::CMP64rr;
  case MVT::f32:
    return X86ScalarSSEf32
               ? (HasAVX512 ? X86::VUCOMISSZrr
                            : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
               : 0;
  case MVT::f64:
    return X86ScalarSSEf64
               ? (HasAVX512 ? X86::VUCOMISDZrr
                            : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
               : 0;
  }
}

/// If the RHS of the comparison is a constant that can be folded into the
/// compare, return an opcode that works for it (e.g. CMP32ri); otherwise
/// return 0.
static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
  int64_t Val = RHSC->getSExtValue();
  switch (VT.getSimpleVT().SimpleTy) {
  // Otherwise, we can't fold the immediate into this comparison.
  default:
    return 0;
  case MVT::i8:
    return X86::CMP8ri;
  case MVT::i16:
    if (isInt<8>(Val))
      return X86::CMP16ri8;
    return X86::CMP16ri;
  case MVT::i32:
    if (isInt<8>(Val))
      return X86::CMP32ri8;
    return X86::CMP32ri;
  case MVT::i64:
    if (isInt<8>(Val))
      return X86::CMP64ri8;
    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
    // field.
    if (isInt<32>(Val))
      return X86::CMP64ri32;
    return 0;
  }
}

bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
                                     const DebugLoc &CurDbgLoc) {
  Register Op0Reg = getRegForValue(Op0);
  if (Op0Reg == 0) return false;

  // Handle 'null' like i32/i64 0.
  if (isa<ConstantPointerNull>(Op1))
    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));

  // We have two options: compare with register or immediate. If the RHS of
  // the compare is an immediate that we can fold into this compare, use
  // CMPri, otherwise use CMPrr.
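  // For example, 'icmp eq i32 %x, 42' becomes CMP32ri (or CMP32ri8 when the
  // immediate fits in a sign-extended 8-bit field), avoiding an extra
  // register for the constant.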
  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
          .addReg(Op0Reg)
          .addImm(Op1C->getSExtValue());
      return true;
    }
  }

  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
  if (CompareOpc == 0) return false;

  Register Op1Reg = getRegForValue(Op1);
  if (Op1Reg == 0) return false;
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
      .addReg(Op0Reg)
      .addReg(Op1Reg);

  return true;
}

bool X86FastISel::X86SelectCmp(const Instruction *I) {
  const CmpInst *CI = cast<CmpInst>(I);

  MVT VT;
  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
    return false;

  // Try to optimize or fold the cmp.
  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
  unsigned ResultReg = 0;
  switch (Predicate) {
  default: break;
  case CmpInst::FCMP_FALSE: {
    ResultReg = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
            ResultReg);
    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
                                           /*Op0IsKill=*/true, X86::sub_8bit);
    if (!ResultReg)
      return false;
    break;
  }
  case CmpInst::FCMP_TRUE: {
    ResultReg = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
            ResultReg).addImm(1);
    break;
  }
  }

  if (ResultReg) {
    updateValueMap(I, ResultReg);
    return true;
  }

  const Value *LHS = CI->getOperand(0);
  const Value *RHS = CI->getOperand(1);

  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
  // We don't have to materialize a zero constant for this case and can just use
  // %x again on the RHS.
  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
    const auto *RHSC = dyn_cast<ConstantFP>(RHS);
    if (RHSC && RHSC->isNullValue())
      RHS = LHS;
  }

  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
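  // After a UCOMISS/UCOMISD, OEQ holds when ZF is set and PF is clear, while
  // UNE holds when ZF is clear or PF is set, so each needs two SETcc results
  // combined with AND/OR as encoded in the table below.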
  static const uint16_t SETFOpcTable[2][3] = {
    { X86::COND_E,  X86::COND_NP, X86::AND8rr },
    { X86::COND_NE, X86::COND_P,  X86::OR8rr }
  };
  const uint16_t *SETFOpc = nullptr;
  switch (Predicate) {
  default: break;
  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
  }

  ResultReg = createResultReg(&X86::GR8RegClass);
  if (SETFOpc) {
    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
      return false;

    Register FlagReg1 = createResultReg(&X86::GR8RegClass);
    Register FlagReg2 = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
            FlagReg1).addImm(SETFOpc[0]);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
            FlagReg2).addImm(SETFOpc[1]);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
            ResultReg).addReg(FlagReg1).addReg(FlagReg2);
    updateValueMap(I, ResultReg);
    return true;
  }

  X86::CondCode CC;
  bool SwapArgs;
  std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

  if (SwapArgs)
    std::swap(LHS, RHS);

  // Emit a compare of LHS/RHS.
  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
    return false;

  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
          ResultReg).addImm(CC);
  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectZExt(const Instruction *I) {
  EVT DstVT = TLI.getValueType(DL, I->getType());
  if (!TLI.isTypeLegal(DstVT))
    return false;

  Register ResultReg = getRegForValue(I->getOperand(0));
  if (ResultReg == 0)
    return false;

  // Handle zero-extension from i1 to i8, which is common.
  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
  if (SrcVT == MVT::i1) {
    // Set the high bits to zero.
    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
    SrcVT = MVT::i8;

    if (ResultReg == 0)
      return false;
  }

  if (DstVT == MVT::i64) {
    // Handle extension to 64-bits via sub-register shenanigans.
    unsigned MovInst;

    switch (SrcVT.SimpleTy) {
    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
    case MVT::i32: MovInst = X86::MOV32rr;     break;
    default: llvm_unreachable("Unexpected zext to i64 source type");
    }

    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
        .addReg(ResultReg);

    ResultReg = createResultReg(&X86::GR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
            ResultReg)
        .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
  } else if (DstVT == MVT::i16) {
    // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
    // extend to 32-bits and then extract down to 16-bits.
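    // For example, zext i8 %b to i16 becomes MOVZX32rr8 into a GR32 followed
    // by an extract of the sub_16bit subregister.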
    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
            Result32).addReg(ResultReg);

    ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
                                           /*Op0IsKill=*/true, X86::sub_16bit);
  } else if (DstVT != MVT::i8) {
    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
                           ResultReg, /*Op0IsKill=*/true);
    if (ResultReg == 0)
      return false;
  }

  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectSExt(const Instruction *I) {
  EVT DstVT = TLI.getValueType(DL, I->getType());
  if (!TLI.isTypeLegal(DstVT))
    return false;

  Register ResultReg = getRegForValue(I->getOperand(0));
  if (ResultReg == 0)
    return false;

  // Handle sign-extension from i1 to i8.
  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
  if (SrcVT == MVT::i1) {
    // Set the high bits to zero.
    Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
                                          /*TODO: Kill=*/false);
    if (ZExtReg == 0)
      return false;

    // Negate the result to make an 8-bit sign extended value.
    ResultReg = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r),
            ResultReg).addReg(ZExtReg);

    SrcVT = MVT::i8;
  }

  if (DstVT == MVT::i16) {
    // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
    // extend to 32-bits and then extract down to 16-bits.
    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
            Result32).addReg(ResultReg);

    ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
                                           /*Op0IsKill=*/true, X86::sub_16bit);
  } else if (DstVT != MVT::i8) {
    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
                           ResultReg, /*Op0IsKill=*/true);
    if (ResultReg == 0)
      return false;
  }

  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectBranch(const Instruction *I) {
  // Unconditional branches are selected by tablegen-generated code.
  // Handle a conditional branch.
  const BranchInst *BI = cast<BranchInst>(I);
  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];

  // Fold the common case of a conditional branch with a comparison
  // in the same block (values defined on other blocks may not have
  // initialized registers).
  X86::CondCode CC;
  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
      EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());

      // Try to optimize or fold the cmp.
      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
      switch (Predicate) {
      default: break;
      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
      }

      const Value *CmpLHS = CI->getOperand(0);
      const Value *CmpRHS = CI->getOperand(1);

      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
      // 0.0.
      // We don't have to materialize a zero constant for this case and can just
      // use %x again on the RHS.
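      // For example, 'fcmp ord double %x, 0.0' is emitted as UCOMISD %x, %x.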
1679 if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1680 const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1681 if (CmpRHSC && CmpRHSC->isNullValue())
1682 CmpRHS = CmpLHS;
1683 }
1684
1685 // Try to take advantage of fallthrough opportunities.
1686 if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1687 std::swap(TrueMBB, FalseMBB);
1688 Predicate = CmpInst::getInversePredicate(Predicate);
1689 }
1690
1691 // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
1692 // code check. Instead two branch instructions are required to check all
1693 // the flags. First we change the predicate to a supported condition code,
1694 // which will be the first branch. Later on we will emit the second
1695 // branch.
1696 bool NeedExtraBranch = false;
1697 switch (Predicate) {
1698 default: break;
1699 case CmpInst::FCMP_OEQ:
1700 std::swap(TrueMBB, FalseMBB);
1701 LLVM_FALLTHROUGH;
1702 case CmpInst::FCMP_UNE:
1703 NeedExtraBranch = true;
1704 Predicate = CmpInst::FCMP_ONE;
1705 break;
1706 }
1707
1708 bool SwapArgs;
1709 std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1710 assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1711
1712 if (SwapArgs)
1713 std::swap(CmpLHS, CmpRHS);
1714
1715 // Emit a compare of the LHS and RHS, setting the flags.
1716 if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
1717 return false;
1718
1719 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
1720 .addMBB(TrueMBB).addImm(CC);
1721
1722 // X86 requires a second branch to handle UNE (and OEQ, which is mapped
1723 // to UNE above).
1724 if (NeedExtraBranch) {
1725 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
1726 .addMBB(TrueMBB).addImm(X86::COND_P);
1727 }
1728
1729 finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1730 return true;
1731 }
1732 } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
1733 // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
1734 // typically happen for _Bool and C++ bools.
1735 MVT SourceVT;
1736 if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
1737 isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
1738 unsigned TestOpc = 0;
1739 switch (SourceVT.SimpleTy) {
1740 default: break;
1741 case MVT::i8: TestOpc = X86::TEST8ri; break;
1742 case MVT::i16: TestOpc = X86::TEST16ri; break;
1743 case MVT::i32: TestOpc = X86::TEST32ri; break;
1744 case MVT::i64: TestOpc = X86::TEST64ri32; break;
1745 }
1746 if (TestOpc) {
1747 Register OpReg = getRegForValue(TI->getOperand(0));
1748 if (OpReg == 0) return false;
1749
1750 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
1751 .addReg(OpReg).addImm(1);
1752
1753 unsigned JmpCond = X86::COND_NE;
1754 if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1755 std::swap(TrueMBB, FalseMBB);
1756 JmpCond = X86::COND_E;
1757 }
1758
1759 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
1760 .addMBB(TrueMBB).addImm(JmpCond);
1761
1762 finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1763 return true;
1764 }
1765 }
1766 } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
1767 // Fake request the condition, otherwise the intrinsic might be completely
1768 // optimized away.
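// For example (a sketch, assuming the condition is the overflow bit of
// llvm.sadd.with.overflow): foldX86XALUIntrinsic has already chosen CC
// (COND_O in that case), so the JCC_1 below boils down to a single
//   jo true_bb
// instead of materializing the i1 and re-testing it.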
1769 Register TmpReg = getRegForValue(BI->getCondition()); 1770 if (TmpReg == 0) 1771 return false; 1772 1773 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) 1774 .addMBB(TrueMBB).addImm(CC); 1775 finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); 1776 return true; 1777 } 1778 1779 // Otherwise do a clumsy setcc and re-test it. 1780 // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used 1781 // in an explicit cast, so make sure to handle that correctly. 1782 Register OpReg = getRegForValue(BI->getCondition()); 1783 if (OpReg == 0) return false; 1784 1785 // In case OpReg is a K register, COPY to a GPR 1786 if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) { 1787 unsigned KOpReg = OpReg; 1788 OpReg = createResultReg(&X86::GR32RegClass); 1789 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 1790 TII.get(TargetOpcode::COPY), OpReg) 1791 .addReg(KOpReg); 1792 OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true, 1793 X86::sub_8bit); 1794 } 1795 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) 1796 .addReg(OpReg) 1797 .addImm(1); 1798 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) 1799 .addMBB(TrueMBB).addImm(X86::COND_NE); 1800 finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); 1801 return true; 1802 } 1803 1804 bool X86FastISel::X86SelectShift(const Instruction *I) { 1805 unsigned CReg = 0, OpReg = 0; 1806 const TargetRegisterClass *RC = nullptr; 1807 if (I->getType()->isIntegerTy(8)) { 1808 CReg = X86::CL; 1809 RC = &X86::GR8RegClass; 1810 switch (I->getOpcode()) { 1811 case Instruction::LShr: OpReg = X86::SHR8rCL; break; 1812 case Instruction::AShr: OpReg = X86::SAR8rCL; break; 1813 case Instruction::Shl: OpReg = X86::SHL8rCL; break; 1814 default: return false; 1815 } 1816 } else if (I->getType()->isIntegerTy(16)) { 1817 CReg = X86::CX; 1818 RC = &X86::GR16RegClass; 1819 switch (I->getOpcode()) { 1820 default: llvm_unreachable("Unexpected shift opcode"); 1821 case Instruction::LShr: OpReg = X86::SHR16rCL; break; 1822 case Instruction::AShr: OpReg = X86::SAR16rCL; break; 1823 case Instruction::Shl: OpReg = X86::SHL16rCL; break; 1824 } 1825 } else if (I->getType()->isIntegerTy(32)) { 1826 CReg = X86::ECX; 1827 RC = &X86::GR32RegClass; 1828 switch (I->getOpcode()) { 1829 default: llvm_unreachable("Unexpected shift opcode"); 1830 case Instruction::LShr: OpReg = X86::SHR32rCL; break; 1831 case Instruction::AShr: OpReg = X86::SAR32rCL; break; 1832 case Instruction::Shl: OpReg = X86::SHL32rCL; break; 1833 } 1834 } else if (I->getType()->isIntegerTy(64)) { 1835 CReg = X86::RCX; 1836 RC = &X86::GR64RegClass; 1837 switch (I->getOpcode()) { 1838 default: llvm_unreachable("Unexpected shift opcode"); 1839 case Instruction::LShr: OpReg = X86::SHR64rCL; break; 1840 case Instruction::AShr: OpReg = X86::SAR64rCL; break; 1841 case Instruction::Shl: OpReg = X86::SHL64rCL; break; 1842 } 1843 } else { 1844 return false; 1845 } 1846 1847 MVT VT; 1848 if (!isTypeLegal(I->getType(), VT)) 1849 return false; 1850 1851 Register Op0Reg = getRegForValue(I->getOperand(0)); 1852 if (Op0Reg == 0) return false; 1853 1854 Register Op1Reg = getRegForValue(I->getOperand(1)); 1855 if (Op1Reg == 0) return false; 1856 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), 1857 CReg).addReg(Op1Reg); 1858 1859 // The shift instruction uses X86::CL. If we defined a super-register 1860 // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. 
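// A rough sketch of what gets emitted for a 32-bit shift (virtual register
// names are hypothetical):
//   $ecx = COPY %amount
//   $cl  = KILL $ecx            ; only when CReg is a super-register of CL
//   %res = SHL32rCL %value      ; shift count is implicitly in CL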
1861 if (CReg != X86::CL) 1862 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 1863 TII.get(TargetOpcode::KILL), X86::CL) 1864 .addReg(CReg, RegState::Kill); 1865 1866 Register ResultReg = createResultReg(RC); 1867 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) 1868 .addReg(Op0Reg); 1869 updateValueMap(I, ResultReg); 1870 return true; 1871 } 1872 1873 bool X86FastISel::X86SelectDivRem(const Instruction *I) { 1874 const static unsigned NumTypes = 4; // i8, i16, i32, i64 1875 const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem 1876 const static bool S = true; // IsSigned 1877 const static bool U = false; // !IsSigned 1878 const static unsigned Copy = TargetOpcode::COPY; 1879 // For the X86 DIV/IDIV instruction, in most cases the dividend 1880 // (numerator) must be in a specific register pair highreg:lowreg, 1881 // producing the quotient in lowreg and the remainder in highreg. 1882 // For most data types, to set up the instruction, the dividend is 1883 // copied into lowreg, and lowreg is sign-extended or zero-extended 1884 // into highreg. The exception is i8, where the dividend is defined 1885 // as a single register rather than a register pair, and we 1886 // therefore directly sign-extend or zero-extend the dividend into 1887 // lowreg, instead of copying, and ignore the highreg. 1888 const static struct DivRemEntry { 1889 // The following portion depends only on the data type. 1890 const TargetRegisterClass *RC; 1891 unsigned LowInReg; // low part of the register pair 1892 unsigned HighInReg; // high part of the register pair 1893 // The following portion depends on both the data type and the operation. 1894 struct DivRemResult { 1895 unsigned OpDivRem; // The specific DIV/IDIV opcode to use. 1896 unsigned OpSignExtend; // Opcode for sign-extending lowreg into 1897 // highreg, or copying a zero into highreg. 1898 unsigned OpCopy; // Opcode for copying dividend into lowreg, or 1899 // zero/sign-extending into lowreg for i8. 1900 unsigned DivRemResultReg; // Register containing the desired result. 1901 bool IsOpSigned; // Whether to use signed or unsigned form. 
1902 } ResultTable[NumOps]; 1903 } OpTable[NumTypes] = { 1904 { &X86::GR8RegClass, X86::AX, 0, { 1905 { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv 1906 { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem 1907 { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv 1908 { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem 1909 } 1910 }, // i8 1911 { &X86::GR16RegClass, X86::AX, X86::DX, { 1912 { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv 1913 { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem 1914 { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv 1915 { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem 1916 } 1917 }, // i16 1918 { &X86::GR32RegClass, X86::EAX, X86::EDX, { 1919 { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv 1920 { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem 1921 { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv 1922 { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem 1923 } 1924 }, // i32 1925 { &X86::GR64RegClass, X86::RAX, X86::RDX, { 1926 { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv 1927 { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem 1928 { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv 1929 { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem 1930 } 1931 }, // i64 1932 }; 1933 1934 MVT VT; 1935 if (!isTypeLegal(I->getType(), VT)) 1936 return false; 1937 1938 unsigned TypeIndex, OpIndex; 1939 switch (VT.SimpleTy) { 1940 default: return false; 1941 case MVT::i8: TypeIndex = 0; break; 1942 case MVT::i16: TypeIndex = 1; break; 1943 case MVT::i32: TypeIndex = 2; break; 1944 case MVT::i64: TypeIndex = 3; 1945 if (!Subtarget->is64Bit()) 1946 return false; 1947 break; 1948 } 1949 1950 switch (I->getOpcode()) { 1951 default: llvm_unreachable("Unexpected div/rem opcode"); 1952 case Instruction::SDiv: OpIndex = 0; break; 1953 case Instruction::SRem: OpIndex = 1; break; 1954 case Instruction::UDiv: OpIndex = 2; break; 1955 case Instruction::URem: OpIndex = 3; break; 1956 } 1957 1958 const DivRemEntry &TypeEntry = OpTable[TypeIndex]; 1959 const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; 1960 Register Op0Reg = getRegForValue(I->getOperand(0)); 1961 if (Op0Reg == 0) 1962 return false; 1963 Register Op1Reg = getRegForValue(I->getOperand(1)); 1964 if (Op1Reg == 0) 1965 return false; 1966 1967 // Move op0 into low-order input register. 1968 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 1969 TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); 1970 // Zero-extend or sign-extend into high-order input register. 1971 if (OpEntry.OpSignExtend) { 1972 if (OpEntry.IsOpSigned) 1973 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 1974 TII.get(OpEntry.OpSignExtend)); 1975 else { 1976 Register Zero32 = createResultReg(&X86::GR32RegClass); 1977 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 1978 TII.get(X86::MOV32r0), Zero32); 1979 1980 // Copy the zero into the appropriate sub/super/identical physical 1981 // register. Unfortunately the operations needed are not uniform enough 1982 // to fit neatly into the table above. 
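// As a sketch of the three cases handled below: for i16 the zero is copied
// through the sub_16bit subregister of Zero32, for i32 it is a plain COPY,
// and for i64 it is widened with SUBREG_TO_REG into the 64-bit high register.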
1983 if (VT == MVT::i16) {
1984 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1985 TII.get(Copy), TypeEntry.HighInReg)
1986 .addReg(Zero32, 0, X86::sub_16bit);
1987 } else if (VT == MVT::i32) {
1988 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1989 TII.get(Copy), TypeEntry.HighInReg)
1990 .addReg(Zero32);
1991 } else if (VT == MVT::i64) {
1992 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1993 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
1994 .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
1995 }
1996 }
1997 }
1998 // Generate the DIV/IDIV instruction.
1999 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2000 TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
2001 // For i8 remainder, we can't reference ah directly, as we'll end
2002 // up with bogus copies like %r9b = COPY %ah. Reference ax
2003 // instead to prevent AH references in a REX-prefixed instruction.
2004 //
2005 // The current assumption of the fast register allocator is that isel
2006 // won't generate explicit references to the GR8_NOREX registers. If
2007 // the allocator and/or the backend get enhanced to be more robust in
2008 // that regard, this can be, and should be, removed.
2009 unsigned ResultReg = 0;
2010 if ((I->getOpcode() == Instruction::SRem ||
2011 I->getOpcode() == Instruction::URem) &&
2012 OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
2013 Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
2014 Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
2015 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2016 TII.get(Copy), SourceSuperReg).addReg(X86::AX);
2017
2018 // Shift AX right by 8 bits instead of using AH.
2019 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
2020 ResultSuperReg).addReg(SourceSuperReg).addImm(8);
2021
2022 // Now reference the 8-bit subreg of the result.
2023 ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
2024 /*Op0IsKill=*/true, X86::sub_8bit);
2025 }
2026 // Copy the result out of the physreg if we haven't already.
2027 if (!ResultReg) {
2028 ResultReg = createResultReg(TypeEntry.RC);
2029 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
2030 .addReg(OpEntry.DivRemResultReg);
2031 }
2032 updateValueMap(I, ResultReg);
2033
2034 return true;
2035 }
2036
2037 /// Emit a conditional move instruction (if they are supported) to lower
2038 /// the select.
2039 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
2040 // Check if the subtarget supports these instructions.
2041 if (!Subtarget->hasCMov())
2042 return false;
2043
2044 // FIXME: Add support for i8.
2045 if (RetVT < MVT::i16 || RetVT > MVT::i64)
2046 return false;
2047
2048 const Value *Cond = I->getOperand(0);
2049 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2050 bool NeedTest = true;
2051 X86::CondCode CC = X86::COND_NE;
2052
2053 // Optimize conditions coming from a compare if both instructions are in the
2054 // same basic block (values defined in other basic blocks may not have
2055 // initialized registers).
2056 const auto *CI = dyn_cast<CmpInst>(Cond);
2057 if (CI && (CI->getParent() == I->getParent())) {
2058 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2059
2060 // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
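// For illustration (AT&T-style sketch, registers hypothetical), OEQ is
// checked as "ordered and equal":
//   ucomiss %rhs, %lhs
//   setnp   %al
//   sete    %cl
//   testb   %cl, %al            ; both conditions must hold
// and UNE combines setp/setne with an OR instead.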
2061 static const uint16_t SETFOpcTable[2][3] = {
2062 { X86::COND_NP, X86::COND_E, X86::TEST8rr },
2063 { X86::COND_P, X86::COND_NE, X86::OR8rr }
2064 };
2065 const uint16_t *SETFOpc = nullptr;
2066 switch (Predicate) {
2067 default: break;
2068 case CmpInst::FCMP_OEQ:
2069 SETFOpc = &SETFOpcTable[0][0];
2070 Predicate = CmpInst::ICMP_NE;
2071 break;
2072 case CmpInst::FCMP_UNE:
2073 SETFOpc = &SETFOpcTable[1][0];
2074 Predicate = CmpInst::ICMP_NE;
2075 break;
2076 }
2077
2078 bool NeedSwap;
2079 std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
2080 assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
2081
2082 const Value *CmpLHS = CI->getOperand(0);
2083 const Value *CmpRHS = CI->getOperand(1);
2084 if (NeedSwap)
2085 std::swap(CmpLHS, CmpRHS);
2086
2087 EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2088 // Emit a compare of the LHS and RHS, setting the flags.
2089 if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2090 return false;
2091
2092 if (SETFOpc) {
2093 Register FlagReg1 = createResultReg(&X86::GR8RegClass);
2094 Register FlagReg2 = createResultReg(&X86::GR8RegClass);
2095 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
2096 FlagReg1).addImm(SETFOpc[0]);
2097 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
2098 FlagReg2).addImm(SETFOpc[1]);
2099 auto const &II = TII.get(SETFOpc[2]);
2100 if (II.getNumDefs()) {
2101 Register TmpReg = createResultReg(&X86::GR8RegClass);
2102 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
2103 .addReg(FlagReg2).addReg(FlagReg1);
2104 } else {
2105 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
2106 .addReg(FlagReg2).addReg(FlagReg1);
2107 }
2108 }
2109 NeedTest = false;
2110 } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
2111 // Fake request the condition, otherwise the intrinsic might be completely
2112 // optimized away.
2113 Register TmpReg = getRegForValue(Cond);
2114 if (TmpReg == 0)
2115 return false;
2116
2117 NeedTest = false;
2118 }
2119
2120 if (NeedTest) {
2121 // Selects operate on i1, however, CondReg is 8 bits wide and may contain
2122 // garbage. Indeed, only the least significant bit is supposed to be
2123 // accurate. If we read more than the lsb, we may see non-zero values
2124 // even though the lsb is zero. Therefore, we have to truncate Op0Reg to i1
2125 // for the select. This is achieved by performing TEST against 1.
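// Minimal sketch of the resulting sequence (hypothetical registers):
//   testb  $1, %cond8           ; look only at bit 0 of the i1-in-i8 value
//   cmovne %lhs, %result        ; the CMOV emitted below picks LHS when set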
2126 Register CondReg = getRegForValue(Cond); 2127 if (CondReg == 0) 2128 return false; 2129 bool CondIsKill = hasTrivialKill(Cond); 2130 2131 // In case OpReg is a K register, COPY to a GPR 2132 if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { 2133 unsigned KCondReg = CondReg; 2134 CondReg = createResultReg(&X86::GR32RegClass); 2135 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2136 TII.get(TargetOpcode::COPY), CondReg) 2137 .addReg(KCondReg, getKillRegState(CondIsKill)); 2138 CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, 2139 X86::sub_8bit); 2140 } 2141 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) 2142 .addReg(CondReg, getKillRegState(CondIsKill)) 2143 .addImm(1); 2144 } 2145 2146 const Value *LHS = I->getOperand(1); 2147 const Value *RHS = I->getOperand(2); 2148 2149 Register RHSReg = getRegForValue(RHS); 2150 bool RHSIsKill = hasTrivialKill(RHS); 2151 2152 Register LHSReg = getRegForValue(LHS); 2153 bool LHSIsKill = hasTrivialKill(LHS); 2154 2155 if (!LHSReg || !RHSReg) 2156 return false; 2157 2158 const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); 2159 unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8); 2160 Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, 2161 LHSReg, LHSIsKill, CC); 2162 updateValueMap(I, ResultReg); 2163 return true; 2164 } 2165 2166 /// Emit SSE or AVX instructions to lower the select. 2167 /// 2168 /// Try to use SSE1/SSE2 instructions to simulate a select without branches. 2169 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary 2170 /// SSE instructions are available. If AVX is available, try to use a VBLENDV. 2171 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { 2172 // Optimize conditions coming from a compare if both instructions are in the 2173 // same basic block (values defined in other basic blocks may not have 2174 // initialized registers). 2175 const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); 2176 if (!CI || (CI->getParent() != I->getParent())) 2177 return false; 2178 2179 if (I->getType() != CI->getOperand(0)->getType() || 2180 !((Subtarget->hasSSE1() && RetVT == MVT::f32) || 2181 (Subtarget->hasSSE2() && RetVT == MVT::f64))) 2182 return false; 2183 2184 const Value *CmpLHS = CI->getOperand(0); 2185 const Value *CmpRHS = CI->getOperand(1); 2186 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); 2187 2188 // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. 2189 // We don't have to materialize a zero constant for this case and can just use 2190 // %x again on the RHS. 
2191 if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
2192 const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
2193 if (CmpRHSC && CmpRHSC->isNullValue())
2194 CmpRHS = CmpLHS;
2195 }
2196
2197 unsigned CC;
2198 bool NeedSwap;
2199 std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
2200 if (CC > 7 && !Subtarget->hasAVX())
2201 return false;
2202
2203 if (NeedSwap)
2204 std::swap(CmpLHS, CmpRHS);
2205
2206 const Value *LHS = I->getOperand(1);
2207 const Value *RHS = I->getOperand(2);
2208
2209 Register LHSReg = getRegForValue(LHS);
2210 bool LHSIsKill = hasTrivialKill(LHS);
2211
2212 Register RHSReg = getRegForValue(RHS);
2213 bool RHSIsKill = hasTrivialKill(RHS);
2214
2215 Register CmpLHSReg = getRegForValue(CmpLHS);
2216 bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
2217
2218 Register CmpRHSReg = getRegForValue(CmpRHS);
2219 bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
2220
2221 if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
2222 return false;
2223
2224 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2225 unsigned ResultReg;
2226
2227 if (Subtarget->hasAVX512()) {
2228 // If we have AVX512 we can use a mask compare and masked movss/sd.
2229 const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
2230 const TargetRegisterClass *VK1 = &X86::VK1RegClass;
2231
2232 unsigned CmpOpcode =
2233 (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
2234 Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
2235 CmpRHSReg, CmpRHSIsKill, CC);
2236
2237 // Need an IMPLICIT_DEF for the input that is used to generate the upper
2238 // bits of the result register since it's not based on any of the inputs.
2239 Register ImplicitDefReg = createResultReg(VR128X);
2240 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2241 TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2242
2243 // Place RHSReg in the passthru of the masked movss/sd operation and put
2244 // LHS in the input. The mask input comes from the compare.
2245 unsigned MovOpcode =
2246 (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
2247 unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
2248 CmpReg, true, ImplicitDefReg, true,
2249 LHSReg, LHSIsKill);
2250
2251 ResultReg = createResultReg(RC);
2252 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2253 TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
2254
2255 } else if (Subtarget->hasAVX()) {
2256 const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2257
2258 // If we have AVX, create 1 blendv instead of 3 logic instructions.
2259 // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
2260 // uses XMM0 as the selection register. That may need just as many
2261 // instructions as the AND/ANDN/OR sequence due to register moves, so
2262 // don't bother.
2263 unsigned CmpOpcode =
2264 (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
2265 unsigned BlendOpcode =
2266 (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
2267
2268 Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
2269 CmpRHSReg, CmpRHSIsKill, CC);
2270 Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
2271 LHSReg, LHSIsKill, CmpReg, true);
2272 ResultReg = createResultReg(RC);
2273 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2274 TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
2275 } else {
2276 // Choose the SSE instruction sequence based on data type (float or double).
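// Rough shape of the non-AVX sequence built below (a sketch; operands follow
// the fastEmitInst_* calls, not exact asm):
//   %mask = CMPSSrr  %cmplhs, %cmprhs, CC   ; all-ones or all-zeros
//   %a    = ANDPSrr  %mask, %lhs            ; LHS where the predicate held
//   %b    = ANDNPSrr %mask, %rhs            ; RHS where it did not
//   %res  = ORPSrr   %b, %a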
2277 static const uint16_t OpcTable[2][4] = { 2278 { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, 2279 { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } 2280 }; 2281 2282 const uint16_t *Opc = nullptr; 2283 switch (RetVT.SimpleTy) { 2284 default: return false; 2285 case MVT::f32: Opc = &OpcTable[0][0]; break; 2286 case MVT::f64: Opc = &OpcTable[1][0]; break; 2287 } 2288 2289 const TargetRegisterClass *VR128 = &X86::VR128RegClass; 2290 Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, 2291 CmpRHSReg, CmpRHSIsKill, CC); 2292 Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, 2293 /*Op0IsKill=*/false, LHSReg, LHSIsKill); 2294 Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, 2295 /*Op0IsKill=*/true, RHSReg, RHSIsKill); 2296 Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true, 2297 AndReg, /*Op1IsKill=*/true); 2298 ResultReg = createResultReg(RC); 2299 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2300 TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg); 2301 } 2302 updateValueMap(I, ResultReg); 2303 return true; 2304 } 2305 2306 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { 2307 // These are pseudo CMOV instructions and will be later expanded into control- 2308 // flow. 2309 unsigned Opc; 2310 switch (RetVT.SimpleTy) { 2311 default: return false; 2312 case MVT::i8: Opc = X86::CMOV_GR8; break; 2313 case MVT::i16: Opc = X86::CMOV_GR16; break; 2314 case MVT::i32: Opc = X86::CMOV_GR32; break; 2315 case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X 2316 : X86::CMOV_FR32; break; 2317 case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X 2318 : X86::CMOV_FR64; break; 2319 } 2320 2321 const Value *Cond = I->getOperand(0); 2322 X86::CondCode CC = X86::COND_NE; 2323 2324 // Optimize conditions coming from a compare if both instructions are in the 2325 // same basic block (values defined in other basic blocks may not have 2326 // initialized registers). 
2327 const auto *CI = dyn_cast<CmpInst>(Cond); 2328 if (CI && (CI->getParent() == I->getParent())) { 2329 bool NeedSwap; 2330 std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate()); 2331 if (CC > X86::LAST_VALID_COND) 2332 return false; 2333 2334 const Value *CmpLHS = CI->getOperand(0); 2335 const Value *CmpRHS = CI->getOperand(1); 2336 2337 if (NeedSwap) 2338 std::swap(CmpLHS, CmpRHS); 2339 2340 EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); 2341 if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) 2342 return false; 2343 } else { 2344 Register CondReg = getRegForValue(Cond); 2345 if (CondReg == 0) 2346 return false; 2347 bool CondIsKill = hasTrivialKill(Cond); 2348 2349 // In case OpReg is a K register, COPY to a GPR 2350 if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { 2351 unsigned KCondReg = CondReg; 2352 CondReg = createResultReg(&X86::GR32RegClass); 2353 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2354 TII.get(TargetOpcode::COPY), CondReg) 2355 .addReg(KCondReg, getKillRegState(CondIsKill)); 2356 CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, 2357 X86::sub_8bit); 2358 } 2359 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) 2360 .addReg(CondReg, getKillRegState(CondIsKill)) 2361 .addImm(1); 2362 } 2363 2364 const Value *LHS = I->getOperand(1); 2365 const Value *RHS = I->getOperand(2); 2366 2367 Register LHSReg = getRegForValue(LHS); 2368 bool LHSIsKill = hasTrivialKill(LHS); 2369 2370 Register RHSReg = getRegForValue(RHS); 2371 bool RHSIsKill = hasTrivialKill(RHS); 2372 2373 if (!LHSReg || !RHSReg) 2374 return false; 2375 2376 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); 2377 2378 Register ResultReg = 2379 fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); 2380 updateValueMap(I, ResultReg); 2381 return true; 2382 } 2383 2384 bool X86FastISel::X86SelectSelect(const Instruction *I) { 2385 MVT RetVT; 2386 if (!isTypeLegal(I->getType(), RetVT)) 2387 return false; 2388 2389 // Check if we can fold the select. 2390 if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { 2391 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); 2392 const Value *Opnd = nullptr; 2393 switch (Predicate) { 2394 default: break; 2395 case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; 2396 case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; 2397 } 2398 // No need for a select anymore - this is an unconditional move. 2399 if (Opnd) { 2400 Register OpReg = getRegForValue(Opnd); 2401 if (OpReg == 0) 2402 return false; 2403 bool OpIsKill = hasTrivialKill(Opnd); 2404 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); 2405 Register ResultReg = createResultReg(RC); 2406 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2407 TII.get(TargetOpcode::COPY), ResultReg) 2408 .addReg(OpReg, getKillRegState(OpIsKill)); 2409 updateValueMap(I, ResultReg); 2410 return true; 2411 } 2412 } 2413 2414 // First try to use real conditional move instructions. 2415 if (X86FastEmitCMoveSelect(RetVT, I)) 2416 return true; 2417 2418 // Try to use a sequence of SSE instructions to simulate a conditional move. 2419 if (X86FastEmitSSESelect(RetVT, I)) 2420 return true; 2421 2422 // Fall-back to pseudo conditional move instructions, which will be later 2423 // converted to control-flow. 2424 if (X86FastEmitPseudoSelect(RetVT, I)) 2425 return true; 2426 2427 return false; 2428 } 2429 2430 // Common code for X86SelectSIToFP and X86SelectUIToFP. 
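// For example (a sketch, assuming an AVX but non-AVX512 target):
// "sitofp i64 %x to double" is selected below as an IMPLICIT_DEF feeding the
// pass-through lanes plus
//   %d = VCVTSI642SDrr %implicit_def, %x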
2431 bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) { 2432 // The target-independent selection algorithm in FastISel already knows how 2433 // to select a SINT_TO_FP if the target is SSE but not AVX. 2434 // Early exit if the subtarget doesn't have AVX. 2435 // Unsigned conversion requires avx512. 2436 bool HasAVX512 = Subtarget->hasAVX512(); 2437 if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512)) 2438 return false; 2439 2440 // TODO: We could sign extend narrower types. 2441 MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); 2442 if (SrcVT != MVT::i32 && SrcVT != MVT::i64) 2443 return false; 2444 2445 // Select integer to float/double conversion. 2446 Register OpReg = getRegForValue(I->getOperand(0)); 2447 if (OpReg == 0) 2448 return false; 2449 2450 unsigned Opcode; 2451 2452 static const uint16_t SCvtOpc[2][2][2] = { 2453 { { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr }, 2454 { X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } }, 2455 { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr }, 2456 { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } }, 2457 }; 2458 static const uint16_t UCvtOpc[2][2] = { 2459 { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr }, 2460 { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr }, 2461 }; 2462 bool Is64Bit = SrcVT == MVT::i64; 2463 2464 if (I->getType()->isDoubleTy()) { 2465 // s/uitofp int -> double 2466 Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit]; 2467 } else if (I->getType()->isFloatTy()) { 2468 // s/uitofp int -> float 2469 Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit]; 2470 } else 2471 return false; 2472 2473 MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT(); 2474 const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT); 2475 Register ImplicitDefReg = createResultReg(RC); 2476 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2477 TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); 2478 Register ResultReg = 2479 fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); 2480 updateValueMap(I, ResultReg); 2481 return true; 2482 } 2483 2484 bool X86FastISel::X86SelectSIToFP(const Instruction *I) { 2485 return X86SelectIntToFP(I, /*IsSigned*/true); 2486 } 2487 2488 bool X86FastISel::X86SelectUIToFP(const Instruction *I) { 2489 return X86SelectIntToFP(I, /*IsSigned*/false); 2490 } 2491 2492 // Helper method used by X86SelectFPExt and X86SelectFPTrunc. 
2493 bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, 2494 unsigned TargetOpc, 2495 const TargetRegisterClass *RC) { 2496 assert((I->getOpcode() == Instruction::FPExt || 2497 I->getOpcode() == Instruction::FPTrunc) && 2498 "Instruction must be an FPExt or FPTrunc!"); 2499 bool HasAVX = Subtarget->hasAVX(); 2500 2501 Register OpReg = getRegForValue(I->getOperand(0)); 2502 if (OpReg == 0) 2503 return false; 2504 2505 unsigned ImplicitDefReg; 2506 if (HasAVX) { 2507 ImplicitDefReg = createResultReg(RC); 2508 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2509 TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); 2510 2511 } 2512 2513 Register ResultReg = createResultReg(RC); 2514 MachineInstrBuilder MIB; 2515 MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), 2516 ResultReg); 2517 2518 if (HasAVX) 2519 MIB.addReg(ImplicitDefReg); 2520 2521 MIB.addReg(OpReg); 2522 updateValueMap(I, ResultReg); 2523 return true; 2524 } 2525 2526 bool X86FastISel::X86SelectFPExt(const Instruction *I) { 2527 if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && 2528 I->getOperand(0)->getType()->isFloatTy()) { 2529 bool HasAVX512 = Subtarget->hasAVX512(); 2530 // fpext from float to double. 2531 unsigned Opc = 2532 HasAVX512 ? X86::VCVTSS2SDZrr 2533 : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; 2534 return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64)); 2535 } 2536 2537 return false; 2538 } 2539 2540 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { 2541 if (X86ScalarSSEf64 && I->getType()->isFloatTy() && 2542 I->getOperand(0)->getType()->isDoubleTy()) { 2543 bool HasAVX512 = Subtarget->hasAVX512(); 2544 // fptrunc from double to float. 2545 unsigned Opc = 2546 HasAVX512 ? X86::VCVTSD2SSZrr 2547 : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; 2548 return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32)); 2549 } 2550 2551 return false; 2552 } 2553 2554 bool X86FastISel::X86SelectTrunc(const Instruction *I) { 2555 EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); 2556 EVT DstVT = TLI.getValueType(DL, I->getType()); 2557 2558 // This code only handles truncation to byte. 2559 if (DstVT != MVT::i8 && DstVT != MVT::i1) 2560 return false; 2561 if (!TLI.isTypeLegal(SrcVT)) 2562 return false; 2563 2564 Register InputReg = getRegForValue(I->getOperand(0)); 2565 if (!InputReg) 2566 // Unhandled operand. Halt "fast" selection and bail. 2567 return false; 2568 2569 if (SrcVT == MVT::i8) { 2570 // Truncate from i8 to i1; no code needed. 2571 updateValueMap(I, InputReg); 2572 return true; 2573 } 2574 2575 // Issue an extract_subreg. 2576 Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, 2577 InputReg, false, 2578 X86::sub_8bit); 2579 if (!ResultReg) 2580 return false; 2581 2582 updateValueMap(I, ResultReg); 2583 return true; 2584 } 2585 2586 bool X86FastISel::IsMemcpySmall(uint64_t Len) { 2587 return Len <= (Subtarget->is64Bit() ? 32 : 16); 2588 } 2589 2590 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, 2591 X86AddressMode SrcAM, uint64_t Len) { 2592 2593 // Make sure we don't bloat code by inlining very large memcpy's. 2594 if (!IsMemcpySmall(Len)) 2595 return false; 2596 2597 bool i64Legal = Subtarget->is64Bit(); 2598 2599 // We don't care about alignment here since we just emit integer accesses. 
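// Worked example (a sketch, assuming a 64-bit subtarget): Len == 13 is
// lowered as one i64, one i32, and one i8 load/store pair, advancing the
// displacement of both address modes by the access size after each pair.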
2600 while (Len) { 2601 MVT VT; 2602 if (Len >= 8 && i64Legal) 2603 VT = MVT::i64; 2604 else if (Len >= 4) 2605 VT = MVT::i32; 2606 else if (Len >= 2) 2607 VT = MVT::i16; 2608 else 2609 VT = MVT::i8; 2610 2611 unsigned Reg; 2612 bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); 2613 RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM); 2614 assert(RV && "Failed to emit load or store??"); 2615 2616 unsigned Size = VT.getSizeInBits()/8; 2617 Len -= Size; 2618 DestAM.Disp += Size; 2619 SrcAM.Disp += Size; 2620 } 2621 2622 return true; 2623 } 2624 2625 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { 2626 // FIXME: Handle more intrinsics. 2627 switch (II->getIntrinsicID()) { 2628 default: return false; 2629 case Intrinsic::convert_from_fp16: 2630 case Intrinsic::convert_to_fp16: { 2631 if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) 2632 return false; 2633 2634 const Value *Op = II->getArgOperand(0); 2635 Register InputReg = getRegForValue(Op); 2636 if (InputReg == 0) 2637 return false; 2638 2639 // F16C only allows converting from float to half and from half to float. 2640 bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; 2641 if (IsFloatToHalf) { 2642 if (!Op->getType()->isFloatTy()) 2643 return false; 2644 } else { 2645 if (!II->getType()->isFloatTy()) 2646 return false; 2647 } 2648 2649 unsigned ResultReg = 0; 2650 const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); 2651 if (IsFloatToHalf) { 2652 // 'InputReg' is implicitly promoted from register class FR32 to 2653 // register class VR128 by method 'constrainOperandRegClass' which is 2654 // directly called by 'fastEmitInst_ri'. 2655 // Instruction VCVTPS2PHrr takes an extra immediate operand which is 2656 // used to provide rounding control: use MXCSR.RC, encoded as 0b100. 2657 // It's consistent with the other FP instructions, which are usually 2658 // controlled by MXCSR. 2659 unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr 2660 : X86::VCVTPS2PHrr; 2661 InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4); 2662 2663 // Move the lower 32-bits of ResultReg to another register of class GR32. 2664 Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr 2665 : X86::VMOVPDI2DIrr; 2666 ResultReg = createResultReg(&X86::GR32RegClass); 2667 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) 2668 .addReg(InputReg, RegState::Kill); 2669 2670 // The result value is in the lower 16-bits of ResultReg. 2671 unsigned RegIdx = X86::sub_16bit; 2672 ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); 2673 } else { 2674 assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); 2675 // Explicitly zero-extend the input to 32-bit. 2676 InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg, 2677 /*Op0IsKill=*/false); 2678 2679 // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. 2680 InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, 2681 InputReg, /*Op0IsKill=*/true); 2682 2683 unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr 2684 : X86::VCVTPH2PSrr; 2685 InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true); 2686 2687 // The result value is in the lower 32-bits of ResultReg. 2688 // Emit an explicit copy from register class VR128 to register class FR32. 
2689 ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); 2690 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2691 TII.get(TargetOpcode::COPY), ResultReg) 2692 .addReg(InputReg, RegState::Kill); 2693 } 2694 2695 updateValueMap(II, ResultReg); 2696 return true; 2697 } 2698 case Intrinsic::frameaddress: { 2699 MachineFunction *MF = FuncInfo.MF; 2700 if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI()) 2701 return false; 2702 2703 Type *RetTy = II->getCalledFunction()->getReturnType(); 2704 2705 MVT VT; 2706 if (!isTypeLegal(RetTy, VT)) 2707 return false; 2708 2709 unsigned Opc; 2710 const TargetRegisterClass *RC = nullptr; 2711 2712 switch (VT.SimpleTy) { 2713 default: llvm_unreachable("Invalid result type for frameaddress."); 2714 case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; 2715 case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; 2716 } 2717 2718 // This needs to be set before we call getPtrSizedFrameRegister, otherwise 2719 // we get the wrong frame register. 2720 MachineFrameInfo &MFI = MF->getFrameInfo(); 2721 MFI.setFrameAddressIsTaken(true); 2722 2723 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 2724 unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); 2725 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 2726 (FrameReg == X86::EBP && VT == MVT::i32)) && 2727 "Invalid Frame Register!"); 2728 2729 // Always make a copy of the frame register to a vreg first, so that we 2730 // never directly reference the frame register (the TwoAddressInstruction- 2731 // Pass doesn't like that). 2732 Register SrcReg = createResultReg(RC); 2733 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2734 TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); 2735 2736 // Now recursively load from the frame address. 2737 // movq (%rbp), %rax 2738 // movq (%rax), %rax 2739 // movq (%rax), %rax 2740 // ... 2741 unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); 2742 while (Depth--) { 2743 Register DestReg = createResultReg(RC); 2744 addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2745 TII.get(Opc), DestReg), SrcReg); 2746 SrcReg = DestReg; 2747 } 2748 2749 updateValueMap(II, SrcReg); 2750 return true; 2751 } 2752 case Intrinsic::memcpy: { 2753 const MemCpyInst *MCI = cast<MemCpyInst>(II); 2754 // Don't handle volatile or variable length memcpys. 2755 if (MCI->isVolatile()) 2756 return false; 2757 2758 if (isa<ConstantInt>(MCI->getLength())) { 2759 // Small memcpy's are common enough that we want to do them 2760 // without a call if possible. 2761 uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); 2762 if (IsMemcpySmall(Len)) { 2763 X86AddressMode DestAM, SrcAM; 2764 if (!X86SelectAddress(MCI->getRawDest(), DestAM) || 2765 !X86SelectAddress(MCI->getRawSource(), SrcAM)) 2766 return false; 2767 TryEmitSmallMemcpy(DestAM, SrcAM, Len); 2768 return true; 2769 } 2770 } 2771 2772 unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; 2773 if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) 2774 return false; 2775 2776 if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) 2777 return false; 2778 2779 return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1); 2780 } 2781 case Intrinsic::memset: { 2782 const MemSetInst *MSI = cast<MemSetInst>(II); 2783 2784 if (MSI->isVolatile()) 2785 return false; 2786 2787 unsigned SizeWidth = Subtarget->is64Bit() ? 
64 : 32; 2788 if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) 2789 return false; 2790 2791 if (MSI->getDestAddressSpace() > 255) 2792 return false; 2793 2794 return lowerCallTo(II, "memset", II->getNumArgOperands() - 1); 2795 } 2796 case Intrinsic::stackprotector: { 2797 // Emit code to store the stack guard onto the stack. 2798 EVT PtrTy = TLI.getPointerTy(DL); 2799 2800 const Value *Op1 = II->getArgOperand(0); // The guard's value. 2801 const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); 2802 2803 MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); 2804 2805 // Grab the frame index. 2806 X86AddressMode AM; 2807 if (!X86SelectAddress(Slot, AM)) return false; 2808 if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; 2809 return true; 2810 } 2811 case Intrinsic::dbg_declare: { 2812 const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); 2813 X86AddressMode AM; 2814 assert(DI->getAddress() && "Null address should be checked earlier!"); 2815 if (!X86SelectAddress(DI->getAddress(), AM)) 2816 return false; 2817 const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); 2818 // FIXME may need to add RegState::Debug to any registers produced, 2819 // although ESP/EBP should be the only ones at the moment. 2820 assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) && 2821 "Expected inlined-at fields to agree"); 2822 addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) 2823 .addImm(0) 2824 .addMetadata(DI->getVariable()) 2825 .addMetadata(DI->getExpression()); 2826 return true; 2827 } 2828 case Intrinsic::trap: { 2829 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); 2830 return true; 2831 } 2832 case Intrinsic::sqrt: { 2833 if (!Subtarget->hasSSE1()) 2834 return false; 2835 2836 Type *RetTy = II->getCalledFunction()->getReturnType(); 2837 2838 MVT VT; 2839 if (!isTypeLegal(RetTy, VT)) 2840 return false; 2841 2842 // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT 2843 // is not generated by FastISel yet. 2844 // FIXME: Update this code once tablegen can handle it. 2845 static const uint16_t SqrtOpc[3][2] = { 2846 { X86::SQRTSSr, X86::SQRTSDr }, 2847 { X86::VSQRTSSr, X86::VSQRTSDr }, 2848 { X86::VSQRTSSZr, X86::VSQRTSDZr }, 2849 }; 2850 unsigned AVXLevel = Subtarget->hasAVX512() ? 2 : 2851 Subtarget->hasAVX() ? 
1 : 2852 0; 2853 unsigned Opc; 2854 switch (VT.SimpleTy) { 2855 default: return false; 2856 case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break; 2857 case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break; 2858 } 2859 2860 const Value *SrcVal = II->getArgOperand(0); 2861 Register SrcReg = getRegForValue(SrcVal); 2862 2863 if (SrcReg == 0) 2864 return false; 2865 2866 const TargetRegisterClass *RC = TLI.getRegClassFor(VT); 2867 unsigned ImplicitDefReg = 0; 2868 if (AVXLevel > 0) { 2869 ImplicitDefReg = createResultReg(RC); 2870 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2871 TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); 2872 } 2873 2874 Register ResultReg = createResultReg(RC); 2875 MachineInstrBuilder MIB; 2876 MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), 2877 ResultReg); 2878 2879 if (ImplicitDefReg) 2880 MIB.addReg(ImplicitDefReg); 2881 2882 MIB.addReg(SrcReg); 2883 2884 updateValueMap(II, ResultReg); 2885 return true; 2886 } 2887 case Intrinsic::sadd_with_overflow: 2888 case Intrinsic::uadd_with_overflow: 2889 case Intrinsic::ssub_with_overflow: 2890 case Intrinsic::usub_with_overflow: 2891 case Intrinsic::smul_with_overflow: 2892 case Intrinsic::umul_with_overflow: { 2893 // This implements the basic lowering of the xalu with overflow intrinsics 2894 // into add/sub/mul followed by either seto or setb. 2895 const Function *Callee = II->getCalledFunction(); 2896 auto *Ty = cast<StructType>(Callee->getReturnType()); 2897 Type *RetTy = Ty->getTypeAtIndex(0U); 2898 assert(Ty->getTypeAtIndex(1)->isIntegerTy() && 2899 Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 && 2900 "Overflow value expected to be an i1"); 2901 2902 MVT VT; 2903 if (!isTypeLegal(RetTy, VT)) 2904 return false; 2905 2906 if (VT < MVT::i8 || VT > MVT::i64) 2907 return false; 2908 2909 const Value *LHS = II->getArgOperand(0); 2910 const Value *RHS = II->getArgOperand(1); 2911 2912 // Canonicalize immediate to the RHS. 2913 if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) 2914 std::swap(LHS, RHS); 2915 2916 unsigned BaseOpc, CondCode; 2917 switch (II->getIntrinsicID()) { 2918 default: llvm_unreachable("Unexpected intrinsic!"); 2919 case Intrinsic::sadd_with_overflow: 2920 BaseOpc = ISD::ADD; CondCode = X86::COND_O; break; 2921 case Intrinsic::uadd_with_overflow: 2922 BaseOpc = ISD::ADD; CondCode = X86::COND_B; break; 2923 case Intrinsic::ssub_with_overflow: 2924 BaseOpc = ISD::SUB; CondCode = X86::COND_O; break; 2925 case Intrinsic::usub_with_overflow: 2926 BaseOpc = ISD::SUB; CondCode = X86::COND_B; break; 2927 case Intrinsic::smul_with_overflow: 2928 BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break; 2929 case Intrinsic::umul_with_overflow: 2930 BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break; 2931 } 2932 2933 Register LHSReg = getRegForValue(LHS); 2934 if (LHSReg == 0) 2935 return false; 2936 bool LHSIsKill = hasTrivialKill(LHS); 2937 2938 unsigned ResultReg = 0; 2939 // Check if we have an immediate version. 2940 if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { 2941 static const uint16_t Opc[2][4] = { 2942 { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, 2943 { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } 2944 }; 2945 2946 if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) && 2947 CondCode == X86::COND_O) { 2948 // We can use INC/DEC. 
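// e.g. (illustrative): @llvm.sadd.with.overflow.i32(%x, 1) becomes a single
//   incl %x
// here, with the overflow bit materialized later by "seto" via SETCCr.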
2949 ResultReg = createResultReg(TLI.getRegClassFor(VT)); 2950 bool IsDec = BaseOpc == ISD::SUB; 2951 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2952 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) 2953 .addReg(LHSReg, getKillRegState(LHSIsKill)); 2954 } else 2955 ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, 2956 CI->getZExtValue()); 2957 } 2958 2959 unsigned RHSReg; 2960 bool RHSIsKill; 2961 if (!ResultReg) { 2962 RHSReg = getRegForValue(RHS); 2963 if (RHSReg == 0) 2964 return false; 2965 RHSIsKill = hasTrivialKill(RHS); 2966 ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, 2967 RHSIsKill); 2968 } 2969 2970 // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit 2971 // it manually. 2972 if (BaseOpc == X86ISD::UMUL && !ResultReg) { 2973 static const uint16_t MULOpc[] = 2974 { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; 2975 static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; 2976 // First copy the first operand into RAX, which is an implicit input to 2977 // the X86::MUL*r instruction. 2978 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2979 TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) 2980 .addReg(LHSReg, getKillRegState(LHSIsKill)); 2981 ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], 2982 TLI.getRegClassFor(VT), RHSReg, RHSIsKill); 2983 } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { 2984 static const uint16_t MULOpc[] = 2985 { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; 2986 if (VT == MVT::i8) { 2987 // Copy the first operand into AL, which is an implicit input to the 2988 // X86::IMUL8r instruction. 2989 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 2990 TII.get(TargetOpcode::COPY), X86::AL) 2991 .addReg(LHSReg, getKillRegState(LHSIsKill)); 2992 ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, 2993 RHSIsKill); 2994 } else 2995 ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], 2996 TLI.getRegClassFor(VT), LHSReg, LHSIsKill, 2997 RHSReg, RHSIsKill); 2998 } 2999 3000 if (!ResultReg) 3001 return false; 3002 3003 // Assign to a GPR since the overflow return value is lowered to a SETcc. 
3004 Register ResultReg2 = createResultReg(&X86::GR8RegClass); 3005 assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); 3006 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), 3007 ResultReg2).addImm(CondCode); 3008 3009 updateValueMap(II, ResultReg, 2); 3010 return true; 3011 } 3012 case Intrinsic::x86_sse_cvttss2si: 3013 case Intrinsic::x86_sse_cvttss2si64: 3014 case Intrinsic::x86_sse2_cvttsd2si: 3015 case Intrinsic::x86_sse2_cvttsd2si64: { 3016 bool IsInputDouble; 3017 switch (II->getIntrinsicID()) { 3018 default: llvm_unreachable("Unexpected intrinsic."); 3019 case Intrinsic::x86_sse_cvttss2si: 3020 case Intrinsic::x86_sse_cvttss2si64: 3021 if (!Subtarget->hasSSE1()) 3022 return false; 3023 IsInputDouble = false; 3024 break; 3025 case Intrinsic::x86_sse2_cvttsd2si: 3026 case Intrinsic::x86_sse2_cvttsd2si64: 3027 if (!Subtarget->hasSSE2()) 3028 return false; 3029 IsInputDouble = true; 3030 break; 3031 } 3032 3033 Type *RetTy = II->getCalledFunction()->getReturnType(); 3034 MVT VT; 3035 if (!isTypeLegal(RetTy, VT)) 3036 return false; 3037 3038 static const uint16_t CvtOpc[3][2][2] = { 3039 { { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr }, 3040 { X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } }, 3041 { { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr }, 3042 { X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } }, 3043 { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr }, 3044 { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } }, 3045 }; 3046 unsigned AVXLevel = Subtarget->hasAVX512() ? 2 : 3047 Subtarget->hasAVX() ? 1 : 3048 0; 3049 unsigned Opc; 3050 switch (VT.SimpleTy) { 3051 default: llvm_unreachable("Unexpected result type."); 3052 case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break; 3053 case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break; 3054 } 3055 3056 // Check if we can fold insertelement instructions into the convert. 3057 const Value *Op = II->getArgOperand(0); 3058 while (auto *IE = dyn_cast<InsertElementInst>(Op)) { 3059 const Value *Index = IE->getOperand(2); 3060 if (!isa<ConstantInt>(Index)) 3061 break; 3062 unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); 3063 3064 if (Idx == 0) { 3065 Op = IE->getOperand(1); 3066 break; 3067 } 3068 Op = IE->getOperand(0); 3069 } 3070 3071 Register Reg = getRegForValue(Op); 3072 if (Reg == 0) 3073 return false; 3074 3075 Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); 3076 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) 3077 .addReg(Reg); 3078 3079 updateValueMap(II, ResultReg); 3080 return true; 3081 } 3082 } 3083 } 3084 3085 bool X86FastISel::fastLowerArguments() { 3086 if (!FuncInfo.CanLowerReturn) 3087 return false; 3088 3089 const Function *F = FuncInfo.Fn; 3090 if (F->isVarArg()) 3091 return false; 3092 3093 CallingConv::ID CC = F->getCallingConv(); 3094 if (CC != CallingConv::C) 3095 return false; 3096 3097 if (Subtarget->isCallingConvWin64(CC)) 3098 return false; 3099 3100 if (!Subtarget->is64Bit()) 3101 return false; 3102 3103 if (Subtarget->useSoftFloat()) 3104 return false; 3105 3106 // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. 
3107 unsigned GPRCnt = 0; 3108 unsigned FPRCnt = 0; 3109 for (auto const &Arg : F->args()) { 3110 if (Arg.hasAttribute(Attribute::ByVal) || 3111 Arg.hasAttribute(Attribute::InReg) || 3112 Arg.hasAttribute(Attribute::StructRet) || 3113 Arg.hasAttribute(Attribute::SwiftSelf) || 3114 Arg.hasAttribute(Attribute::SwiftError) || 3115 Arg.hasAttribute(Attribute::Nest)) 3116 return false; 3117 3118 Type *ArgTy = Arg.getType(); 3119 if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) 3120 return false; 3121 3122 EVT ArgVT = TLI.getValueType(DL, ArgTy); 3123 if (!ArgVT.isSimple()) return false; 3124 switch (ArgVT.getSimpleVT().SimpleTy) { 3125 default: return false; 3126 case MVT::i32: 3127 case MVT::i64: 3128 ++GPRCnt; 3129 break; 3130 case MVT::f32: 3131 case MVT::f64: 3132 if (!Subtarget->hasSSE1()) 3133 return false; 3134 ++FPRCnt; 3135 break; 3136 } 3137 3138 if (GPRCnt > 6) 3139 return false; 3140 3141 if (FPRCnt > 8) 3142 return false; 3143 } 3144 3145 static const MCPhysReg GPR32ArgRegs[] = { 3146 X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D 3147 }; 3148 static const MCPhysReg GPR64ArgRegs[] = { 3149 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 3150 }; 3151 static const MCPhysReg XMMArgRegs[] = { 3152 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 3153 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 3154 }; 3155 3156 unsigned GPRIdx = 0; 3157 unsigned FPRIdx = 0; 3158 for (auto const &Arg : F->args()) { 3159 MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); 3160 const TargetRegisterClass *RC = TLI.getRegClassFor(VT); 3161 unsigned SrcReg; 3162 switch (VT.SimpleTy) { 3163 default: llvm_unreachable("Unexpected value type."); 3164 case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; 3165 case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; 3166 case MVT::f32: LLVM_FALLTHROUGH; 3167 case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; 3168 } 3169 Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); 3170 // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. 3171 // Without this, EmitLiveInCopies may eliminate the livein if its only 3172 // use is a bitcast (which isn't turned into an instruction). 
3173 Register ResultReg = createResultReg(RC); 3174 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 3175 TII.get(TargetOpcode::COPY), ResultReg) 3176 .addReg(DstReg, getKillRegState(true)); 3177 updateValueMap(&Arg, ResultReg); 3178 } 3179 return true; 3180 } 3181 3182 static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, 3183 CallingConv::ID CC, 3184 const CallBase *CB) { 3185 if (Subtarget->is64Bit()) 3186 return 0; 3187 if (Subtarget->getTargetTriple().isOSMSVCRT()) 3188 return 0; 3189 if (CC == CallingConv::Fast || CC == CallingConv::GHC || 3190 CC == CallingConv::HiPE || CC == CallingConv::Tail) 3191 return 0; 3192 3193 if (CB) 3194 if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) || 3195 CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU()) 3196 return 0; 3197 3198 return 4; 3199 } 3200 3201 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { 3202 auto &OutVals = CLI.OutVals; 3203 auto &OutFlags = CLI.OutFlags; 3204 auto &OutRegs = CLI.OutRegs; 3205 auto &Ins = CLI.Ins; 3206 auto &InRegs = CLI.InRegs; 3207 CallingConv::ID CC = CLI.CallConv; 3208 bool &IsTailCall = CLI.IsTailCall; 3209 bool IsVarArg = CLI.IsVarArg; 3210 const Value *Callee = CLI.Callee; 3211 MCSymbol *Symbol = CLI.Symbol; 3212 3213 bool Is64Bit = Subtarget->is64Bit(); 3214 bool IsWin64 = Subtarget->isCallingConvWin64(CC); 3215 3216 const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB); 3217 const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr; 3218 3219 // Call / invoke instructions with NoCfCheck attribute require special 3220 // handling. 3221 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB); 3222 if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck())) 3223 return false; 3224 3225 // Functions with no_caller_saved_registers that need special handling. 3226 if ((CI && CI->hasFnAttr("no_caller_saved_registers")) || 3227 (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) 3228 return false; 3229 3230 // Functions using thunks for indirect calls need to use SDISel. 3231 if (Subtarget->useIndirectThunkCalls()) 3232 return false; 3233 3234 // Handle only C, fastcc, and webkit_js calling conventions for now. 3235 switch (CC) { 3236 default: return false; 3237 case CallingConv::C: 3238 case CallingConv::Fast: 3239 case CallingConv::Tail: 3240 case CallingConv::WebKit_JS: 3241 case CallingConv::Swift: 3242 case CallingConv::X86_FastCall: 3243 case CallingConv::X86_StdCall: 3244 case CallingConv::X86_ThisCall: 3245 case CallingConv::Win64: 3246 case CallingConv::X86_64_SysV: 3247 case CallingConv::CFGuard_Check: 3248 break; 3249 } 3250 3251 // Allow SelectionDAG isel to handle tail calls. 3252 if (IsTailCall) 3253 return false; 3254 3255 // fastcc with -tailcallopt is intended to provide a guaranteed 3256 // tail call optimization. Fastisel doesn't know how to do that. 3257 if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || 3258 CC == CallingConv::Tail) 3259 return false; 3260 3261 // Don't know how to handle Win64 varargs yet. Nothing special needed for 3262 // x86-32. Special handling for x86-64 is implemented. 3263 if (IsVarArg && IsWin64) 3264 return false; 3265 3266 // Don't know about inalloca yet. 
  if (CLI.CB && CLI.CB->hasInAllocaArgument())
    return false;

  for (auto Flag : CLI.OutFlags)
    if (Flag.isSwiftError() || Flag.isPreallocated())
      return false;

  SmallVector<MVT, 16> OutVTs;
  SmallVector<unsigned, 16> ArgRegs;

  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
  // instruction. This is safe because it is common to all FastISel-supported
  // calling conventions on x86.
  for (int i = 0, e = OutVals.size(); i != e; ++i) {
    Value *&Val = OutVals[i];
    ISD::ArgFlagsTy Flags = OutFlags[i];
    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
      if (CI->getBitWidth() < 32) {
        if (Flags.isSExt())
          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
        else
          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
      }
    }

    // Passing bools around ends up doing a trunc to i1 and passing it.
    // Codegen this as an argument + "and 1".
    MVT VT;
    auto *TI = dyn_cast<TruncInst>(Val);
    unsigned ResultReg;
    if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
        (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
      Value *PrevVal = TI->getOperand(0);
      ResultReg = getRegForValue(PrevVal);

      if (!ResultReg)
        return false;

      if (!isTypeLegal(PrevVal->getType(), VT))
        return false;

      ResultReg =
        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
    } else {
      if (!isTypeLegal(Val->getType(), VT) ||
          (VT.isVector() && VT.getVectorElementType() == MVT::i1))
        return false;
      ResultReg = getRegForValue(Val);
    }

    if (!ResultReg)
      return false;

    ArgRegs.push_back(ResultReg);
    OutVTs.push_back(VT);
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, Align(8));

  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();

  // Issue CALLSEQ_START.
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
    .addImm(NumBytes).addImm(0).addImm(0);

  // Walk the register/memloc assignments, inserting copies/loads.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign const &VA = ArgLocs[i];
    const Value *ArgVal = OutVals[VA.getValNo()];
    MVT ArgVT = OutVTs[VA.getValNo()];

    if (ArgVT == MVT::x86mmx)
      return false;

    unsigned ArgReg = ArgRegs[VA.getValNo()];

    // Promote the value if needed.
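    // The assigned location may be wider than the value's type; LocInfo says
    // how to widen it, e.g. an i8 argument bound to a 32-bit GPR is first
    // sign-, zero-, or any-extended below.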
    switch (VA.getLocInfo()) {
    case CCValAssign::Full: break;
    case CCValAssign::SExt: {
      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
             "Unexpected extend");

      if (ArgVT == MVT::i1)
        return false;

      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
                                       ArgVT, ArgReg);
      assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::ZExt: {
      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
             "Unexpected extend");

      // Handle zero-extension from i1 to i8, which is common.
      if (ArgVT == MVT::i1) {
        // Set the high bits to zero.
        ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
        ArgVT = MVT::i8;

        if (ArgReg == 0)
          return false;
      }

      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
                                       ArgVT, ArgReg);
      assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::AExt: {
      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
             "Unexpected extend");
      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
                                       ArgVT, ArgReg);
      if (!Emitted)
        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
                                    ArgVT, ArgReg);
      if (!Emitted)
        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
                                    ArgVT, ArgReg);

      assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::BCvt: {
      ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
                          /*TODO: Kill=*/false);
      assert(ArgReg && "Failed to emit a bitcast!");
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::VExt:
      // VExt has not been implemented, so this should be impossible to reach
      // for now. However, fall back to SelectionDAG isel once implemented.
      return false;
    case CCValAssign::AExtUpper:
    case CCValAssign::SExtUpper:
    case CCValAssign::ZExtUpper:
    case CCValAssign::FPExt:
    case CCValAssign::Trunc:
      llvm_unreachable("Unexpected loc info!");
    case CCValAssign::Indirect:
      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
      // support this.
      return false;
    }

    if (VA.isRegLoc()) {
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
      OutRegs.push_back(VA.getLocReg());
    } else {
      assert(VA.isMemLoc() && "Unknown value location!");

      // Don't emit stores for undef values.
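      // (Any bit pattern already in the stack slot is a valid lowering of
      // undef, so the store can simply be skipped.)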
      if (isa<UndefValue>(ArgVal))
        continue;

      unsigned LocMemOffset = VA.getLocMemOffset();
      X86AddressMode AM;
      AM.Base.Reg = RegInfo->getStackRegister();
      AM.Disp = LocMemOffset;
      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
      Align Alignment = DL.getABITypeAlign(ArgVal->getType());
      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
          MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
          MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
      if (Flags.isByVal()) {
        X86AddressMode SrcAM;
        SrcAM.Base.Reg = ArgReg;
        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
          return false;
      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
        // If this is a really simple value, emit this with the Value* version
        // of X86FastEmitStore. If it isn't simple, we don't want to do this,
        // as it can cause us to reevaluate the argument.
        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
          return false;
      } else {
        bool ValIsKill = hasTrivialKill(ArgVal);
        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
          return false;
      }
    }
  }

  // ELF / PIC requires the GOT pointer to be live in EBX before a function
  // call made via the PLT.
  if (Subtarget->isPICStyleGOT()) {
    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
  }

  if (Is64Bit && IsVarArg && !IsWin64) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
            X86::AL).addImm(NumXMMRegs);
  }

  // Materialize callee address in a register. FIXME: GV address can be
  // handled with a CALLpcrel32 instead.
  X86AddressMode CalleeAM;
  if (!X86SelectCallAddress(Callee, CalleeAM))
    return false;

  unsigned CalleeOp = 0;
  const GlobalValue *GV = nullptr;
  if (CalleeAM.GV != nullptr) {
    GV = CalleeAM.GV;
  } else if (CalleeAM.Base.Reg != 0) {
    CalleeOp = CalleeAM.Base.Reg;
  } else
    return false;

  // Issue the call.
  MachineInstrBuilder MIB;
  if (CalleeOp) {
    // Register-indirect call.
    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
      .addReg(CalleeOp);
  } else {
    // Direct call.
    assert(GV && "Not a direct call");
    // See if we need any target-specific flags on the GV operand.
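    // For example, a dllimport or GOTPCREL reference must be called through
    // the import/GOT entry (roughly "callq *__imp_foo(%rip)" in 64-bit code,
    // with "foo" standing in for the callee), whereas a plain external symbol
    // can use a direct CALL64pcrel32/CALLpcrel32.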
    unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);

    // This will be a direct call, or an indirect call through memory for
    // NonLazyBind calls or dllimport calls.
    bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
                    OpFlags == X86II::MO_GOTPCREL ||
                    OpFlags == X86II::MO_COFFSTUB;
    unsigned CallOpc = NeedLoad
                       ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
                       : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);

    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
    if (NeedLoad)
      MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
    if (Symbol)
      MIB.addSym(Symbol, OpFlags);
    else
      MIB.addGlobalAddress(GV, 0, OpFlags);
    if (NeedLoad)
      MIB.addReg(0);
  }

  // Add a register mask operand representing the call-preserved registers.
  // Proper defs for return values will be added by setPhysRegsDeadExcept().
  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));

  // Add an implicit use of the GOT pointer in EBX.
  if (Subtarget->isPICStyleGOT())
    MIB.addReg(X86::EBX, RegState::Implicit);

  if (Is64Bit && IsVarArg && !IsWin64)
    MIB.addReg(X86::AL, RegState::Implicit);

  // Add implicit physical register uses to the call.
  for (auto Reg : OutRegs)
    MIB.addReg(Reg, RegState::Implicit);

  // Issue CALLSEQ_END.
  unsigned NumBytesForCalleeToPop =
      X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
                       TM.Options.GuaranteedTailCallOpt)
          ? NumBytes // Callee pops everything.
          : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
    .addImm(NumBytes).addImm(NumBytesForCalleeToPop);

  // Now handle call return values.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
                    CLI.RetTy->getContext());
  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();
    unsigned CopyReg = ResultReg + i;
    Register SrcReg = VA.getLocReg();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
      CopyReg = createResultReg(&X86::RFP80RegClass);
    }

    // Copy out the result.
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
    InRegs.push_back(VA.getLocReg());

    // Round the f80 to the right size, which also moves it to the appropriate
    // xmm register. This is accomplished by storing the f80 value in memory
    // and then loading it back.
    if (CopyVT != VA.getValVT()) {
      EVT ResVT = VA.getValVT();
      unsigned Opc = ResVT == MVT::f32 ?
          X86::ST_Fp80m32 : X86::ST_Fp80m64;
      unsigned MemSize = ResVT.getSizeInBits()/8;
      int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                TII.get(Opc)), FI)
        .addReg(CopyReg);
      Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                TII.get(Opc), ResultReg + i), FI);
    }
  }

  CLI.ResultReg = ResultReg;
  CLI.NumResultRegs = RVLocs.size();
  CLI.Call = MIB;

  return true;
}

bool
X86FastISel::fastSelectInstruction(const Instruction *I) {
  switch (I->getOpcode()) {
  default: break;
  case Instruction::Load:
    return X86SelectLoad(I);
  case Instruction::Store:
    return X86SelectStore(I);
  case Instruction::Ret:
    return X86SelectRet(I);
  case Instruction::ICmp:
  case Instruction::FCmp:
    return X86SelectCmp(I);
  case Instruction::ZExt:
    return X86SelectZExt(I);
  case Instruction::SExt:
    return X86SelectSExt(I);
  case Instruction::Br:
    return X86SelectBranch(I);
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Shl:
    return X86SelectShift(I);
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return X86SelectDivRem(I);
  case Instruction::Select:
    return X86SelectSelect(I);
  case Instruction::Trunc:
    return X86SelectTrunc(I);
  case Instruction::FPExt:
    return X86SelectFPExt(I);
  case Instruction::FPTrunc:
    return X86SelectFPTrunc(I);
  case Instruction::SIToFP:
    return X86SelectSIToFP(I);
  case Instruction::UIToFP:
    return X86SelectUIToFP(I);
  case Instruction::IntToPtr: // Deliberate fall-through.
  case Instruction::PtrToInt: {
    EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
    EVT DstVT = TLI.getValueType(DL, I->getType());
    if (DstVT.bitsGT(SrcVT))
      return X86SelectZExt(I);
    if (DstVT.bitsLT(SrcVT))
      return X86SelectTrunc(I);
    Register Reg = getRegForValue(I->getOperand(0));
    if (Reg == 0) return false;
    updateValueMap(I, Reg);
    return true;
  }
  case Instruction::BitCast: {
    // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
    if (!Subtarget->hasSSE2())
      return false;

    MVT SrcVT, DstVT;
    if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
        !isTypeLegal(I->getType(), DstVT))
      return false;

    // Only allow vectors that use xmm/ymm/zmm.
    if (!SrcVT.isVector() || !DstVT.isVector() ||
        SrcVT.getVectorElementType() == MVT::i1 ||
        DstVT.getVectorElementType() == MVT::i1)
      return false;

    Register Reg = getRegForValue(I->getOperand(0));
    if (!Reg)
      return false;

    // Emit a reg-reg copy so we don't propagate cached known bits information
    // with the wrong VT if we fall out of fast isel after selecting this.
    const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
    Register ResultReg = createResultReg(DstClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);

    updateValueMap(I, ResultReg);
    return true;
  }
  }

  return false;
}

unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
  if (VT > MVT::i64)
    return 0;

  uint64_t Imm = CI->getZExtValue();
  if (Imm == 0) {
    Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
    switch (VT.SimpleTy) {
    default: llvm_unreachable("Unexpected value type");
    case MVT::i1:
    case MVT::i8:
      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
                                        X86::sub_8bit);
    case MVT::i16:
      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
                                        X86::sub_16bit);
    case MVT::i32:
      return SrcReg;
    case MVT::i64: {
      Register ResultReg = createResultReg(&X86::GR64RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
      return ResultReg;
    }
    }
  }

  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected value type");
  case MVT::i1:
    VT = MVT::i8;
    LLVM_FALLTHROUGH;
  case MVT::i8:  Opc = X86::MOV8ri;  break;
  case MVT::i16: Opc = X86::MOV16ri; break;
  case MVT::i32: Opc = X86::MOV32ri; break;
  case MVT::i64: {
    if (isUInt<32>(Imm))
      Opc = X86::MOV32ri64;
    else if (isInt<32>(Imm))
      Opc = X86::MOV64ri32;
    else
      Opc = X86::MOV64ri;
    break;
  }
  }
  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}

unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
  if (CFP->isNullValue())
    return fastMaterializeFloatZero(CFP);

  // Can't handle alternate code models yet.
  CodeModel::Model CM = TM.getCodeModel();
  if (CM != CodeModel::Small && CM != CodeModel::Large)
    return 0;

  // Get opcode and regclass of the output for the given load instruction.
  unsigned Opc = 0;
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX512 = Subtarget->hasAVX512();
  switch (VT.SimpleTy) {
  default: return 0;
  case MVT::f32:
    if (X86ScalarSSEf32)
      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
            HasAVX    ? X86::VMOVSSrm_alt :
                        X86::MOVSSrm_alt;
    else
      Opc = X86::LD_Fp32m;
    break;
  case MVT::f64:
    if (X86ScalarSSEf64)
      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
            HasAVX    ? X86::VMOVSDrm_alt :
                        X86::MOVSDrm_alt;
    else
      Opc = X86::LD_Fp64m;
    break;
  case MVT::f80:
    // No f80 support yet.
    return 0;
  }

  // MachineConstantPool wants an explicit alignment.
  Align Alignment = DL.getPrefTypeAlign(CFP->getType());

  // x86-32 PIC requires a PIC base register for constant pools.
  unsigned PICBase = 0;
  unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
  if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
  else if (OpFlag == X86II::MO_GOTOFF)
    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
  else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
    PICBase = X86::RIP;

  // Create the load from the constant pool.
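  // In the common 64-bit small-code-model case this becomes a RIP-relative
  // load such as "movsd .LCPI0_0(%rip), %xmm0" (the label name here is just
  // illustrative).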
  unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
  Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));

  // Large code model only applies to 64-bit mode.
  if (Subtarget->is64Bit() && CM == CodeModel::Large) {
    Register AddrReg = createResultReg(&X86::GR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
            AddrReg)
      .addConstantPoolIndex(CPI, 0, OpFlag);
    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                      TII.get(Opc), ResultReg);
    addRegReg(MIB, AddrReg, false, PICBase, false);
    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
        MachinePointerInfo::getConstantPool(*FuncInfo.MF),
        MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
    MIB->addMemOperand(*FuncInfo.MF, MMO);
    return ResultReg;
  }

  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                   TII.get(Opc), ResultReg),
                           CPI, PICBase, OpFlag);
  return ResultReg;
}

unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
  // Can't handle alternate code models yet.
  if (TM.getCodeModel() != CodeModel::Small)
    return 0;

  // Materialize addresses with LEA/MOV instructions.
  X86AddressMode AM;
  if (X86SelectAddress(GV, AM)) {
    // If the expression is just a basereg, then we're done, otherwise we need
    // to emit an LEA.
    if (AM.BaseType == X86AddressMode::RegBase &&
        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
      return AM.Base.Reg;

    Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
    if (TM.getRelocationModel() == Reloc::Static &&
        TLI.getPointerTy(DL) == MVT::i64) {
      // The displacement could be more than 32 bits away, so we need to use
      // an instruction with a 64-bit immediate.
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
              ResultReg)
        .addGlobalAddress(GV);
    } else {
      unsigned Opc =
          TLI.getPointerTy(DL) == MVT::i32
              ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
              : X86::LEA64r;
      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                             TII.get(Opc), ResultReg), AM);
    }
    return ResultReg;
  }
  return 0;
}

unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
  EVT CEVT = TLI.getValueType(DL, C->getType(), true);

  // Only handle simple types.
  if (!CEVT.isSimple())
    return 0;
  MVT VT = CEVT.getSimpleVT();

  if (const auto *CI = dyn_cast<ConstantInt>(C))
    return X86MaterializeInt(CI, VT);
  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
    return X86MaterializeFP(CFP, VT);
  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
    return X86MaterializeGV(GV, VT);

  return 0;
}

unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
  // Fail on dynamic allocas. At this point, getRegForValue has already
  // checked its CSE maps, so if we're here trying to handle a dynamic
  // alloca, we're not going to succeed. X86SelectAddress has a
  // check for dynamic allocas, because it's called directly from
  // various places, but targetMaterializeAlloca also needs a check
  // in order to avoid recursion between getRegForValue,
  // X86SelectAddress, and targetMaterializeAlloca.
  if (!FuncInfo.StaticAllocaMap.count(C))
    return 0;
  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");

  X86AddressMode AM;
  if (!X86SelectAddress(C, AM))
    return 0;
  unsigned Opc =
      TLI.getPointerTy(DL) == MVT::i32
          ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
          : X86::LEA64r;
  const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
  Register ResultReg = createResultReg(RC);
  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                         TII.get(Opc), ResultReg), AM);
  return ResultReg;
}

unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
  MVT VT;
  if (!isTypeLegal(CF->getType(), VT))
    return 0;

  // Get opcode and regclass for the given zero.
  bool HasAVX512 = Subtarget->hasAVX512();
  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: return 0;
  case MVT::f32:
    if (X86ScalarSSEf32)
      Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
    else
      Opc = X86::LD_Fp032;
    break;
  case MVT::f64:
    if (X86ScalarSSEf64)
      Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
    else
      Opc = X86::LD_Fp064;
    break;
  case MVT::f80:
    // No f80 support yet.
    return 0;
  }

  Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
  return ResultReg;
}


bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                      const LoadInst *LI) {
  const Value *Ptr = LI->getPointerOperand();
  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  const X86InstrInfo &XII = (const X86InstrInfo &)TII;

  unsigned Size = DL.getTypeAllocSize(LI->getType());

  SmallVector<MachineOperand, 8> AddrOps;
  AM.getFullAddress(AddrOps);

  MachineInstr *Result = XII.foldMemoryOperandImpl(
      *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
      /*AllowCommute=*/true);
  if (!Result)
    return false;

  // The index register could be in the wrong register class. Unfortunately,
  // foldMemoryOperandImpl could have commuted the instruction, so it's not
  // enough to just look at OpNo + the offset to the index reg. We actually
  // need to scan the instruction to find the index reg and see if it's in the
  // correct reg class.
  unsigned OperandNo = 0;
  for (MachineInstr::mop_iterator I = Result->operands_begin(),
                                  E = Result->operands_end();
       I != E; ++I, ++OperandNo) {
    MachineOperand &MO = *I;
    if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
      continue;
    // Found the index reg, now try to rewrite it.
    Register IndexReg = constrainOperandRegClass(Result->getDesc(),
                                                 MO.getReg(), OperandNo);
    if (IndexReg == MO.getReg())
      continue;
    MO.setReg(IndexReg);
  }

  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
  Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
  MachineBasicBlock::iterator I(MI);
  removeDeadCode(I, std::next(I));
  return true;
}

unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
                                        const TargetRegisterClass *RC,
                                        unsigned Op0, bool Op0IsKill,
                                        unsigned Op1, bool Op1IsKill,
                                        unsigned Op2, bool Op2IsKill,
                                        unsigned Op3, bool Op3IsKill) {
  const MCInstrDesc &II = TII.get(MachineInstOpcode);

  Register ResultReg = createResultReg(RC);
  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
  Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
  Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);

  if (II.getNumDefs() >= 1)
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
        .addReg(Op0, getKillRegState(Op0IsKill))
        .addReg(Op1, getKillRegState(Op1IsKill))
        .addReg(Op2, getKillRegState(Op2IsKill))
        .addReg(Op3, getKillRegState(Op3IsKill));
  else {
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
        .addReg(Op0, getKillRegState(Op0IsKill))
        .addReg(Op1, getKillRegState(Op1IsKill))
        .addReg(Op2, getKillRegState(Op2IsKill))
        .addReg(Op3, getKillRegState(Op3IsKill));
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
  }
  return ResultReg;
}


namespace llvm {
  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
                                const TargetLibraryInfo *libInfo) {
    return new X86FastISel(funcInfo, libInfo);
  }
}